Merge from PG master up to d5cb3bab564e0927ffac7c8729eacf181a12dd40
authorPavan Deolasee <[email protected]>
Wed, 14 Jun 2017 05:42:18 +0000 (11:12 +0530)
committerPavan Deolasee <[email protected]>
Wed, 14 Jun 2017 05:42:18 +0000 (11:12 +0530)
This is the result of the "git merge remotes/PGSQL/master" up to the said commit
point. We have done some basic analysis, fixed compilation problems, etc., but
the bulk of the logical problems in conflict resolution will be handled by
subsequent commits.

508 files changed:
1  2 
.gitignore
COPYRIGHT
GNUmakefile.in
README
configure
configure.in
contrib/Makefile
contrib/citext/Makefile
contrib/citext/expected/citext_1.out
contrib/citext/sql/citext.sql
contrib/hstore/expected/hstore.out
contrib/hstore/hstore_io.c
contrib/hstore/sql/hstore.sql
contrib/ltree/expected/ltree.out
contrib/pg_stat_statements/pg_stat_statements.c
contrib/pg_trgm/expected/pg_trgm.out
contrib/sepgsql/hooks.c
doc/bug.template
doc/src/sgml/Makefile
doc/src/sgml/acronyms.sgml
doc/src/sgml/advanced.sgml
doc/src/sgml/backup.sgml
doc/src/sgml/biblio.sgml
doc/src/sgml/btree-gist.sgml
doc/src/sgml/catalogs.sgml
doc/src/sgml/citext.sgml
doc/src/sgml/config.sgml
doc/src/sgml/contrib.sgml
doc/src/sgml/datatype.sgml
doc/src/sgml/ddl.sgml
doc/src/sgml/dml.sgml
doc/src/sgml/filelist.sgml
doc/src/sgml/func.sgml
doc/src/sgml/high-availability.sgml
doc/src/sgml/indices.sgml
doc/src/sgml/info.sgml
doc/src/sgml/installation.sgml
doc/src/sgml/keywords.sgml
doc/src/sgml/legal.sgml
doc/src/sgml/libpq.sgml
doc/src/sgml/lobj.sgml
doc/src/sgml/maintenance.sgml
doc/src/sgml/mvcc.sgml
doc/src/sgml/pageinspect.sgml
doc/src/sgml/perform.sgml
doc/src/sgml/pgbuffercache.sgml
doc/src/sgml/pgfreespacemap.sgml
doc/src/sgml/pgrowlocks.sgml
doc/src/sgml/pgstatstatements.sgml
doc/src/sgml/pgstattuple.sgml
doc/src/sgml/plperl.sgml
doc/src/sgml/plpgsql.sgml
doc/src/sgml/pltcl.sgml
doc/src/sgml/postgres.sgml
doc/src/sgml/recovery-config.sgml
doc/src/sgml/ref/allfiles.sgml
doc/src/sgml/ref/alter_table.sgml
doc/src/sgml/ref/alter_user_mapping.sgml
doc/src/sgml/ref/analyze.sgml
doc/src/sgml/ref/checkpoint.sgml
doc/src/sgml/ref/copy.sgml
doc/src/sgml/ref/create_database.sgml
doc/src/sgml/ref/create_function.sgml
doc/src/sgml/ref/create_index.sgml
doc/src/sgml/ref/create_server.sgml
doc/src/sgml/ref/create_table.sgml
doc/src/sgml/ref/create_trigger.sgml
doc/src/sgml/ref/create_user_mapping.sgml
doc/src/sgml/ref/create_view.sgml
doc/src/sgml/ref/delete.sgml
doc/src/sgml/ref/drop_foreign_data_wrapper.sgml
doc/src/sgml/ref/drop_server.sgml
doc/src/sgml/ref/explain.sgml
doc/src/sgml/ref/initdb.sgml
doc/src/sgml/ref/pg_ctl-ref.sgml
doc/src/sgml/ref/pg_resetwal.sgml
doc/src/sgml/ref/pgbench.sgml
doc/src/sgml/ref/pgupgrade.sgml
doc/src/sgml/ref/update.sgml
doc/src/sgml/ref/vacuum.sgml
doc/src/sgml/ref/vacuumdb.sgml
doc/src/sgml/reference.sgml
doc/src/sgml/regress.sgml
doc/src/sgml/release.sgml
doc/src/sgml/rules.sgml
doc/src/sgml/runtime.sgml
doc/src/sgml/sourcerepo.sgml
doc/src/sgml/trigger.sgml
doc/src/sgml/wal.sgml
doc/src/sgml/xaggr.sgml
doc/src/sgml/xfunc.sgml
src/Makefile
src/Makefile.global.in
src/Makefile.shlib
src/backend/Makefile
src/backend/access/common/heaptuple.c
src/backend/access/common/printtup.c
src/backend/access/hash/hashfunc.c
src/backend/access/heap/heapam.c
src/backend/access/heap/pruneheap.c
src/backend/access/heap/tuptoaster.c
src/backend/access/rmgrdesc/smgrdesc.c
src/backend/access/rmgrdesc/xactdesc.c
src/backend/access/transam/clog.c
src/backend/access/transam/commit_ts.c
src/backend/access/transam/parallel.c
src/backend/access/transam/recovery.conf.sample
src/backend/access/transam/rmgr.c
src/backend/access/transam/slru.c
src/backend/access/transam/subtrans.c
src/backend/access/transam/transam.c
src/backend/access/transam/twophase.c
src/backend/access/transam/varsup.c
src/backend/access/transam/xact.c
src/backend/access/transam/xlog.c
src/backend/access/transam/xlogutils.c
src/backend/bootstrap/bootstrap.c
src/backend/catalog/Makefile
src/backend/catalog/catalog.c
src/backend/catalog/dependency.c
src/backend/catalog/genbki.pl
src/backend/catalog/heap.c
src/backend/catalog/index.c
src/backend/catalog/namespace.c
src/backend/catalog/objectaddress.c
src/backend/catalog/pg_proc.c
src/backend/catalog/pgxc_class.c
src/backend/catalog/storage.c
src/backend/commands/analyze.c
src/backend/commands/cluster.c
src/backend/commands/comment.c
src/backend/commands/copy.c
src/backend/commands/dbcommands.c
src/backend/commands/event_trigger.c
src/backend/commands/explain.c
src/backend/commands/extension.c
src/backend/commands/foreigncmds.c
src/backend/commands/indexcmds.c
src/backend/commands/matview.c
src/backend/commands/portalcmds.c
src/backend/commands/prepare.c
src/backend/commands/schemacmds.c
src/backend/commands/sequence.c
src/backend/commands/tablecmds.c
src/backend/commands/tablespace.c
src/backend/commands/trigger.c
src/backend/commands/vacuum.c
src/backend/commands/variable.c
src/backend/commands/view.c
src/backend/executor/Makefile
src/backend/executor/execAmi.c
src/backend/executor/execCurrent.c
src/backend/executor/execMain.c
src/backend/executor/execProcnode.c
src/backend/executor/execTuples.c
src/backend/executor/execUtils.c
src/backend/executor/functions.c
src/backend/executor/nodeAgg.c
src/backend/executor/nodeForeignscan.c
src/backend/executor/nodeModifyTable.c
src/backend/executor/nodeNestloop.c
src/backend/executor/nodeSubplan.c
src/backend/executor/nodeWindowAgg.c
src/backend/executor/spi.c
src/backend/libpq/be-fsstubs.c
src/backend/main/main.c
src/backend/nodes/bitmapset.c
src/backend/nodes/copyfuncs.c
src/backend/nodes/equalfuncs.c
src/backend/nodes/makefuncs.c
src/backend/nodes/nodeFuncs.c
src/backend/nodes/outfuncs.c
src/backend/nodes/print.c
src/backend/nodes/readfuncs.c
src/backend/optimizer/path/allpaths.c
src/backend/optimizer/path/costsize.c
src/backend/optimizer/plan/createplan.c
src/backend/optimizer/plan/planagg.c
src/backend/optimizer/plan/planner.c
src/backend/optimizer/plan/setrefs.c
src/backend/optimizer/plan/subselect.c
src/backend/optimizer/prep/prepjointree.c
src/backend/optimizer/prep/preptlist.c
src/backend/optimizer/prep/prepunion.c
src/backend/optimizer/util/pathnode.c
src/backend/optimizer/util/plancat.c
src/backend/optimizer/util/relnode.c
src/backend/parser/analyze.c
src/backend/parser/gram.y
src/backend/parser/parse_agg.c
src/backend/parser/parse_coerce.c
src/backend/parser/parse_expr.c
src/backend/parser/parse_relation.c
src/backend/parser/parse_target.c
src/backend/parser/parse_type.c
src/backend/parser/parse_utilcmd.c
src/backend/parser/parser.c
src/backend/parser/scan.l
src/backend/pgxc/cluster/pause.c
src/backend/pgxc/locator/locator.c
src/backend/pgxc/nodemgr/groupmgr.c
src/backend/pgxc/nodemgr/nodemgr.c
src/backend/pgxc/pool/execRemote.c
src/backend/pgxc/pool/pgxcnode.c
src/backend/pgxc/pool/poolmgr.c
src/backend/pgxc/squeue/squeue.c
src/backend/postmaster/autovacuum.c
src/backend/postmaster/clustermon.c
src/backend/postmaster/pgstat.c
src/backend/postmaster/postmaster.c
src/backend/replication/logical/decode.c
src/backend/replication/logical/logicalfuncs.c
src/backend/replication/syncrep.c
src/backend/rewrite/rewriteHandler.c
src/backend/rewrite/rowsecurity.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/file/fd.c
src/backend/storage/file/reinit.c
src/backend/storage/ipc/ipci.c
src/backend/storage/ipc/procarray.c
src/backend/storage/ipc/procsignal.c
src/backend/storage/lmgr/lmgr.c
src/backend/storage/lmgr/lock.c
src/backend/storage/lmgr/lwlock.c
src/backend/storage/lmgr/lwlocknames.txt
src/backend/storage/lmgr/predicate.c
src/backend/storage/lmgr/proc.c
src/backend/tcop/dest.c
src/backend/tcop/postgres.c
src/backend/tcop/pquery.c
src/backend/tcop/utility.c
src/backend/utils/adt/array_userfuncs.c
src/backend/utils/adt/arrayfuncs.c
src/backend/utils/adt/date.c
src/backend/utils/adt/dbsize.c
src/backend/utils/adt/jsonb.c
src/backend/utils/adt/lockfuncs.c
src/backend/utils/adt/misc.c
src/backend/utils/adt/pseudotypes.c
src/backend/utils/adt/ri_triggers.c
src/backend/utils/adt/rowtypes.c
src/backend/utils/adt/ruleutils.c
src/backend/utils/adt/selfuncs.c
src/backend/utils/adt/timestamp.c
src/backend/utils/adt/varlena.c
src/backend/utils/adt/version.c
src/backend/utils/cache/inval.c
src/backend/utils/cache/lsyscache.c
src/backend/utils/cache/plancache.c
src/backend/utils/cache/relcache.c
src/backend/utils/cache/syscache.c
src/backend/utils/errcodes.txt
src/backend/utils/error/elog.c
src/backend/utils/init/globals.c
src/backend/utils/init/miscinit.c
src/backend/utils/init/postinit.c
src/backend/utils/misc/guc.c
src/backend/utils/misc/postgresql.conf.sample
src/backend/utils/mmgr/mcxt.c
src/backend/utils/mmgr/portalmem.c
src/backend/utils/resowner/resowner.c
src/backend/utils/sort/tuplesort.c
src/backend/utils/sort/tuplestore.c
src/backend/utils/time/combocid.c
src/backend/utils/time/snapmgr.c
src/backend/utils/time/tqual.c
src/bin/Makefile
src/bin/initdb/initdb.c
src/bin/pg_ctl/pg_ctl.c
src/bin/pg_dump/pg_backup.h
src/bin/pg_dump/pg_backup_db.c
src/bin/pg_dump/pg_dump.c
src/bin/pg_dump/pg_dump.h
src/bin/pg_dump/pg_dumpall.c
src/bin/pg_rewind/filemap.c
src/bin/pg_waldump/rmgrdesc.c
src/bin/pgbench/pgbench.c
src/bin/psql/command.c
src/bin/psql/describe.c
src/bin/psql/startup.c
src/bin/psql/tab-complete.c
src/common/Makefile
src/common/relpath.c
src/include/Makefile
src/include/access/hash.h
src/include/access/htup.h
src/include/access/rmgrlist.h
src/include/access/sysattr.h
src/include/access/transam.h
src/include/access/twophase.h
src/include/access/xact.h
src/include/access/xlog.h
src/include/bootstrap/bootstrap.h
src/include/c.h
src/include/catalog/catalog.h
src/include/catalog/dependency.h
src/include/catalog/heap.h
src/include/catalog/indexing.h
src/include/catalog/namespace.h
src/include/catalog/pg_class.h
src/include/catalog/pg_namespace.h
src/include/catalog/pg_proc.h
src/include/catalog/pg_type.h
src/include/commands/dbcommands.h
src/include/commands/explain.h
src/include/commands/prepare.h
src/include/commands/schemacmds.h
src/include/commands/sequence.h
src/include/commands/tablecmds.h
src/include/commands/trigger.h
src/include/commands/vacuum.h
src/include/commands/variable.h
src/include/common/relpath.h
src/include/executor/execdesc.h
src/include/executor/executor.h
src/include/executor/spi.h
src/include/executor/tuptable.h
src/include/libpq/libpq-be.h
src/include/miscadmin.h
src/include/nodes/bitmapset.h
src/include/nodes/execnodes.h
src/include/nodes/nodes.h
src/include/nodes/params.h
src/include/nodes/parsenodes.h
src/include/nodes/pg_list.h
src/include/nodes/plannodes.h
src/include/nodes/primnodes.h
src/include/nodes/relation.h
src/include/optimizer/cost.h
src/include/optimizer/pathnode.h
src/include/optimizer/planmain.h
src/include/optimizer/planner.h
src/include/parser/analyze.h
src/include/parser/gramparse.h
src/include/parser/kwlist.h
src/include/parser/parse_agg.h
src/include/parser/parse_func.h
src/include/parser/parse_relation.h
src/include/parser/parse_utilcmd.h
src/include/parser/parser.h
src/include/parser/scanner.h
src/include/pg_config.h.in
src/include/pg_config.h.win32
src/include/pgstat.h
src/include/port.h
src/include/postgres.h
src/include/postmaster/autovacuum.h
src/include/rewrite/rewriteHandler.h
src/include/storage/backendid.h
src/include/storage/lock.h
src/include/storage/lwlock.h
src/include/storage/proc.h
src/include/storage/procarray.h
src/include/storage/procsignal.h
src/include/storage/relfilenode.h
src/include/storage/smgr.h
src/include/tcop/dest.h
src/include/tcop/pquery.h
src/include/tcop/tcopprot.h
src/include/tcop/utility.h
src/include/utils/builtins.h
src/include/utils/elog.h
src/include/utils/guc.h
src/include/utils/guc_tables.h
src/include/utils/lsyscache.h
src/include/utils/plancache.h
src/include/utils/portal.h
src/include/utils/rel.h
src/include/utils/resowner_private.h
src/include/utils/snapshot.h
src/include/utils/syscache.h
src/include/utils/timestamp.h
src/include/utils/tuplesort.h
src/include/utils/tuplestore.h
src/interfaces/libpq/fe-auth.c
src/pl/plpgsql/src/pl_exec.c
src/pl/plpgsql/src/pl_gram.y
src/port/Makefile
src/port/getpeereid.c
src/test/regress/expected/aggregates.out
src/test/regress/expected/alter_generic.out
src/test/regress/expected/alter_table.out
src/test/regress/expected/arrays.out
src/test/regress/expected/box.out
src/test/regress/expected/brin.out
src/test/regress/expected/case.out
src/test/regress/expected/collate.out
src/test/regress/expected/combocid.out
src/test/regress/expected/copy2.out
src/test/regress/expected/create_index.out
src/test/regress/expected/create_table.out
src/test/regress/expected/create_table_like.out
src/test/regress/expected/create_view.out
src/test/regress/expected/date.out
src/test/regress/expected/enum.out
src/test/regress/expected/equivclass.out
src/test/regress/expected/event_trigger.out
src/test/regress/expected/foreign_data.out
src/test/regress/expected/foreign_key.out
src/test/regress/expected/gist.out
src/test/regress/expected/groupingsets.out
src/test/regress/expected/horology.out
src/test/regress/expected/indirect_toast.out
src/test/regress/expected/inet.out
src/test/regress/expected/inherit.out
src/test/regress/expected/insert.out
src/test/regress/expected/insert_conflict.out
src/test/regress/expected/interval.out
src/test/regress/expected/join.out
src/test/regress/expected/json.out
src/test/regress/expected/jsonb.out
src/test/regress/expected/limit.out
src/test/regress/expected/macaddr.out
src/test/regress/expected/matview.out
src/test/regress/expected/money.out
src/test/regress/expected/numeric.out
src/test/regress/expected/object_address.out
src/test/regress/expected/opr_sanity.out
src/test/regress/expected/plpgsql.out
src/test/regress/expected/polymorphism.out
src/test/regress/expected/prepared_xacts.out
src/test/regress/expected/privileges.out
src/test/regress/expected/rangefuncs.out
src/test/regress/expected/replica_identity.out
src/test/regress/expected/rolenames.out
src/test/regress/expected/rowsecurity.out
src/test/regress/expected/rowtypes.out
src/test/regress/expected/rules.out
src/test/regress/expected/sanity_check.out
src/test/regress/expected/select.out
src/test/regress/expected/select_parallel.out
src/test/regress/expected/select_views.out
src/test/regress/expected/sequence.out
src/test/regress/expected/stats.out
src/test/regress/expected/strings.out
src/test/regress/expected/subselect.out
src/test/regress/expected/tablesample.out
src/test/regress/expected/timestamptz.out
src/test/regress/expected/triggers.out
src/test/regress/expected/tsearch.out
src/test/regress/expected/txid.out
src/test/regress/expected/union.out
src/test/regress/expected/updatable_views.out
src/test/regress/expected/update.out
src/test/regress/expected/uuid.out
src/test/regress/expected/vacuum.out
src/test/regress/expected/with.out
src/test/regress/expected/xml.out
src/test/regress/expected/xml_1.out
src/test/regress/output/constraints.source
src/test/regress/output/tablespace.source
src/test/regress/parallel_schedule
src/test/regress/pg_regress.c
src/test/regress/serial_schedule
src/test/regress/sql/aggregates.sql
src/test/regress/sql/alter_table.sql
src/test/regress/sql/arrays.sql
src/test/regress/sql/box.sql
src/test/regress/sql/brin.sql
src/test/regress/sql/case.sql
src/test/regress/sql/combocid.sql
src/test/regress/sql/copy2.sql
src/test/regress/sql/create_index.sql
src/test/regress/sql/create_table.sql
src/test/regress/sql/create_table_like.sql
src/test/regress/sql/create_view.sql
src/test/regress/sql/date.sql
src/test/regress/sql/enum.sql
src/test/regress/sql/equivclass.sql
src/test/regress/sql/foreign_data.sql
src/test/regress/sql/foreign_key.sql
src/test/regress/sql/horology.sql
src/test/regress/sql/inet.sql
src/test/regress/sql/inherit.sql
src/test/regress/sql/insert.sql
src/test/regress/sql/insert_conflict.sql
src/test/regress/sql/interval.sql
src/test/regress/sql/join.sql
src/test/regress/sql/json.sql
src/test/regress/sql/jsonb.sql
src/test/regress/sql/matview.sql
src/test/regress/sql/money.sql
src/test/regress/sql/numeric.sql
src/test/regress/sql/opr_sanity.sql
src/test/regress/sql/plpgsql.sql
src/test/regress/sql/polymorphism.sql
src/test/regress/sql/prepared_xacts.sql
src/test/regress/sql/privileges.sql
src/test/regress/sql/rangefuncs.sql
src/test/regress/sql/rowsecurity.sql
src/test/regress/sql/rowtypes.sql
src/test/regress/sql/rules.sql
src/test/regress/sql/select.sql
src/test/regress/sql/select_views.sql
src/test/regress/sql/sequence.sql
src/test/regress/sql/subselect.sql
src/test/regress/sql/timestamptz.sql
src/test/regress/sql/triggers.sql
src/test/regress/sql/truncate.sql
src/test/regress/sql/tsearch.sql
src/test/regress/sql/txid.sql
src/test/regress/sql/typed_table.sql
src/test/regress/sql/union.sql
src/test/regress/sql/updatable_views.sql
src/test/regress/sql/update.sql
src/test/regress/sql/vacuum.sql
src/test/regress/sql/with.sql
src/test/regress/sql/xml.sql

diff --cc .gitignore
Simple merge
diff --cc COPYRIGHT
index aa6567a3068b90d621f6ee33fcc1990403ab4494,c320eccac08f7bec087f47efd48182eeca639d26..fa6acc3d93dac65b402d9ce47925bcadf235403e
+++ b/COPYRIGHT
@@@ -1,9 -1,7 +1,9 @@@
 -PostgreSQL Database Management System
 -(formerly known as Postgres, then as Postgres95)
 +Postgres-XL Cluster Database Management System
  
- Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 +Portions Copyright (c) 2012-2014, TransLattice, Inc.
 +Portions Copyright (c) 2010-2013, Postgres-XC Development Group
+ Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 +Portions Copyright (c) 2015-2016, 2ndQuadrant Limited
  
  Portions Copyright (c) 1994, The Regents of the University of California
  
diff --cc GNUmakefile.in
Simple merge
diff --cc README
index 7dd5bcbb71d4e8f02e08280d29215af32c29bbe5,12de3f1d73cc9e48f804845ccab24b630c2dbd80..4cafaa535b27628823faa58a89d56ee8801ccc75
--- 1/README
--- 2/README
+++ b/README
@@@ -1,23 -1,22 +1,23 @@@
 -PostgreSQL Database Management System
 -=====================================
 +Postgres-XL Database Management System
 +======================================
  
 -This directory contains the source code distribution of the PostgreSQL
 +This directory contains the source code distribution of the Postgres-XL
  database management system.
  
 -PostgreSQL is an advanced object-relational database management system
 -that supports an extended subset of the SQL standard, including
 -transactions, foreign keys, subqueries, triggers, user-defined types
 -and functions.  This distribution also contains C language bindings.
 +Postgres-XL is an advanced object-relational cluster database management
 + system that supports an extended subset of the SQL standard, including
 +transactions, foreign keys, user-defined types and functions.  This
 +distribution also contains C language bindings.
  
 -PostgreSQL has many language interfaces, many of which are listed here:
 +Postgres-XL has many language interfaces similar to PostgreSQL, many of
 +which are listed here:
  
-       http://www.postgresql.org/download
+       https://www.postgresql.org/download
  
  See the file INSTALL for instructions on how to build and install
 -PostgreSQL.  That file also lists supported operating systems and
 +Postgres-XL.  That file also lists supported operating systems and
  hardware platforms and contains information regarding any other
 -software packages that are required to build or run the PostgreSQL
 +software packages that are required to build or run the Postgres-XL
  system.  Copyright and license information can be found in the
  file COPYRIGHT.  A comprehensive documentation set is included in this
  distribution; it can be read as described in the installation
diff --cc configure
index cbfcb1815723526b8a0aa991f8a58a77b03f9f80,8208ecdb4ff5fbaa77d746a7b4c887719fd48a07..b458a5aa9a84cc08c4b5908560a4441bba3a7602
+++ b/configure
@@@ -1,8 -1,8 +1,8 @@@
  #! /bin/sh
  # Guess values for system-dependent variables and create Makefiles.
- # Generated by GNU Autoconf 2.69 for PostgreSQL 9.6beta4 (Postgres-XL 9.6alpha1).
 -# Generated by GNU Autoconf 2.69 for PostgreSQL 10beta1.
++# Generated by GNU Autoconf 2.69 for PostgreSQL 10beta1 (Postgres-XL 10alpha1).
  #
 -# Report bugs to <pgsql-bugs@postgresql.org>.
 +# Report bugs to <bugs@postgres-xl.org>.
  #
  #
  # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@@ -582,10 -582,9 +582,10 @@@ MAKEFLAGS
  # Identity of this package.
  PACKAGE_NAME='PostgreSQL'
  PACKAGE_TARNAME='postgresql'
- PACKAGE_VERSION='9.6beta4 (Postgres-XL 9.6alpha1)'
- PACKAGE_XC_VERSION='9.6alpha1'
- PACKAGE_STRING='PostgreSQL 9.6beta4 (Postgres-XL 9.6alpha1)'
 -PACKAGE_VERSION='10beta1'
 -PACKAGE_STRING='PostgreSQL 10beta1'
 -PACKAGE_BUGREPORT='[email protected]'
++PACKAGE_VERSION='10beta1 (Postgres-XL 10alpha1)'
++PACKAGE_XC_VERSION='10alpha1'
++PACKAGE_STRING='PostgreSQL 10beta1 (Postgres-XL 10alpha1)'
 +PACKAGE_BUGREPORT='[email protected]'
  PACKAGE_URL=''
  
  ac_unique_file="src/backend/access/common/heaptuple.c"
@@@ -821,9 -823,9 +826,10 @@@ with_wal_blocksiz
  with_wal_segsize
  with_CC
  enable_depend
 +enable_genmsgids
  enable_cassert
  enable_thread_safety
+ with_icu
  with_tcl
  with_tclconfig
  with_perl
@@@ -1402,7 -1408,7 +1412,7 @@@ if test "$ac_init_help" = "long"; the
    # Omit some internal or obsolete options to make the list less imposing.
    # This message is too long to be a string in the A/UX 3.1 sh.
    cat <<_ACEOF
- \`configure' configures PostgreSQL 9.6beta4 (Postgres-XL 9.6alpha1) to adapt to many kinds of systems.
 -\`configure' configures PostgreSQL 10beta1 to adapt to many kinds of systems.
++\`configure' configures PostgreSQL 10beta1 (Postgres-XL 10alpha1) to adapt to many kinds of systems.
  
  Usage: $0 [OPTION]... [VAR=VALUE]...
  
  
  if test -n "$ac_init_help"; then
    case $ac_init_help in
-      short | recursive ) echo "Configuration of PostgreSQL 9.6beta4 (Postgres-XL 9.6alpha1):";;
 -     short | recursive ) echo "Configuration of PostgreSQL 10beta1:";;
++     short | recursive ) echo "Configuration of PostgreSQL 10beta1 (Postgres-XL 10alpha1):";;
     esac
    cat <<\_ACEOF
  
  test -n "$ac_init_help" && exit $ac_status
  if $ac_init_version; then
    cat <<\_ACEOF
- PostgreSQL configure 9.6beta4 (Postgres-XL 9.6alpha1)
 -PostgreSQL configure 10beta1
++PostgreSQL configure 10beta1 (Postgres-XL 10alpha1)
  generated by GNU Autoconf 2.69
  
  Copyright (C) 2012 Free Software Foundation, Inc.
@@@ -2330,7 -2343,7 +2347,7 @@@ cat >config.log <<_ACEO
  This file contains any messages produced by compilers while
  running configure, to aid debugging if configure makes a mistake.
  
- It was created by PostgreSQL $as_me 9.6beta4 (Postgres-XL 9.6alpha1), which was
 -It was created by PostgreSQL $as_me 10beta1, which was
++It was created by PostgreSQL $as_me 10beta1 (Postgres-XL 10alpha1), which was
  generated by GNU Autoconf 2.69.  Invocation command line was
  
    $ $0 $@
@@@ -16483,7 -17019,7 +17069,7 @@@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_wr
  # report actual input values of CONFIG_FILES etc. instead of their
  # values after options handling.
  ac_log="
- This file was extended by PostgreSQL $as_me 9.6beta4 (Postgres-XL 9.6alpha1), which was
 -This file was extended by PostgreSQL $as_me 10beta1, which was
++This file was extended by PostgreSQL $as_me 10beta1 (Postgres-XL 10alpha1), which was
  generated by GNU Autoconf 2.69.  Invocation command line was
  
    CONFIG_FILES    = $CONFIG_FILES
@@@ -16553,7 -17089,7 +17139,7 @@@ _ACEO
  cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
  ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
  ac_cs_version="\\
- PostgreSQL config.status 9.6beta4 (Postgres-XL 9.6alpha1)
 -PostgreSQL config.status 10beta1
++PostgreSQL config.status 10beta1 (Postgres-XL 10alpha1)
  configured by $0, generated by GNU Autoconf 2.69,
    with options \\"\$ac_cs_config\\"
  
diff --cc configure.in
index 3c77bebcdce6385571f759e85f3fc6fe1e1cc4a5,e9f85b805dae870dd6a77f9bac6a266c03e14e73..e1b1780a5fd6230f70f8148a621da44319264d67
@@@ -17,7 -17,7 +17,7 @@@ dnl Read the Autoconf manual for detail
  dnl
  m4_pattern_forbid(^PGAC_)dnl to catch undefined macros
  
- AC_INIT([PostgreSQL], [9.6beta4 (Postgres-XL 9.6alpha1)], [[email protected]])
 -AC_INIT([PostgreSQL], [10beta1], [[email protected]])
++AC_INIT([PostgreSQL], [10beta1 (Postgres-XL 10alpha1)], [[email protected]])
  
  m4_if(m4_defn([m4_PACKAGE_VERSION]), [2.69], [], [m4_fatal([Autoconf version 2.69 is required.
  Untested combinations of 'autoconf' and PostgreSQL versions are not
index fedc61b243eb9fe4279df2d55e9e3f8cc1f3f13f,e84eb67008032e4c53df45a92876ca32903a89d9..d2503412701d67ba1b7f44e67a744d0747f8cd30
@@@ -50,10 -48,8 +51,9 @@@ SUBDIRS = 
                test_decoding   \
                tsm_system_rows \
                tsm_system_time \
-               tsearch2        \
                unaccent        \
 -              vacuumlo
 +              vacuumlo        \
 +              stormstats
  
  ifeq ($(with_openssl),yes)
  SUBDIRS += sslinfo
index 3623f9d91ce695d685d70d015ecf9c676b8e1d83,563cd22dcccbb53222ea42aea499e6498e904b8d..a6e026dafb4b2ebf83ac29e67b6ff192bc651306
mode 100755,100644..100755
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index cc56d80bb56097e7a144efac00f62b73ca333efc,bf03e67513f262133fa6882892d8975c9ad42227..3c35604b5d665d0e623463408a6bb84b2ca7d801
@@@ -289,20 -294,18 +294,22 @@@ static void pgss_post_parse_analyze(Par
  static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags);
  static void pgss_ExecutorRun(QueryDesc *queryDesc,
                                 ScanDirection direction,
-                                uint64 count);
+                                uint64 count, bool execute_once);
  static void pgss_ExecutorFinish(QueryDesc *queryDesc);
  static void pgss_ExecutorEnd(QueryDesc *queryDesc);
- static void pgss_ProcessUtility(Node *parsetree, const char *queryString,
+ static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
                                        ProcessUtilityContext context, ParamListInfo params,
 -                                      DestReceiver *dest, char *completionTag);
+                                       QueryEnvironment *queryEnv,
 +                                      DestReceiver *dest,
 +#ifdef PGXC
 +                                      bool sentToRemote,
 +#endif /* PGXC */
 +                                      char *completionTag);
  static uint32 pgss_hash_fn(const void *key, Size keysize);
  static int    pgss_match_fn(const void *key1, const void *key2, Size keysize);
- static uint32 pgss_hash_string(const char *str);
+ static uint32 pgss_hash_string(const char *str, int len);
  static void pgss_store(const char *query, uint32 queryId,
+                  int query_location, int query_len,
                   double total_time, uint64 rows,
                   const BufferUsage *bufusage,
                   pgssJumbleState *jstate);
@@@ -946,14 -956,13 +960,17 @@@ pgss_ExecutorEnd(QueryDesc *queryDesc
   * ProcessUtility hook
   */
  static void
- pgss_ProcessUtility(Node *parsetree, const char *queryString,
+ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
 -                                      ProcessUtilityContext context,
 -                                      ParamListInfo params, QueryEnvironment *queryEnv,
 -                                      DestReceiver *dest, char *completionTag)
 +                                      ProcessUtilityContext context, ParamListInfo params,
++                                      QueryEnvironment *queryEnv,
 +                                      DestReceiver *dest,
 +#ifdef PGXC
 +                                      bool sentToRemote,
 +#endif /* PGXC */
 +                                      char *completionTag)
  {
+       Node       *parsetree = pstmt->utilityStmt;
        /*
         * If it's an EXECUTE statement, we don't track it and don't increment the
         * nesting level.  This allows the cycles to be charged to the underlying
                PG_TRY();
                {
                        if (prev_ProcessUtility)
-                               prev_ProcessUtility(parsetree, queryString,
-                                                                       context, params,
+                               prev_ProcessUtility(pstmt, queryString,
+                                                                       context, params, queryEnv,
 -                                                                      dest, completionTag);
 +                                                                      dest,
- #ifdef PGXC
 +                                                                      sentToRemote,
- #endif /* PGXC */
 +                                                                      completionTag);
                        else
-                               standard_ProcessUtility(parsetree, queryString,
-                                                                               context, params,
+                               standard_ProcessUtility(pstmt, queryString,
+                                                                               context, params, queryEnv,
 -                                                                              dest, completionTag);
 +                                                                              dest,
- #ifdef PGXC
 +                                                                              sentToRemote,
- #endif /* PGXC */
 +                                                                              completionTag);
                        nested_level--;
                }
                PG_CATCH();
        else
        {
                if (prev_ProcessUtility)
-                       prev_ProcessUtility(parsetree, queryString,
-                                                               context, params,
+                       prev_ProcessUtility(pstmt, queryString,
+                                                               context, params, queryEnv,
 -                                                              dest, completionTag);
 +                                                              dest,
- #ifdef PGXC
 +                                                              sentToRemote,
- #endif /* PGXC */
 +                                                              completionTag);
                else
-                       standard_ProcessUtility(parsetree, queryString,
-                                                                       context, params,
+                       standard_ProcessUtility(pstmt, queryString,
+                                                                       context, params, queryEnv,
 -                                                                      dest, completionTag);
 +                                                                      dest,
- #ifdef PGXC
 +                                                                      sentToRemote,
- #endif /* PGXC */
 +                                                                      completionTag);
        }
  }
  
Simple merge
index 41193222385a99e30a13a92993192dbf97da51bb,c4b978b48f2c830e3b1177b3e5dc5fb7d16edbd5..dadf99e74b60c05f7a44fa902509bf22d182ca34
@@@ -301,12 -302,11 +302,14 @@@ sepgsql_utility_command(PlannedStmt *ps
                                                const char *queryString,
                                                ProcessUtilityContext context,
                                                ParamListInfo params,
+                                               QueryEnvironment *queryEnv,
                                                DestReceiver *dest,
 +#ifdef PGXC
 +                                              bool sentToRemote,
 +#endif /* PGXC */
                                                char *completionTag)
  {
+       Node       *parsetree = pstmt->utilityStmt;
        sepgsql_context_info_t saved_context_info = sepgsql_context_info;
        ListCell   *cell;
  
                }
  
                if (next_ProcessUtility_hook)
-                       (*next_ProcessUtility_hook) (parsetree, queryString,
-                                                                                context, params,
+                       (*next_ProcessUtility_hook) (pstmt, queryString,
+                                                                                context, params, queryEnv,
 -                                                                               dest, completionTag);
 +                                                                               dest,
- #ifdef PGXC
 +                                                                               sentToRemote,
- #endif
 +                                                                               completionTag);
                else
-                       standard_ProcessUtility(parsetree, queryString,
-                                                                       context, params,
+                       standard_ProcessUtility(pstmt, queryString,
+                                                                       context, params, queryEnv,
 -                                                                      dest, completionTag);
 +                                                                      dest,
- #ifdef PGXC
 +                                                                      sentToRemote,
- #endif
 +                                                                      completionTag);
        }
        PG_CATCH();
        {
index b0ec8a3f02c4a7c68470e9af3073d5ebfbd9f0f3,70ce03163aff1cb2caa6248fe4a3a1f893998387..d4d5f4d4f2006d5e1b69430bdf3d3474ce24b4f5
@@@ -27,7 -27,7 +27,7 @@@ System Configuration
  
    Operating System (example: Linux 2.4.18)    :
  
-   PostgreSQL version (example: PostgreSQL 9.6beta4):  Postgres-XL 9.6alpha1
 -  PostgreSQL version (example: PostgreSQL 10beta1):  PostgreSQL 10beta1
++  PostgreSQL version (example: PostgreSQL 10beta1):  Postgres-XL 10alpha1
  
    Compiler used (example: gcc 3.3.5)          :
  
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index f7841719831b4232f8ae90c0e2a833914fa88ee7,84c4f209909072a755c993eaf2ba53e16f6eec6f..71339bf81de82c543b54a853487f6681c1b14909
mode 100755,100644..100755
Simple merge
index eba66890885cf249492788fb580fcf77bc93a1de,b914086009f619c66d732209a2048b36869cef7d..37bddc7f4e88fe28dfb437ebe056b5941b6ff93a
@@@ -49,8 -47,7 +47,9 @@@
  <!ENTITY config        SYSTEM "config.sgml">
  <!ENTITY user-manag    SYSTEM "user-manag.sgml">
  <!ENTITY wal           SYSTEM "wal.sgml">
 +<!ENTITY add-node      SYSTEM "add-node.sgml">
 +<!ENTITY remove-node   SYSTEM "remove-node.sgml">
+ <!ENTITY logical-replication    SYSTEM "logical-replication.sgml">
  
  <!-- programmer's guide -->
  <!ENTITY bgworker   SYSTEM "bgworker.sgml">
  <!ENTITY sourcerepo SYSTEM "sourcerepo.sgml">
  
  <!ENTITY release    SYSTEM "release.sgml">
+ <!ENTITY release-10     SYSTEM "release-10.sgml">
  <!ENTITY release-9.6    SYSTEM "release-9.6.sgml">
 +<!ENTITY release-xl-9.5r1    SYSTEM "release-xl-9.5r1.sgml">
  <!ENTITY release-9.5    SYSTEM "release-9.5.sgml">
  <!ENTITY release-9.4    SYSTEM "release-9.4.sgml">
  <!ENTITY release-9.3    SYSTEM "release-9.3.sgml">
index 22f52ce256bab8041eca8faa07ed1e04dd07792a,14aae736c3427bc9581f3d64f4599e6462030729..7c5cbab2a2c06adace01074114c4585970ec4741
@@@ -18103,15 -18666,9 +18708,15 @@@ postgres=# SELECT * FROM pg_walfile_nam
      needs to be archived.
     </para>
  
 +   <para>
 +    Please note that these functions works just locally.  To issue
 +    these functions to another Coordinators or Datanodes, you should
 +    issue these functions through <type>EXECUTE DIRECT</> statement.
 +   </para>
 +
     <para>
-     <function>pg_xlog_location_diff</> calculates the difference in bytes
-     between two transaction log locations. It can be used with
+     <function>pg_wal_lsn_diff</> calculates the difference in bytes
+     between two write-ahead log locations. It can be used with
      <structname>pg_stat_replication</structname> or some functions shown in
      <xref linkend="functions-admin-backup-table"> to get the replication lag.
     </para>
Simple merge
Simple merge
index b1738a75a7a5b47b9591c6ad8f129abb2ff6bd98,233ba0e6687408c409dd5d2d7449ad4e0ae0079c..4e5503c3c6065964cd4286b9b358be14c10eed87
      <term>Web Site</term>
      <listitem>
       <para>
-       The <productname>Postgres-XL</productname> 
-       <ulink url="http://www.postgres-xl.org/">web site</ulink>
 -      The <productname>PostgreSQL</productname>
 -      <ulink url="https://www.postgresql.org">web site</ulink>
++      The <productname>Postgres-XL</productname>
++      <ulink url="http://www.postgres-xl.org">web site</ulink>
        carries details on the latest release and other
        information to make your work or play with
 -      <productname>PostgreSQL</productname> more productive.
 +      <productname>Postgres-XL</productname> more productive.
       </para>
      </listitem>
     </varlistentry>
Simple merge
Simple merge
index ae7f5a7d317f198d9409c40f3824a30e452f94aa,67ef88b2ff9c79cc184a2fd99463680feac93cd0..601d063a3a2ff2f5b3434b73ebafdba7be7a8317
@@@ -1,27 -1,11 +1,27 @@@
  <!-- doc/src/sgml/legal.sgml -->
  
- <date>2016</date>
+ <date>2017</date>
  
  <copyright>
-  <year>1996-2016</year>
+  <year>1996-2017</year>
   <holder>The PostgreSQL Global Development Group</holder>
  </copyright>
 +<copyright>
 + <year>2014-2016</year>
 + <holder>Postgres-XL Development Group</holder>
 +</copyright>
 +<copyright>
 + <year>2009-2012</year>
 + <holder>Postgres-XC Development Group</holder>
 +</copyright>
 +<copyright>
 + <year>2012-2014</year>
 + <holder>TransLattice, Inc.</holder>
 +</copyright>
 +<copyright>
 + <year>2015-2016</year>
 + <holder>2ndQuadrant Ltd</holder>
 +</copyright>
  
  <legalnotice id="legalnotice">
   <title>Legal Notice</title>
Simple merge
Simple merge
index fbab22e715d94268593e21075941beb6ffb3c3b9,65a64c85ec012452c999c7fdba6868b9a9e4c0de..cfaa0da4b836d0aaa1a853f7f526b6fc2e8e42c4
       <secondary>of transaction IDs</secondary>
      </indexterm>
  
 +   <para>
 +    Please note that this section describes the tasks of individual
 +    Coordinators and Datanodes.  It should be done for each of them.
 +   </para>
 +
     <para>
-     <productname>PostgreSQL</productname>'s MVCC transaction semantics
+     <productname>PostgreSQL</productname>'s
+     <link linkend="mvcc-intro">MVCC</link> transaction semantics
      depend on being able to compare transaction ID (<acronym>XID</>)
      numbers: a row version with an insertion XID greater than the current
      transaction's XID is <quote>in the future</> and should not be visible
Simple merge
index 96ac82afef1040d53ee9870cef9c9e921c668516,ccdaf3e0aca3b699f0af0e452134a90e52e47f52..3809c4e7f9256c531137bd86ae1f4c6b9a480221
    debugging purposes.  All of these functions may be used only by superusers.
   </para>
  
 + <para>
 +  Functions of this module returns information about connecting Coordinators
 +  locally.  To get information from a specific a Datanode, you can use EXECUTE
 +  DIRECT from a Coordinator. 
 + </para>
 +
   <sect2>
-   <title>Functions</title>
+   <title>General Functions</title>
  
    <variablelist>
     <varlistentry>
Simple merge
index 6c5be94a4c360a1b4b530d684692c60aab5fac10,4e53009ae073b69b1545b48141e5a511554ec3e4..2e26d9e202429e0795ee6f364d3fd747ea75f198
   </para>
  
   <para>
-   By default public access is revoked from both of these, just in case there
-   are security issues lurking.
+   By default use is restricted to superusers and members of the
+   <literal>pg_read_all_stats</literal> role. Access may be granted to others
+   using <command>GRANT</command>.
   </para>
  
 + <para>
 +  <filename>pg_buffercache</filename> returns information local to the
 +  connecting Coordinator.  To inquire information local to other node,
 +  use <command>EXECUTE DIRECT</command>.
 + </para>
 +
   <sect2>
    <title>The <structname>pg_buffercache</structname> View</title>
  
index a88cd52678757263fa0e05e42115449dfe4f53bd,43e154a2f3aa7fcc9f70957294054143879e0ecc..924d512f6a4b41eb9fafb356bdf9ee0f55a4555a
   </para>
  
   <para>
-   By default public access is revoked from the functions, just in case
-   there are security issues lurking.
+   By default use is restricted to superusers and members of the
+   <literal>pg_stat_scan_tables</literal> role. Access may be granted to others
+   using <command>GRANT</command>.
   </para>
  
 + <para>
 +  Functions of this module return information from the Coordinator that the
 +  session is currently connected to.  To get information about a Datanode, you
 +  can use <command>EXECUTE DIRECT</command>.
 + </para>
 +
   <sect2>
    <title>Functions</title>
  
index 4fae2fad98dde9b091627c4e590321ad32a5d01a,65d532e081041dac7162af172face346584997ef..b0cde6d01a55b1fd4ed37c27107ffedacaf07e51
    locking information for a specified table.
   </para>
  
+  <para>
+   By default use is restricted to superusers, members of the
+   <literal>pg_stat_scan_tables</literal> role, and users with
+   <literal>SELECT</literal> permissions on the table.
+  </para>
 + <para>
 +  Functions of this module return information from the 
 +  Coordinator that the session is currently connect to.  
 +  To get information about a Datanode, you can
 +  use <command>EXECUTE DIRECT</command>.
 + </para>
  
   <sect2>
    <title>Overview</title>
Simple merge
index b5e2ea7187ff0f5d5df92b372e6d7336342c98a5,b6a5f19e706ba8d692a0f070add2292172b57c95..63412b03ef6e92062d3a27282a5e96f749744994
    obtain tuple-level statistics.
   </para>
  
+  <para>
+   As these functions return detailed page-level information, only the superuser
+   has EXECUTE privileges on them upon installation.  After the functions have
+   been installed, users may issue <command>GRANT</command> commands to change
+   the privileges on the functions to allow non-superusers to execute them. Members
+   of the <literal>pg_stat_scan_tables</literal> role are granted access by default. See
+   the description of the <xref linkend="sql-grant"> command for specifics.
+  </para>
 + <para>
 +  Functions of this module return information from the Coordinator that the
 +  session is currently connected to.  To get information about a Datanode, you
 +  can use <command>EXECUTE DIRECT</command>.
 + </para>
  
   <sect2>
    <title>Functions</title>
Simple merge
Simple merge
Simple merge
Simple merge
index e04a157cc5957451b8e5523045f8b764667b187b,0a5d086248c37309401e2b8dfcee97bf94a9a66d..4797156eddfb429b5f4169e03ffe0cc6da7e5df9
@@@ -157,9 -157,10 +157,9 @@@ restore_command = 'copy "C:\\server\\ar
        By default, recovery will recover to the end of the WAL log. The
        following parameters can be used to specify an earlier stopping point.
        At most one of <varname>recovery_target</>,
-       <varname>recovery_target_name</>, <varname>recovery_target_time</>,
 -      <varname>recovery_target_lsn</>, <varname>recovery_target_name</>,
 -      <varname>recovery_target_time</>, or <varname>recovery_target_xid</>
 -      can be used; if more than one of these is specified in the configuration
 -      file, the last entry will be used.
++      <varname>recovery_target_lsn</>,  <varname>recovery_target_name</>, <varname>recovery_target_time</>,
 +      <varname>recovery_target_xid</> or <varname>recovery_target_barrier</> can be used; if more than one of these
 +      is specified in the configuration file, the last entry will be used.
       </para>
  
       <variablelist>
         </para>
        </listitem>
       </varlistentry>
 +     <varlistentry id="recovery-target-barrier" xreflabel="recovery_target_barrier">
 +      <term><varname>recovery_target_barrier</varname> (<type>string</type>)
 +       <indexterm>
 +         <primary><varname>recovery_target_barrier</> recovery parameter</primary>
 +       </indexterm>
 +      </term>
 +      <listitem>
 +       <para>
 +        This parameter specifies the barrier ID up to which recovery
 +        will proceed. A global consistency is guaranteed when recovery is
 +        stopped at a previously successfully completed barrier. At most
 +        one of <varname>recovery_target_xid</>,
 +        <xref linkend="recovery-target-time"> and 
 +        <varname>recovery_target_barrier</> can be specified.
++     </varlistentry>
+      <varlistentry id="recovery-target-lsn" xreflabel="recovery_target_lsn">
+       <term><varname>recovery_target_lsn</varname> (<type>pg_lsn</type>)
+       <indexterm>
+         <primary><varname>recovery_target_lsn</> recovery parameter</primary>
+       </indexterm>
+       </term>
+       <listitem>
+        <para>
+         This parameter specifies the LSN of the write-ahead log location up
+         to which recovery will proceed. The precise stopping point is also
+         influenced by <xref linkend="recovery-target-inclusive">. This
+         parameter is parsed using the system data type
+         <link linkend="datatype-pg-lsn"><type>pg_lsn</></link>.
         </para>
        </listitem>
       </varlistentry>
index 0c5e3b350e0f23ad0267e2fda4deadd41b931b16,01acc2ef9dad1986af16c7b4e46afbeb22f4c6c8..3ef0b5af791ad50c841f65edd45adb19427f9224
@@@ -186,17 -185,11 +195,15 @@@ Complete list of usable sgml source fil
  <!-- applications and utilities -->
  <!ENTITY clusterdb          SYSTEM "clusterdb.sgml">
  <!ENTITY createdb           SYSTEM "createdb.sgml">
- <!ENTITY createlang         SYSTEM "createlang.sgml">
  <!ENTITY createuser         SYSTEM "createuser.sgml">
  <!ENTITY dropdb             SYSTEM "dropdb.sgml">
- <!ENTITY droplang           SYSTEM "droplang.sgml">
  <!ENTITY dropuser           SYSTEM "dropuser.sgml">
  <!ENTITY ecpgRef            SYSTEM "ecpg-ref.sgml">
 +<!ENTITY gtm                system "gtm.sgml">
 +<!ENTITY gtmPxy             system "gtm_proxy.sgml">
 +<!ENTITY gtmCtl             system "gtm_ctl.sgml">
  <!ENTITY initdb             SYSTEM "initdb.sgml">
 +<!ENTITY initgtm            SYSTEM "initgtm.sgml">
  <!ENTITY pgarchivecleanup   SYSTEM "pgarchivecleanup.sgml">
  <!ENTITY pgBasebackup       SYSTEM "pg_basebackup.sgml">
  <!ENTITY pgbench            SYSTEM "pgbench.sgml">
index 8deb80ab63c052eb7eac0c4aad3866437736b070,56ea830d413f83c76b7398113a3d13bfbf4b8455..1dfbf6d3c802c12992a2892d92cf3c120dfab71d
mode 100755,100644..100755
@@@ -705,103 -765,61 +765,154 @@@ ALTER TABLE [ IF EXISTS ] <replaceable 
     </varlistentry>
  
     <varlistentry>
 +    <term><literal>DISTRIBUTE BY</literal></term>
 +    <listitem>
 +     <para>
 +      This clause specifies how the table is distributed or replicated among Datanodes.
 +     </para>
 +
 +     <variablelist>
 +
 +      <varlistentry>
 +       <term><literal>REPLICATION</literal></term>
 +       <listitem>
 +        <para>
 +         Each row of the table will be replicated into all the
 +         Datanodes of the <productname>Postgres-XL</> database
 +         cluster.
 +        </para>
 +       </listitem>
 +      </varlistentry>
 +
 +      <varlistentry>
 +       <term><literal>ROUNDROBIN</literal></term>
 +       <listitem>
 +        <para>
 +         Each row of the table will be placed in one of the Datanodes in a
 +         round-robin manner.  The value of the row will not be needed to
 +         determine what Datanode to go.
 +        </para>
 +       </listitem>
 +      </varlistentry>
 +
 +      <varlistentry>
 +       <term><literal>HASH ( <replaceable class="PARAMETER">column_name</> )</literal></term>
 +       <listitem>
 +        <para>
 +         Each row of the table will be placed based on the hash value
 +         of the specified column.  Following type is allowed as
 +         distribution column: INT8, INT2, OID, INT4, BOOL, INT2VECTOR,
 +         OIDVECTOR, CHAR, NAME, TEXT, BPCHAR, BYTEA, VARCHAR, NUMERIC, MONEY,
 +         ABSTIME, RELTIME, DATE, TIME,TIMESTAMP, TIMESTAMPTZ, INTERVAL, and TIMETZ.
 +        </para>
 +        <para>
 +         Please note that floating point is not allowed as a basis of
 +         the distribution column.
 +        </para>
 +       </listitem>
 +      </varlistentry>
 +
 +      <varlistentry>
 +       <term><literal>MODULO ( <replaceable class="PARAMETER">column_name</> )</literal></term>
 +       <listitem>
 +        <para>
 +         Each row of the table will be placed based on the modulo
 +         of the specified column.  Following type is allowed as
 +         distribution column: INT8, INT2, INT4, BOOL, ABSTIME, RELTIME, 
 +         DATE.
 +        </para>
 +        <para>
 +         Please note that floating point is not allowed as a basis of
 +         the distribution column.
 +        </para>
 +       </listitem>
 +      </varlistentry>
 +     </variablelist>
+     <term><literal>ATTACH PARTITION <replaceable class="PARAMETER">partition_name</replaceable> FOR VALUES <replaceable class="PARAMETER">partition_bound_spec</replaceable></literal></term>
+     <listitem>
+      <para>
+       This form attaches an existing table (which might itself be partitioned)
+       as a partition of the target table using the same syntax for
+       <replaceable class="PARAMETER">partition_bound_spec</replaceable> as
+       <xref linkend="sql-createtable">.  The partition bound specification
+       must correspond to the partitioning strategy and partition key of the
+       target table.  The table to be attached must have all the same columns
+       as the target table and no more; moreover, the column types must also
+       match.  Also, it must have all the <literal>NOT NULL</literal> and
+       <literal>CHECK</literal> constraints of the target table.  Currently
+       <literal>UNIQUE</literal>, <literal>PRIMARY KEY</literal>, and
+       <literal>FOREIGN KEY</literal> constraints are not considered.
+       If any of the <literal>CHECK</literal> constraints of the table being
+       attached is marked <literal>NO INHERIT</literal>, the command will fail;
+       such a constraint must be recreated without the <literal>NO INHERIT</literal>
+       clause.
+      </para>
+      <para>
+       If the new partition is a regular table, a full table scan is performed
+       to check that no existing row in the table violates the partition
+       constraint.  It is possible to avoid this scan by adding a valid
+       <literal>CHECK</literal> constraint to the table that would allow only
+       the rows satisfying the desired partition constraint before running this
+       command.  It will be determined using such a constraint that the table
+       need not be scanned to validate the partition constraint.  This does not
+       work, however, if any of the partition keys is an expression and the
+       partition does not accept <literal>NULL</literal> values.  If attaching
+       a list partition that will not accept <literal>NULL</literal> values,
+       also add <literal>NOT NULL</literal> constraint to the partition key
+       column, unless it's an expression.
+      </para>
+      <para>
+       If the new partition is a foreign table, nothing is done to verify
+       that all the rows in the foreign table obey the partition constraint.
+       (See the discussion in <xref linkend="SQL-CREATEFOREIGNTABLE"> about
+       constraints on the foreign table.)
+      </para>
      </listitem>
     </varlistentry>
  
 -
 +   <varlistentry>
 +    <term><literal>TO GROUP</literal></term>
 +    <term><literal>TO NODE</literal></term>
 +      <listitem>
 +       <para>
 +        This defines the list of nodes on which table data exists.
 +       </para>
 +      </listitem>
 +   </varlistentry>
 +
 +   <varlistentry>
 +     <term><literal>ADD NODE</literal></term>
 +       <listitem>
 +        <para>
 +         This adds a list of nodes where data of table is distributed
 +         to the existing list. If the list of nodes added contains nodes
 +         already used by table, an error is returned.
 +        </para>
 +       </listitem>
 +   </varlistentry>
 +
 +   <varlistentry>
 +     <term><literal>DELETE NODE</literal></term>
 +       <listitem>
 +        <para>
 +         This deletes a list of nodes where the data of a table is distributed
 +         to the existing list. If the list of nodes deleted contains nodes not
 +         used by table, an error is returned.
 +        </para>
 +       </listitem>
 +   </varlistentry>
+    <varlistentry>
+     <term><literal>DETACH PARTITION</literal> <replaceable class="PARAMETER">partition_name</replaceable></term>
+     <listitem>
+      <para>
+       This form detaches specified partition of the target table.  The detached
+       partition continues to exist as a standalone table, but no longer has any
+       ties to the table from which it was detached.
+      </para>
+     </listitem>
+    </varlistentry>
    </variablelist>
    </para>
  
        </listitem>
       </varlistentry>
  
 +     <varlistentry>
 +      <term><replaceable class="PARAMETER">nodename</replaceable></term>
 +        <listitem>
 +         <para>
 +          It defines a <productname>Postgres-XL</productname> node of catalog pgxc_node.
 +         </para>
 +        </listitem>
 +     </varlistentry>
 +
 +     <varlistentry>
 +      <term><replaceable class="PARAMETER">groupname</replaceable></term>
 +        <listitem>
 +         <para>
 +          It defines a <productname>Postgres-XL</productname> node group in catalog pgxc_group.
 +         </para>
 +        </listitem>
 +     </varlistentry>
 +
+      <varlistentry>
+       <term><replaceable class="PARAMETER">partition_name</replaceable></term>
+       <listitem>
+        <para>
+         The name of the table to attach as a new partition or to detach from this table.
+        </para>
+       </listitem>
+      </varlistentry>
+      <varlistentry>
+       <term><replaceable class="PARAMETER">partition_bound_spec</replaceable></term>
+       <listitem>
+        <para>
+         The partition bound specification for a new partition.  Refer to
+         <xref linkend="sql-createtable"> for more details on the syntax of the same.
+        </para>
+       </listitem>
+      </varlistentry>
      </variablelist>
   </refsect1>
  
@@@ -1419,28 -1381,27 +1566,49 @@@ ALTER TABLE distributors DROP CONSTRAIN
      ADD CONSTRAINT distributors_pkey PRIMARY KEY USING INDEX dist_id_temp_idx;
  </programlisting></para>
  
 +  <para>
 +    To change the distribution type and the list of nodes where table data
 +    is located:
 +<programlisting>
 +ALTER TABLE distributors TO NODE (dn1, dn7), DISTRIBUTE BY HASH(dist_id);
 +</programlisting>
 +  </para>
 +
 +  <para>
 +    To add a node where data of table is distributed:
 +<programlisting>
 +ALTER TABLE distributors ADD NODE (dn9, dn14);
 +</programlisting>
 +  </para>
 +
 +  <para>
 +    To remove a node where data of table is distributed:
 +<programlisting>
 +ALTER TABLE distributors DELETE NODE (dn4, dn0);
 +</programlisting>
 +  </para>
 +
+   <para>
+    Attach a partition to range partitioned table:
+ <programlisting>
+ ALTER TABLE measurement
+     ATTACH PARTITION measurement_y2016m07 FOR VALUES FROM ('2016-07-01') TO ('2016-08-01');
+ </programlisting></para>
+   <para>
+    Attach a partition to list partitioned table:
+ <programlisting>
+ ALTER TABLE cities
+     ATTACH PARTITION cities_ab FOR VALUES IN ('a', 'b');
+ </programlisting></para>
+   <para>
+    Detach a partition from partitioned table:
+ <programlisting>
+ ALTER TABLE cities
+     DETACH PARTITION measurement_y2015m12;
+ </programlisting></para>
   </refsect1>
  
   <refsect1>
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index 2b842d5fb980b7d71e787d3e7952c15f74605a02,0478e40447df1b014c1cd27be058db826ac0b0f1..8d1e4ee487f49c47fad5b83ae080f1e0eb02a887
mode 100755,100644..100755
@@@ -1698,30 -1831,25 +1979,49 @@@ CREATE TABLE cities_ab_10000_to_10000
      effect can be had using the OID feature.
     </para>
    </refsect2>
+   <refsect2>
+    <title><literal>PARTITION BY</> Clause</title>
+    <para>
+     The <literal>PARTITION BY</> clause is a
+     <productname>PostgreSQL</productname> extension.
+    </para>
+   </refsect2>
+   <refsect2>
+    <title><literal>PARTITION OF</> Clause</title>
+    <para>
+     The <literal>PARTITION OF</> clause is a
+     <productname>PostgreSQL</productname> extension.
+    </para>
+   </refsect2>
 +  <refsect2>
 +   <title><productname>Postgres-XL</> Specifics</title>
 +
 +   <para>
 +    Currently, immutable, stable, volatile functions and nextval are allowed in DEFAULT clause. 
 +    as <literal>DEFAULT</> values.
 +   </para>
 +   <para>
 +    <literal>PRIMARY KEY</> and foreign key must include the
 +    distribution column.
 +   </para>
 +   <para>
 +    <literal>TEMP</> tables and exclusion constraint are not supported
 +    yet.
 +   </para>
 +   <para>
 +   </para>
 +   <para>
 +    In <productname>Postgres-XL</>, OID is maintained locally in each
 +    Datanode and Coordinator.  The OID value may be inconsistent for rows
 +    stored in different Datanodes.
 +   </para>
 +
 +  </refsect2>
   </refsect1>
  
  
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index fd73840d51a0232a8e3da69d6f3774e4eb007f93,71e52c4c35583c089f0132cdb8d622c35cd937ef..a6f27ec54d4043a91a0415ef39d078ccc554c6cb
@@@ -73,8 -68,11 +69,12 @@@ PostgreSQL documentatio
         <arg choice="plain"><option>i[mmediate]</option></arg>
       </group>
     </arg>
+    <arg choice="opt"><option>-W</option></arg>
+    <arg choice="opt"><option>-t</option> <replaceable>seconds</replaceable></arg>
+    <arg choice="opt"><option>-s</option></arg>
     <arg choice="opt"><option>-o</option> <replaceable>options</replaceable></arg>
 +   <arg>-Z <replaceable>nodeopt</replaceable></arg>
+    <arg choice="opt"><option>-c</option></arg>
    </cmdsynopsis>
  
    <cmdsynopsis>
     utilities,
     also uses the environment variables supported by <application>libpq</>
     (see <xref linkend="libpq-envars">).
-    For additional server variables, see <xref linkend="app-postgres">.
+   </para>
+   <para>
+    For additional variables that affect the server,
+    see <xref linkend="app-postgres">.
    </para>
 +
 +  <para>
 +   In <productname>Postgres-XL</>, this command controls individual Coordinator or Datanode.
 +  </para>
   </refsect1>
  
  
index e72021764a349e73a3cc534360b5241f043f1d9b,defaf170dc67a9df1ada930dd41e5e4896bebe6c..a23fe18b725fcebf6691f12318bd2543a81f6f57
@@@ -283,10 -283,9 +283,15 @@@ PostgreSQL documentatio
    </para>
  
    <para>
-    In <productname>Postgres-XL</>, <command>pg_resetxlog</command>
+    <command>pg_resetwal</command> works only with servers of the same
+    major version.
+   </para>
++
++  <para>
++   In <productname>Postgres-XL</>, <command>pg_resetwal</command>
 +   will only run locally for Coordinators and Datanodes.  You should run it
 +   for each Coordinator or Datanode manually.
 +  </para>
   </refsect1>
  
   <refsect1>
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index eace76cace728b9b586c1e4ad7c789283149b94f,f1f4e91252ef852d6bbc81be69c25f30466488df..7e3e2c5c4430009a47a5a9df7b02565a1762d778
@@@ -73,8 -74,8 +74,9 @@@ For new features, add links to the docu
    The reason for splitting the release notes this way is so that appropriate
    subsets can easily be copied into back branches.
  -->
+ &release-10;
  &release-9.6;
 +&release-xl-9.5r1;
  &release-9.5;
  &release-9.4;
  &release-9.3;
Simple merge
index 714d29c4637bfa4d114dcfbc83ca9e6cbd1cdfb7,6d57525515e5acf3726bf41902c1adc8b2b8f62a..bafb099ccd4dd644122eb537892103e5eefc170a
@@@ -2203,25 -1652,27 +2255,34 @@@ $ <userinput>kill -INT `head -1 /usr/lo
     <productname>PostgreSQL</> release to a newer one.
    </para>
  
 +  <para>
 +   Because <productname>Postgres-XL</>'s Coordinators and Datanodes
 +   are essentially <productname>PostgreSQL</> servers, you can follw
 +   the steps described below to upgrade each of them.  Please note
 +   that you should do this manually.
 +  </para>
 +
    <para>
-    <productname>PostgreSQL</> major versions are represented by the
-    first two digit groups of the version number, e.g., 8.4.
-    <productname>PostgreSQL</> minor versions are represented by the
-    third group of version digits, e.g., 8.4.2 is the second minor
-    release of 8.4.  Minor releases never change the internal storage
-    format and are always compatible with earlier and later minor
-    releases of the same major version number, e.g., 8.4.2 is compatible
-    with 8.4, 8.4.1 and 8.4.6.  To update between compatible versions,
-    you simply replace the executables while the server is down and
-    restart the server.  The data directory remains unchanged &mdash;
-    minor upgrades are that simple.
+    Current <productname>PostgreSQL</productname> version numbers consist of a
+    major and a minor version number.  For example, in the version number 10.1,
+    the 10 is the major version number and the 1 is the minor version number,
+    meaning this would be the first minor release of the major release 10.  For
+    releases before <productname>PostgreSQL</productname> version 10.0, version
+    numbers consist of three numbers, for example, 9.5.3.  In those cases, the
+    major version consists of the first two digit groups of the version number,
+    e.g., 9.5, and the minor version is the third number, e.g., 3, meaning this
+    would be the third minor release of the major release 9.5.
+   </para>
+   <para>
+    Minor releases never change the internal storage format and are always
+    compatible with earlier and later minor releases of the same major version
+    number.  For example, version 10.1 is compatible with version 10.0 and
+    version 10.6.  Similarly, for example, 9.5.3 is compatible with 9.5.0,
+    9.5.1, and 9.5.6.  To update between compatible versions, you simply
+    replace the executables while the server is down and restart the server.
+    The data directory remains unchanged &mdash; minor upgrades are that
+    simple.
    </para>
  
    <para>
index 08cafe129b81b7e726faf5634805d27f38365bc9,f8f6bf2de1dabf1a9a64330aacee304cb3c5b6d6..59b996edc90a0fb211db789755a14dbc62ccc428
@@@ -64,10 -64,10 +64,10 @@@ git clone git://git.postgresql.org/git/
      <para>
       The Git mirror can also be reached via the HTTP protocol, if for example
       a firewall is blocking access to the Git protocol. Just change the URL
-      prefix to <literal>http</>, as in:
+      prefix to <literal>https</>, as in:
  
  <programlisting>
- git clone http://git.postgresql.org/git/postgres-xl.git
 -git clone https://git.postgresql.org/git/postgresql.git
++git clone https://git.postgresql.org/git/postgres-xl.git
  </programlisting>
  
       The HTTP protocol is less efficient than the Git protocol, so it will be
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc src/Makefile
index 5706bb13352b6348e984dc4a51441eead78318a6,380da92c75ef4aa6c4d08ebd15b438d3af845e4d..79cfeeb7102c6efd1d48ba35afdc158ecb96a9bd
@@@ -22,7 -20,9 +22,8 @@@ SUBDIRS = 
        backend/utils/mb/conversion_procs \
        backend/snowball \
        include \
 -      interfaces \
        backend/replication/libpqwalreceiver \
+       backend/replication/pgoutput \
        fe_utils \
        bin \
        pl \
Simple merge
Simple merge
index faec8d852310dc59837a9b0085d1f3ea45fadcb6,bce9d2c3ebb09a06f2e7e1a39a097ef37ce749d0..d9aec0e0a41f71caffcd6b7c83158094279acbd1
@@@ -17,13 -17,9 +17,13 @@@ subdir = src/backen
  top_builddir = ../..
  include $(top_builddir)/src/Makefile.global
  
 +ifneq ($(PORTNAME), win32)
 +override CFLAGS += $(PTHREAD_CFLAGS)
 +endif
 +
  SUBDIRS = access bootstrap catalog parser commands executor foreign lib libpq \
 -      main nodes optimizer port postmaster regex replication rewrite \
 -      statistics storage tcop tsearch utils $(top_builddir)/src/timezone
 +      pgxc main nodes optimizer port postmaster regex replication rewrite \
-       storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq
++      statistics storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq
  
  include $(srcdir)/common.mk
  
@@@ -39,24 -35,8 +39,25 @@@ LOCALOBJS += utils/probes.
  endif
  endif
  
 -OBJS = $(SUBDIROBJS) $(LOCALOBJS) $(top_builddir)/src/port/libpgport_srv.a \
 -       $(top_builddir)/src/common/libpgcommon_srv.a
 +OBJS = $(SUBDIROBJS) $(LOCALOBJS) \
 +      $(top_builddir)/src/port/libpgport_srv.a \
 +       $(top_builddir)/src/common/libpgcommon_srv.a \
 +      $(top_builddir)/src/interfaces/libpq/fe-connect.o \
 +      $(top_builddir)/src/interfaces/libpq/fe-secure.o \
 +      $(top_builddir)/src/interfaces/libpq/fe-misc.o \
 +      $(top_builddir)/src/interfaces/libpq/fe-protocol3.o \
 +      $(top_builddir)/src/interfaces/libpq/fe-protocol2.o \
 +      $(top_builddir)/src/interfaces/libpq/fe-exec.o \
 +      $(top_builddir)/src/interfaces/libpq/fe-auth.o \
 +      $(top_builddir)/src/interfaces/libpq/pqexpbuffer.o \
++      $(top_builddir)/src/interfaces/libpq/fe-auth-scram.o \
 +      $(top_builddir)/src/gtm/client/libgtmclient.a \
 +      $(top_builddir)/src/gtm/common/libgtm.a \
 +      $(top_builddir)/src/gtm/libpq/libpqcomm.a
 +
 +ifeq ($(with_openssl), yes)
 +OBJS += $(top_builddir)/src/interfaces/libpq/fe-secure-openssl.o
 +endif
  
  # We put libpgport and libpgcommon into OBJS, so remove it from LIBS; also add
  # libldap
@@@ -78,7 -58,7 +79,7 @@@ ifneq ($(PORTNAME), win32
  ifneq ($(PORTNAME), aix)
  
  postgres: $(OBJS)
-       $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -L$(top_builddir)/src/gtm/libpg $(export_dynamic) $(call expand_subsys,$^) $(LIBS) -o $@
 -      $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) $(export_dynamic) $(call expand_subsys,$^) $(LIBS) $(ICU_LIBS) -o $@
++      $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -L$(top_builddir)/src/gtm/libpg $(export_dynamic) $(call expand_subsys,$^) $(LIBS) $(ICU_LIBS) -o $@
  
  endif
  endif
index 15a18a51cc4924f4750109890084230e1089fc0e,c0086ded62d95208ea943681ed5380dbadf1fec6..970e3aa6c95798e7396e1bf470fb5c00aace60c4
@@@ -45,8 -45,7 +45,8 @@@
   * and we'd like to still refer to them via C struct offsets.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
index a44be6f96f1616f7837043aa2b04c39275edf604,a2ca2d74aefb36f5f71b80cadec2ca989f30d072..78704dafd910eee09fd715ea811cec5b0a9262a8
@@@ -5,8 -5,7 +5,8 @@@
   *      clients and standalone backends are supported here).
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
index bb9adad82ef55b60dc03bf61060d7d687ebb0b8e,289d766419aa080a5bfdb5c059bf3bac206f75ea..4089fd6d8ad10a2a227689a32e82d78256a94b61
  #include "postgres.h"
  
  #include "access/hash.h"
+ #include "utils/builtins.h"
  
 +#ifdef PGXC
 +#include "catalog/pg_type.h"
 +#include "utils/builtins.h"
 +#include "utils/timestamp.h"
 +#include "utils/date.h"
 +#include "utils/nabstime.h"
 +#endif
 +
+ /*
+  * Datatype-specific hash functions.
+  *
+  * These support both hash indexes and hash joins.
+  *
+  * NOTE: some of these are also used by catcache operations, without
+  * any direct connection to hash indexes.  Also, the common hash_any
+  * routine is also used by dynahash tables.
+  */
  /* Note: this is used for both "char" and boolean datatypes */
  Datum
  hashchar(PG_FUNCTION_ARGS)
@@@ -531,190 -523,3 +531,186 @@@ hash_uint32(uint32 k
        /* report the result */
        return UInt32GetDatum(c);
  }
-               case INT2VECTOROID:
-                       return DirectFunctionCall1(hashint2vector, value);
 +
 +#ifdef PGXC
 +/*
 + * compute_hash()
 + * Generic hash function for all datatypes
 + */
 +Datum
 +compute_hash(Oid type, Datum value, char locator)
 +{
 +      int16   tmp16;
 +      int32   tmp32;
 +      int64   tmp64;
 +      Oid             tmpoid;
 +      char    tmpch;
 +
 +      switch (type)
 +      {
 +              case INT8OID:
 +                      /* This gives added advantage that
 +                       *      a = 8446744073709551359
 +                       * and  a = 8446744073709551359::int8 both work*/
 +                      tmp64 = DatumGetInt64(value);
 +                      if (locator == LOCATOR_TYPE_HASH)
 +                              return DirectFunctionCall1(hashint8, value);
 +                      return tmp64;
 +              case INT2OID:
 +                      tmp16 = DatumGetInt16(value);
 +                      if (locator == LOCATOR_TYPE_HASH)
 +                              return DirectFunctionCall1(hashint2, tmp16);
 +                      return tmp16;
 +              case OIDOID:
 +                      tmpoid = DatumGetObjectId(value);
 +                      if (locator == LOCATOR_TYPE_HASH)
 +                              return DirectFunctionCall1(hashoid, tmpoid);
 +                      return tmpoid;
 +              case INT4OID:
 +                      tmp32 = DatumGetInt32(value);
 +                      if (locator == LOCATOR_TYPE_HASH)
 +                              return DirectFunctionCall1(hashint4, tmp32);
 +                      return tmp32;
 +              case BOOLOID:
 +                      tmpch = DatumGetBool(value);
 +                      if (locator == LOCATOR_TYPE_HASH)
 +                              return DirectFunctionCall1(hashchar, tmpch);
 +                      return tmpch;
 +
 +              case CHAROID:
 +                      return DirectFunctionCall1(hashchar, value);
 +              case NAMEOID:
 +                      return DirectFunctionCall1(hashname, value);
-               case INT2VECTOROID:
-                       return "hashint2vector";
 +
 +              case VARCHAROID:
 +              case TEXTOID:
 +                      return DirectFunctionCall1(hashtext, value);
 +
 +              case OIDVECTOROID:
 +                      return DirectFunctionCall1(hashoidvector, value);
 +              case FLOAT4OID:
 +                      return DirectFunctionCall1(hashfloat4, value);
 +              case FLOAT8OID:
 +                      return DirectFunctionCall1(hashfloat8, value);
 +
 +              case ABSTIMEOID:
 +                      tmp32 = DatumGetAbsoluteTime(value);
 +                      if (locator == LOCATOR_TYPE_HASH)
 +                              return DirectFunctionCall1(hashint4, tmp32);
 +                      return tmp32;
 +              case RELTIMEOID:
 +                      tmp32 = DatumGetRelativeTime(value);
 +                      if (locator == LOCATOR_TYPE_HASH)
 +                              return DirectFunctionCall1(hashint4, tmp32);
 +                      return tmp32;
 +              case CASHOID:
 +                      return DirectFunctionCall1(hashint8, value);
 +
 +              case BPCHAROID:
 +                      return DirectFunctionCall1(hashbpchar, value);
 +              case BYTEAOID:
 +                      return DirectFunctionCall1(hashvarlena, value);
 +
 +              case DATEOID:
 +                      tmp32 = DatumGetDateADT(value);
 +                      if (locator == LOCATOR_TYPE_HASH)
 +                              return DirectFunctionCall1(hashint4, tmp32);
 +                      return tmp32;
 +              case TIMEOID:
 +                      return DirectFunctionCall1(time_hash, value);
 +              case TIMESTAMPOID:
 +                      return DirectFunctionCall1(timestamp_hash, value);
 +              case TIMESTAMPTZOID:
 +                      return DirectFunctionCall1(timestamp_hash, value);
 +              case INTERVALOID:
 +                      return DirectFunctionCall1(interval_hash, value);
 +              case TIMETZOID:
 +                      return DirectFunctionCall1(timetz_hash, value);
 +
 +              case NUMERICOID:
 +                      return DirectFunctionCall1(hash_numeric, value);
 +              default:
 +                      ereport(ERROR,(errmsg("Unhandled datatype for modulo or hash distribution\n")));
 +      }
 +      /* Control should not come here. */
 +      ereport(ERROR,(errmsg("Unhandled datatype for modulo or hash distribution\n")));
 +      /* Keep compiler silent */
 +      return (Datum)0;
 +}
 +
 +
 +/*
 + * get_compute_hash_function
 + * Get hash function name depending on the hash type.
 + * For some cases of hash or modulo distribution, a function might
 + * be required or not.
 + */
 +char *
 +get_compute_hash_function(Oid type, char locator)
 +{
 +      switch (type)
 +      {
 +              case INT8OID:
 +                      if (locator == LOCATOR_TYPE_HASH)
 +                              return "hashint8";
 +                      return NULL;
 +              case INT2OID:
 +                      if (locator == LOCATOR_TYPE_HASH)
 +                              return "hashint2";
 +                      return NULL;
 +              case OIDOID:
 +                      if (locator == LOCATOR_TYPE_HASH)
 +                              return "hashoid";
 +                      return NULL;
 +              case DATEOID:
 +              case INT4OID:
 +                      if (locator == LOCATOR_TYPE_HASH)
 +                              return "hashint4";
 +                      return NULL;
 +              case BOOLOID:
 +                      if (locator == LOCATOR_TYPE_HASH)
 +                              return "hashchar";
 +                      return NULL;
 +              case CHAROID:
 +                      return "hashchar";
 +              case NAMEOID:
 +                      return "hashname";
 +              case VARCHAROID:
 +              case TEXTOID:
 +                      return "hashtext";
 +              case OIDVECTOROID:
 +                      return "hashoidvector";
 +              case FLOAT4OID:
 +                      return "hashfloat4";
 +              case FLOAT8OID:
 +                      return "hashfloat8";
 +              case RELTIMEOID:
 +              case ABSTIMEOID:
 +                      if (locator == LOCATOR_TYPE_HASH)
 +                              return "hashint4";
 +                      return NULL;
 +              case CASHOID:
 +                              return "hashint8";
 +              case BPCHAROID:
 +                      return "hashbpchar";
 +              case BYTEAOID:
 +                      return "hashvarlena";
 +              case TIMEOID:
 +                      return "time_hash";
 +              case TIMESTAMPOID:
 +              case TIMESTAMPTZOID:
 +                      return "timestamp_hash";
 +              case INTERVALOID:
 +                      return "interval_hash";
 +              case TIMETZOID:
 +                      return "timetz_hash";
 +              case NUMERICOID:
 +                      return "hash_numeric";
 +              default:
 +                      ereport(ERROR,(errmsg("Unhandled datatype for modulo or hash distribution\n")));
 +      }
 +
 +      /* Keep compiler quiet */
 +      return NULL;
 +}
 +#endif
Simple merge
Simple merge
Simple merge
index 0fcccfc3a700229c650584a0d3d1757e089a64d4,bece57589e80ebceaacca0db0f8775a9ed3ba8f6..9c6964d79c9aa3e8b29dd8fd8a1b3f51ed899097
   * for aborts (whether sync or async), since the post-crash assumption would
   * be that such transactions failed anyway.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/backend/access/transam/clog.c
   *
index c36a92cdb93effdb3f24856c6af375b6b4a7c5ce,2dad3e8a655e1e15cb1d6826d419e369a7d7029e..d3585c8449d1d464f8fa5751b65a929d5d0e7fa3
@@@ -24,7 -25,7 +25,8 @@@
  #include "libpq/pqmq.h"
  #include "miscadmin.h"
  #include "optimizer/planmain.h"
+ #include "pgstat.h"
 +#include "pgxc/pgxcnode.h"
  #include "storage/ipc.h"
  #include "storage/sinval.h"
  #include "storage/spin.h"
index 5624eeca84b50c7091aa7cd59771dba7dcec6a05,de4e38f9fe316fdedcf60050349382c8951f5b94..acb81afd66cc891b633d8789be173e1060cc3a84
  # must set a recovery target.
  #
  # You may set a recovery target either by transactionId, by name,
- # or by timestamp or by barrier. Recovery may either include or exclude the
- # transaction(s) with the recovery target value (ie, stop either
- # just after or just before the given target, respectively). In case of
- # barrier, the recovery stops exactly at that point.
 -# by timestamp or by WAL location (LSN). Recovery may either include or
 -# exclude the transaction(s) with the recovery target value (ie, stop either
 -# just after or just before the given target, respectively).
++# or by timestamp or by WAL location (LSN) or by barrier. Recovery may either
++# include or exclude the transaction(s) with the recovery target value (ie,
++# stop either just after or just before the given target, respectively). In
++# case of barrier, the recovery stops exactly at that point.
  #
  #
  #recovery_target_name = ''    # e.g. 'daily backup 2011-01-26'
  #
  #recovery_target_xid = ''
  #
 +#recovery_target_barrier = ''
 +#
+ #recovery_target_lsn = ''     # e.g. '0/70006B8'
+ #
  #recovery_target_inclusive = true
  #
  #
Simple merge
Simple merge
index 76069546cbccfc50bd3e0686858c58a575cd60e0,cef03f83e03f3be28d02945c152971b1527ab4b1..a0390bf25b827cf98a6646678ec21b4cddb00b47
@@@ -19,9 -18,8 +18,9 @@@
   * data across crashes.  During database startup, we simply force the
   * currently-active page of SUBTRANS to zeroes.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/backend/access/transam/subtrans.c
   *
Simple merge
index b65227922bdbce2c7d15f64f0fe08a0a0fe0cd38,c50f9c4bf6537d882cfe8ea877770eaa0200cde2..f6986d37db950a1a6ae8df236d9c942aa8ce4ff9
@@@ -3,9 -3,8 +3,9 @@@
   * twophase.c
   *            Two-phase commit support functions.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * IDENTIFICATION
   *            src/backend/access/transam/twophase.c
index a4e67d9fc393afe8fbd1c0bc9d4c2471111daafd,b02dd6fbd25a8dfdd30b2c6578ec24617ef7958b..d94a1deeb15ea7d9ff28454043422d449c17d0ef
@@@ -3,9 -3,7 +3,9 @@@
   * varsup.c
   *      postgres OID & XID variables support routines
   *
-  * Copyright (c) 2000-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+  * Copyright (c) 2000-2017, PostgreSQL Global Development Group
   *
   * IDENTIFICATION
   *      src/backend/access/transam/varsup.c
index 049aabc20996583bd4e6b7fc57ea6b99146ab3dd,7e8c598f2adc191a34f2bb5424a3a480cc342888..77666c4b80113a2fc1401e3be700416988d9d258
@@@ -5,10 -5,8 +5,10 @@@
   *
   * See src/backend/access/transam/README for more information.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *
   * IDENTIFICATION
@@@ -421,20 -329,10 +424,20 @@@ static void AtSubCommit_Memory(void)
  static void AtSubStart_Memory(void);
  static void AtSubStart_ResourceOwner(void);
  
 +#ifdef XCP
 +static void AtSubCommit_WaitedXids(void);
 +static void AtSubAbort_WaitedXids(void);
 +static void AtEOXact_WaitedXids(void);
 +static void TransactionRecordXidWait_Internal(TransactionState s,
 +              TransactionId xid);
 +#endif
 +
  static void ShowTransactionState(const char *str);
- static void ShowTransactionStateRec(TransactionState state);
+ static void ShowTransactionStateRec(const char *str, TransactionState state);
  static const char *BlockStateAsString(TBlockState blockState);
  static const char *TransStateAsString(TransState state);
 +static void PrepareTransaction(void);
 +static void AtEOXact_GlobalTxn(bool commit);
  
  
  /* ----------------------------------------------------------------
@@@ -2142,24 -1843,11 +2142,24 @@@ StartTransaction(void
        {
                s->startedInRecovery = false;
                XactReadOnly = DefaultXactReadOnly;
 +#ifdef PGXC
 +              /* Save Postgres-XC session as read-only if necessary */
 +              XactReadOnly |= IsPGXCNodeXactReadOnly();
 +#endif
        }
        XactDeferrable = DefaultXactDeferrable;
 +#ifdef PGXC
 +      /* PGXCTODO - PGXC doesn't support 9.1 serializable transactions. They are
 +       * silently turned into repeatable-reads which is same as pre 9.1
 +       * serializable isolation level
 +       */
 +      if (DefaultXactIsoLevel == XACT_SERIALIZABLE)
 +              DefaultXactIsoLevel = XACT_REPEATABLE_READ;
 +#endif
        XactIsoLevel = DefaultXactIsoLevel;
        forceSyncCommit = false;
-       MyXactAccessedTempRel = false;
 +      XactLocalNodePrepared = false;
+       MyXactFlags = 0;
  
        /*
         * reinitialize within-transaction counters
@@@ -2498,14 -2037,10 +2498,14 @@@ CommitTransaction(void
        if (!is_parallel_worker)
        {
                /*
-                * We need to mark our XIDs as committed in pg_clog.  This is where we
+                * We need to mark our XIDs as committed in pg_xact.  This is where we
                 * durably commit.
                 */
 -              latestXid = RecordTransactionCommit();
 +#ifdef XCP
 +              latestXid = InvalidTransactionId;
 +              if (!IsConnFromDatanode())
 +#endif
 +                      latestXid = RecordTransactionCommit();
        }
        else
        {
index 19b4921075b91e063658c656c473a5d35aca77e8,399822d3fead60e0302169ac007ff8bc042a8fd6..b29f283e6a318babd2e364123e9c7cc87ecd7db9
  #include "catalog/pg_database.h"
  #include "commands/tablespace.h"
  #include "miscadmin.h"
 +#ifdef PGXC
 +#include "pgxc/barrier.h"
 +#endif
  #include "pgstat.h"
+ #include "port/atomics.h"
  #include "postmaster/bgwriter.h"
  #include "postmaster/walwriter.h"
  #include "postmaster/startup.h"
@@@ -256,8 -262,8 +265,9 @@@ static bool recoveryTargetInclusive = t
  static RecoveryTargetAction recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
  static TransactionId recoveryTargetXid;
  static TimestampTz recoveryTargetTime;
 +static char *recoveryTargetBarrierId;
  static char *recoveryTargetName;
+ static XLogRecPtr recoveryTargetLSN;
  static int    recovery_min_apply_delay = 0;
  static TimestampTz recoveryDelayUntilTime;
  
@@@ -5404,17 -5626,31 +5642,35 @@@ recoveryStopsBefore(XLogReaderState *re
  
                recoveryStopAfter = false;
                recoveryStopXid = InvalidTransactionId;
+               recoveryStopLSN = InvalidXLogRecPtr;
+               recoveryStopTime = 0;
+               recoveryStopName[0] = '\0';
+               return true;
+       }
+       /* Check if target LSN has been reached */
+       if (recoveryTarget == RECOVERY_TARGET_LSN &&
+               !recoveryTargetInclusive &&
+               record->ReadRecPtr >= recoveryTargetLSN)
+       {
+               recoveryStopAfter = false;
+               recoveryStopXid = InvalidTransactionId;
+               recoveryStopLSN = record->ReadRecPtr;
                recoveryStopTime = 0;
                recoveryStopName[0] = '\0';
+               ereport(LOG,
+                        (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
+                                        (uint32) (recoveryStopLSN >> 32),
+                                        (uint32) recoveryStopLSN)));
                return true;
        }
 -
 +#ifdef PGXC
 +      /* Otherwise we only consider stopping before COMMIT, ABORT or BARRIER records. */
 +      if ((XLogRecGetRmid(record) != RM_XACT_ID) && (XLogRecGetRmid(record) != RM_BARRIER_ID))
 +#else         
        /* Otherwise we only consider stopping before COMMIT or ABORT records. */
        if (XLogRecGetRmid(record) != RM_XACT_ID)
 +#endif                
                return false;
  
        xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
index 1bdbea655bb646e452aa1fe9b66cc8b58ef4a75d,7430a1f77b456f58f8319e459d8580764aa901a2..4f67dc62fb73a29b3ee97e96d09aa6982b618f30
@@@ -19,7 -19,7 +19,8 @@@
  
  #include <unistd.h>
  
 +#include "miscadmin.h"
+ #include "access/timeline.h"
  #include "access/xlog.h"
  #include "access/xlog_internal.h"
  #include "access/xlogutils.h"
index 86732f73d87138b7cf450662fa302454eab4fb36,4c28b2b821a767ef7a262312b6102bc844244f30..c2274ae2ff4dea62c371a57dcd7bcf69f87d3544
@@@ -4,10 -4,8 +4,10 @@@
   *      routines to support running postgres in 'bootstrap' mode
   *    bootstrap mode is used to create the initial template database
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * IDENTIFICATION
   *      src/backend/bootstrap/bootstrap.c
index 240c44d0f0d8969728a8d644e6e131adb1ccbbc6,fd33426bad15164500bb8189f65f808758107f27..52bc63c78869cadda780a9884dcfedffa8764571
@@@ -11,11 -11,12 +11,12 @@@ top_builddir = ../../.
  include $(top_builddir)/src/Makefile.global
  
  OBJS = catalog.o dependency.o heap.o index.o indexing.o namespace.o aclchk.o \
-        objectaccess.o objectaddress.o pg_aggregate.o pg_collation.o \
+        objectaccess.o objectaddress.o partition.o pg_aggregate.o pg_collation.o \
         pg_constraint.o pg_conversion.o \
         pg_depend.o pg_enum.o pg_inherits.o pg_largeobject.o pg_namespace.o \
-        pg_operator.o pg_proc.o pg_range.o pg_db_role_setting.o pg_shdepend.o \
-        pg_type.o pgxc_class.o storage.o toasting.o
+        pg_operator.o pg_proc.o pg_publication.o pg_range.o \
+          pg_db_role_setting.o pg_shdepend.o pg_subscription.o pg_type.o \
 -         storage.o toasting.o
++         pgxc_class.o storage.o toasting.o
  
  BKIFILES = postgres.bki postgres.description postgres.shdescription
  
@@@ -39,10 -41,11 +41,12 @@@ POSTGRES_BKI_SRCS = $(addprefix $(top_s
        pg_ts_config.h pg_ts_config_map.h pg_ts_dict.h \
        pg_ts_parser.h pg_ts_template.h pg_extension.h \
        pg_foreign_data_wrapper.h pg_foreign_server.h pg_user_mapping.h \
 +      pgxc_class.h pgxc_node.h pgxc_group.h \
        pg_foreign_table.h pg_policy.h pg_replication_origin.h \
        pg_default_acl.h pg_init_privs.h pg_seclabel.h pg_shseclabel.h \
-       pg_collation.h pg_range.h pg_transform.h \
+       pg_collation.h pg_partitioned_table.h pg_range.h pg_transform.h \
+       pg_sequence.h pg_publication.h pg_publication_rel.h pg_subscription.h \
+       pg_subscription_rel.h toasting.h indexing.h \
        toasting.h indexing.h \
      )
  
index 9bb937aa4c2b09faa39ebc5c982da1eede70412d,11ee536726ce08bf22b87bc8a087e2f73869e821..2e8cd10ebb24dbe7cbb617ab07683cdc12745d88
@@@ -5,8 -5,7 +5,8 @@@
   *            bits of hard-wired knowledge
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
  #include "catalog/pg_shdepend.h"
  #include "catalog/pg_shdescription.h"
  #include "catalog/pg_shseclabel.h"
+ #include "catalog/pg_subscription.h"
  #include "catalog/pg_tablespace.h"
  #include "catalog/toasting.h"
 +#include "catalog/pgxc_node.h"
 +#include "catalog/pgxc_group.h"
  #include "miscadmin.h"
  #include "storage/fd.h"
  #include "utils/fmgroids.h"
@@@ -232,12 -227,9 +233,13 @@@ IsSharedRelation(Oid relationId
                relationId == SharedDependRelationId ||
                relationId == SharedSecLabelRelationId ||
                relationId == TableSpaceRelationId ||
 +#ifdef PGXC
 +              relationId == PgxcGroupRelationId ||
 +              relationId == PgxcNodeRelationId ||
 +#endif
                relationId == DbRoleSettingRelationId ||
-               relationId == ReplicationOriginRelationId)
+               relationId == ReplicationOriginRelationId ||
+               relationId == SubscriptionRelationId)
                return true;
        /* These are their indexes (see indexing.h) */
        if (relationId == AuthIdRolnameIndexId ||
                relationId == SharedSecLabelObjectIndexId ||
                relationId == TablespaceOidIndexId ||
                relationId == TablespaceNameIndexId ||
 +#ifdef PGXC
 +              relationId == PgxcNodeNodeNameIndexId ||
 +              relationId == PgxcNodeNodeIdIndexId ||
 +              relationId == PgxcNodeOidIndexId ||
 +              relationId == PgxcGroupGroupNameIndexId ||
 +              relationId == PgxcGroupOidIndexId ||
 +#endif
                relationId == DbRoleSettingDatidRolidIndexId ||
                relationId == ReplicationOriginIdentIndex ||
-               relationId == ReplicationOriginNameIndex)
+               relationId == ReplicationOriginNameIndex ||
+               relationId == SubscriptionObjectIndexId ||
+               relationId == SubscriptionNameIndexId)
                return true;
        /* These are their toast tables and toast indexes (see toasting.h) */
        if (relationId == PgShdescriptionToastTable ||
index 467d9ead0efed79b914d34ead0982738facbec7d,cd82cb9f29a8e99e8d57620e154e686d6134579a..f8e560a8d4449be5771d22b06f18616825f89d58
@@@ -4,10 -4,8 +4,10 @@@
   *      Routines to support inter-object dependencies.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * IDENTIFICATION
   *      src/backend/catalog/dependency.c
@@@ -174,17 -168,17 +181,21 @@@ static const Oid object_classes[] = 
        UserMappingRelationId,          /* OCLASS_USER_MAPPING */
        DefaultAclRelationId,           /* OCLASS_DEFACL */
        ExtensionRelationId,            /* OCLASS_EXTENSION */
 +#ifdef PGXC
 +      PgxcClassRelationId,            /* OCLASS_PGXCCLASS */
 +      PgxcNodeRelationId,                     /* OCLASS_PGXC_NODE */
 +      PgxcGroupRelationId,            /* OCLASS_PGXC_GROUP */
 +#endif
        EventTriggerRelationId,         /* OCLASS_EVENT_TRIGGER */
        PolicyRelationId,                       /* OCLASS_POLICY */
+       PublicationRelationId,          /* OCLASS_PUBLICATION */
+       PublicationRelRelationId,       /* OCLASS_PUBLICATION_REL */
+       SubscriptionRelationId,         /* OCLASS_SUBSCRIPTION */
        TransformRelationId                     /* OCLASS_TRANSFORM */
  };
  
 -
  static void findDependentObjects(const ObjectAddress *object,
+                                        int objflags,
                                         int flags,
                                         ObjectAddressStack *stack,
                                         ObjectAddresses *targetObjects,
@@@ -407,174 -425,6 +442,94 @@@ performMultipleDeletions(const ObjectAd
        heap_close(depRel, RowExclusiveLock);
  }
  
- /*
-  * deleteWhatDependsOn: attempt to drop everything that depends on the
-  * specified object, though not the object itself.  Behavior is always
-  * CASCADE.
-  *
-  * This is currently used only to clean out the contents of a schema
-  * (namespace): the passed object is a namespace.  We normally want this
-  * to be done silently, so there's an option to suppress NOTICE messages.
-  *
-  * Note we don't fire object drop event triggers here; it would be wrong to do
-  * so for the current only use of this function, but if more callers are added
-  * this might need to be reconsidered.
-  */
- void
- deleteWhatDependsOn(const ObjectAddress *object,
-                                       bool showNotices)
- {
-       Relation        depRel;
-       ObjectAddresses *targetObjects;
-       int                     i;
-       /*
-        * We save some cycles by opening pg_depend just once and passing the
-        * Relation pointer down to all the recursive deletion steps.
-        */
-       depRel = heap_open(DependRelationId, RowExclusiveLock);
-       /*
-        * Acquire deletion lock on the target object.  (Ideally the caller has
-        * done this already, but many places are sloppy about it.)
-        */
-       AcquireDeletionLock(object, 0);
-       /*
-        * Construct a list of objects to delete (ie, the given object plus
-        * everything directly or indirectly dependent on it).
-        */
-       targetObjects = new_object_addresses();
-       findDependentObjects(object,
-                                                DEPFLAG_ORIGINAL,
-                                                NULL,  /* empty stack */
-                                                targetObjects,
-                                                NULL,  /* no pendingObjects */
-                                                &depRel);
-       /*
-        * Check if deletion is allowed, and report about cascaded deletes.
-        */
-       reportDependentObjects(targetObjects,
-                                                  DROP_CASCADE,
-                                                  showNotices ? NOTICE : DEBUG2,
-                                                  object);
-       /*
-        * Delete all the objects in the proper order, except we skip the original
-        * object.
-        */
-       for (i = 0; i < targetObjects->numrefs; i++)
-       {
-               ObjectAddress *thisobj = targetObjects->refs + i;
-               ObjectAddressExtra *thisextra = targetObjects->extras + i;
-               if (thisextra->flags & DEPFLAG_ORIGINAL)
-                       continue;
-               /*
-                * Since this function is currently only used to clean out temporary
-                * schemas, we pass PERFORM_DELETION_INTERNAL here, indicating that
-                * the operation is an automatic system operation rather than a user
-                * action.  If, in the future, this function is used for other
-                * purposes, we might need to revisit this.
-                */
-               deleteOneObject(thisobj, &depRel, PERFORM_DELETION_INTERNAL);
-       }
-       /* And clean up */
-       free_object_addresses(targetObjects);
-       heap_close(depRel, RowExclusiveLock);
- }
 +#ifdef PGXC
 +/*
 + * Check type and class of the given object and rename it properly on GTM
 + */
 +static void
 +doRename(const ObjectAddress *object, const char *oldname, const char *newname)
 +{
 +      switch (getObjectClass(object))
 +      {
 +              case OCLASS_CLASS:
 +              {
 +                      char        relKind = get_rel_relkind(object->objectId);
 +
 +                      /*
 +                       * If we are here, a schema is being renamed, a sequence depends on it.
 +                       * as sequences' global name use the schema name, this sequence
 +                       * has also to be renamed on GTM.
 +                       * An operation with GTM can just be done from a remote Coordinator.
 +                       */
 +                      if (relKind == RELKIND_SEQUENCE &&
 +                              IS_PGXC_LOCAL_COORDINATOR)
 +                      {
 +                              Relation relseq = relation_open(object->objectId, AccessShareLock);
 +                              char *seqname = GetGlobalSeqName(relseq, NULL, oldname);
 +                              char *newseqname = GetGlobalSeqName(relseq, NULL, newname);
 +
 +                              /* We also need to rename this sequence on GTM, it has a global name ! */
 +                              if (RenameSequenceGTM(seqname, newseqname) < 0)
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_CONNECTION_FAILURE),
 +                                                       errmsg("GTM error, could not rename sequence")));
 +
 +
 +                              pfree(seqname);
 +                              pfree(newseqname);
 +
 +                              relation_close(relseq, AccessShareLock);
 +                      }
 +              }
 +              default:
 +                      /* Nothing to do, this object has not to be renamed, end of the story... */
 +                      break;
 +      }
 +}
 +
 +/*
 + * performRename: used to rename objects
 + * on GTM depending on another object(s)
 + */
 +void
 +performRename(const ObjectAddress *object, const char *oldname, const char *newname)
 +{
 +      Relation    depRel;
 +      ObjectAddresses *targetObjects;
 +      int i;
 +
 +      /*
 +       * Check the dependencies on this object
 +       * And rename object dependent if necessary
 +       */
 +
 +      depRel = heap_open(DependRelationId, RowExclusiveLock);
 +
 +      targetObjects = new_object_addresses();
 +
 +      findDependentObjects(object,
 +                                               DEPFLAG_ORIGINAL,
++                                               0, /* XXX seems like flags are only used while
++                                                         dropping objects */
 +                                               NULL,      /* empty stack */
 +                                               targetObjects,
 +                                               NULL,
 +                                               &depRel);
 +
 +      /* Check Objects one by one to see if some of them have to be renamed on GTM */
 +      for (i = 0; i < targetObjects->numrefs; i++)
 +      {
 +              ObjectAddress *thisobj = targetObjects->refs + i;
 +              doRename(thisobj, oldname, newname);
 +      }
 +
 +      /* And clean up */
 +      free_object_addresses(targetObjects);
 +
 +      heap_close(depRel, RowExclusiveLock);
 +}
 +#endif
 +
  /*
   * findDependentObjects - find all objects that depend on 'object'
   *
@@@ -1259,54 -1125,12 +1230,60 @@@ doDeletion(const ObjectAddress *object
                                                heap_drop_with_catalog(object->objectId);
                                }
  
+                               /*
+                                * for a sequence, in addition to dropping the heap, also
+                                * delete pg_sequence tuple
+                                */
+                               if (relKind == RELKIND_SEQUENCE)
+                                       DeleteSequenceTuple(object->objectId);
 +#ifdef PGXC
 +                              /*
 +                               * Do not do extra process if this session is connected to a remote
 +                               * Coordinator.
 +                               */
 +                              if (IsConnFromCoord())
 +                                      break;
 +
 +                              /*
 +                               * This session is connected directly to application, so extra
 +                               * process related to remote nodes and GTM is needed.
 +                               */
 +                              switch (relKind)
 +                              {
 +                                      case RELKIND_SEQUENCE:
 +                                              /*
 +                                               * Drop the sequence on GTM.
 +                                               * Sequence is dropped on GTM by a remote Coordinator only
 +                                               * for a non temporary sequence.
 +                                               */
 +                                              {
 +                                                      /*
 +                                                       * The sequence has already been removed from Coordinator,
 +                                                       * finish the stuff on GTM too
 +                                                       */
 +
 +                                                      Relation relseq;
 +                                                      char *seqname;
 +                                                      /*
 +                                                       * A relation is opened to get the schema and database name as
 +                                                       * such data is not available before when dropping a function.
 +                                                       */
 +                                                      relseq = relation_open(object->objectId, AccessShareLock);
 +                                                      seqname = GetGlobalSeqName(relseq, NULL, NULL);
 +                                                      DropSequenceGTM(seqname, GTM_SEQ_FULL_NAME);
 +                                                      pfree(seqname);
 +
 +                                                      /* Then close the relation opened previously */
 +                                                      relation_close(relseq, AccessShareLock);
 +                                              }
 +                                              break;
 +                                      case RELKIND_RELATION:
 +                                      case RELKIND_VIEW:
 +                                              break;
 +                                      default:
 +                                              break;
 +                              }
 +#endif /* PGXC */
                                break;
                        }
  
                        DropTransformById(object->objectId);
                        break;
  
-               default:
-                       elog(ERROR, "unrecognized object class: %u",
-                                object->classId);
+                       /*
+                        * These global object types are not supported here.
+                        */
+               case OCLASS_ROLE:
+               case OCLASS_DATABASE:
+               case OCLASS_TBLSPACE:
+               case OCLASS_SUBSCRIPTION:
++              case OCLASS_PGXC_NODE:
++              case OCLASS_PGXC_GROUP:
+                       elog(ERROR, "global objects cannot be deleted by doDeletion");
+                       break;
+                       /*
+                        * There's intentionally no default: case here; we want the
+                        * compiler to warn if a new OCLASS hasn't been handled above.
+                        */
++
        }
  }
  
Simple merge
index a1df27d43ff465dc57d935b9f3536884f192b5d9,0ce94f346f56d856937ba629ba70d215d7b24680..ea3d2ade219d2ab63dccd661408500c94e1ba0eb
@@@ -3,9 -3,8 +3,9 @@@
   * heap.c
   *      code to create and destroy POSTGRES heap relations
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *
   * IDENTIFICATION
@@@ -190,27 -186,10 +196,27 @@@ static FormData_pg_attribute a6 = 
  static FormData_pg_attribute a7 = {
        0, {"tableoid"}, OIDOID, 0, sizeof(Oid),
        TableOidAttributeNumber, 0, -1, -1,
-       true, 'p', 'i', true, false, false, true, 0
+       true, 'p', 'i', true, false, '\0', false, true, 0
  };
  
 +#ifdef PGXC
 +/*
 + * In XC we need some sort of node identification for each tuple
 + * We are adding another system column that would serve as node identifier.
 + * This is not only required by WHERE CURRENT OF but it can be used any
 + * where we want to know the originating Datanode of a tuple received
 + * at the Coordinator
 + */
 +static FormData_pg_attribute a8 = {
 +      0, {"xc_node_id"}, INT4OID, 0, sizeof(int32),
 +      XC_NodeIdAttributeNumber, 0, -1, -1,
 +      true, 'p', 'i', true, false, false, true, 0
 +};
 +
 +static const Form_pg_attribute SysAtt[] = {&a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8};
 +#else
  static const Form_pg_attribute SysAtt[] = {&a1, &a2, &a3, &a4, &a5, &a6, &a7};
 +#endif
  
  /*
   * This function returns a Form_pg_attribute pointer for a system attribute.
Simple merge
index 5caaef144f388f92ff94653569979c6336cbc6d7,2a33eb73fa9b276f67bf21d74183f85b930dadf0..d7f6075b13cf2cdb0de1c9c2702257a9b5099645
@@@ -9,8 -9,7 +9,8 @@@
   * and implementing search-path-controlled searches.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
@@@ -205,26 -199,7 +207,10 @@@ static void RemoveTempRelationsCallback
  static void NamespaceCallback(Datum arg, int cacheid, uint32 hashvalue);
  static bool MatchNamedCall(HeapTuple proctup, int nargs, List *argnames,
                           int **argnumbers);
 +#ifdef XCP
 +static void FindTemporaryNamespace(void);
 +#endif
  
- /* These don't really need to appear in any header file */
- Datum         pg_table_is_visible(PG_FUNCTION_ARGS);
- Datum         pg_type_is_visible(PG_FUNCTION_ARGS);
- Datum         pg_function_is_visible(PG_FUNCTION_ARGS);
- Datum         pg_operator_is_visible(PG_FUNCTION_ARGS);
- Datum         pg_opclass_is_visible(PG_FUNCTION_ARGS);
- Datum         pg_opfamily_is_visible(PG_FUNCTION_ARGS);
- Datum         pg_collation_is_visible(PG_FUNCTION_ARGS);
- Datum         pg_conversion_is_visible(PG_FUNCTION_ARGS);
- Datum         pg_ts_parser_is_visible(PG_FUNCTION_ARGS);
- Datum         pg_ts_dict_is_visible(PG_FUNCTION_ARGS);
- Datum         pg_ts_template_is_visible(PG_FUNCTION_ARGS);
- Datum         pg_ts_config_is_visible(PG_FUNCTION_ARGS);
- Datum         pg_my_temp_schema(PG_FUNCTION_ARGS);
- Datum         pg_is_other_temp_schema(PG_FUNCTION_ARGS);
  
  /*
   * RangeVarGetRelid
index 8068b82eab892078bd3d42db778cb8d8c9d0753b,6bc05cab3a28a990f4f46b93cd06478539f23bc1..6a365dceec1122adb11f394b6dc348e789b227f0
@@@ -3195,12 -3387,38 +3387,52 @@@ getObjectDescription(const ObjectAddres
                                break;
                        }
  
-               default:
-                       appendStringInfo(&buffer, "unrecognized object %u %u %d",
-                                                        object->classId,
-                                                        object->objectId,
-                                                        object->objectSubId);
-                       break;
+               case OCLASS_SUBSCRIPTION:
+                       {
+                               appendStringInfo(&buffer, _("subscription %s"),
+                                                                get_subscription_name(object->objectId));
+                               break;
+                       }
+               case OCLASS_TRANSFORM:
+                       {
+                               HeapTuple       trfTup;
+                               Form_pg_transform trfForm;
+                               trfTup = SearchSysCache1(TRFOID,
+                                                                                ObjectIdGetDatum(object->objectId));
+                               if (!HeapTupleIsValid(trfTup))
+                                       elog(ERROR, "could not find tuple for transform %u",
+                                                object->objectId);
+                               trfForm = (Form_pg_transform) GETSTRUCT(trfTup);
+                               appendStringInfo(&buffer, _("transform for %s language %s"),
+                                                                format_type_be(trfForm->trftype),
+                                                                get_language_name(trfForm->trflang, false));
+                               ReleaseSysCache(trfTup);
+                               break;
+                       }
++              case OCLASS_PGXC_NODE:
++                      {
++                              appendStringInfo(&buffer, _("node %s"),
++                                                               get_pgxc_nodename(object->objectId));
++                              break;
++                      }
++
++              case OCLASS_PGXC_GROUP:
++                      {
++                              appendStringInfo(&buffer, _("node group %s"),
++                                                               get_pgxc_groupname(object->objectId));
++                              break;
++                      }
++
+                       /*
+                        * There's intentionally no default: case here; we want the
+                        * compiler to warn if a new OCLASS hasn't been handled above.
+                        */
        }
  
        return buffer.data;
@@@ -3676,13 -3915,10 +3929,22 @@@ getObjectTypeDescription(const ObjectAd
                        appendStringInfoString(&buffer, "transform");
                        break;
  
-               case OCLASS_AM:
-                       appendStringInfoString(&buffer, "access method");
++              case OCLASS_PGXC_CLASS:
++                      appendStringInfoString(&buffer, "pgxc_class");
 +                      break;
 +
-               default:
-                       appendStringInfo(&buffer, "unrecognized %u", object->classId);
++              case OCLASS_PGXC_NODE:
++                      appendStringInfoString(&buffer, "node");
++                      break;
++
++              case OCLASS_PGXC_GROUP:
++                      appendStringInfoString(&buffer, "node group");
 +                      break;
++
+                       /*
+                        * There's intentionally no default: case here; we want the
+                        * compiler to warn if a new OCLASS hasn't been handled above.
+                        */
        }
  
        return buffer.data;
@@@ -4635,27 -4965,11 +4991,44 @@@ getObjectIdentityParts(const ObjectAddr
                                heap_close(transformDesc, AccessShareLock);
                        }
                        break;
-               case OCLASS_AM:
++              
++              case OCLASS_PGXC_CLASS:
++                      /* 
++                       * XXX PG10MERGE: ISTM that we don't record dependencies on
++                       * pgxc_class, pgxc_node and pgxc_group. So it's not clear if we
++                       * really need corresponding OCLASS_* either. We should check this
++                       * in more detail.
++                       */
++                      break;
 +
-                               char       *amname;
++              case OCLASS_PGXC_NODE:
 +                      {
-                               amname = get_am_name(object->objectId);
-                               if (!amname)
-                                       elog(ERROR, "cache lookup failed for access method %u",
-                                                object->objectId);
-                               appendStringInfoString(&buffer, quote_identifier(amname));
++                              char       *nodename;
 +
-                                       *objname = list_make1(amname);
++                              nodename = get_pgxc_nodename(object->objectId);
 +                              if (objname)
-                       break;
++                                      *objname = list_make1(nodename);
++                              appendStringInfoString(&buffer,
++                                                                         quote_identifier(nodename));
++                              break;
 +                      }
-               default:
-                       appendStringInfo(&buffer, "unrecognized object %u %u %d",
-                                                        object->classId,
-                                                        object->objectId,
-                                                        object->objectSubId);
-                       break;
 +
++              case OCLASS_PGXC_GROUP:
++                      {
++                              char       *groupname;
++
++                              groupname = get_pgxc_groupname(object->objectId);
++                              if (objname)
++                                      *objname = list_make1(groupname);
++                              appendStringInfoString(&buffer,
++                                                                         quote_identifier(groupname));
++                              break;
++                      }
+                       /*
+                        * There's intentionally no default: case here; we want the
+                        * compiler to warn if a new OCLASS hasn't been handled above.
+                        */
        }
  
        /*
index 75621bd6e39d8e7f2ec6ec650f20b5f7c7787387,eaeabf13d68b23cc145f97156d1b10294d427ed4..0f7ab80f65a1a2e2ea2a2fbd4c10c98cdc2a2261
@@@ -3,8 -3,7 +3,8 @@@
   * pg_proc.c
   *      routines to support manipulation of the pg_proc relation
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
  #include "utils/acl.h"
  #include "utils/builtins.h"
  #include "utils/lsyscache.h"
+ #include "utils/regproc.h"
  #include "utils/rel.h"
  #include "utils/syscache.h"
 +#ifdef PGXC
 +#include "pgxc/execRemote.h"
 +#include "pgxc/pgxc.h"
 +#include "pgxc/planner.h"
 +#endif
  
  
- Datum         fmgr_internal_validator(PG_FUNCTION_ARGS);
- Datum         fmgr_c_validator(PG_FUNCTION_ARGS);
- Datum         fmgr_sql_validator(PG_FUNCTION_ARGS);
  typedef struct
  {
        char       *proname;
@@@ -940,17 -928,9 +934,17 @@@ fmgr_sql_validator(PG_FUNCTION_ARGS
                        querytree_list = NIL;
                        foreach(lc, raw_parsetree_list)
                        {
-                               Node       *parsetree = (Node *) lfirst(lc);
+                               RawStmt    *parsetree = lfirst_node(RawStmt, lc);
                                List       *querytree_sublist;
  
 +#ifdef PGXC
 +                              /* Block CTAS in SQL functions */
 +                              if (IsA(parsetree, CreateTableAsStmt))
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                      errmsg("In XC, SQL functions cannot contain utility statements")));
 +#endif
 +
                                querytree_sublist = pg_analyze_and_rewrite_params(parsetree,
                                                                                                                                  prosrc,
                                                                           (ParserSetupHook) sql_fn_parser_setup,
index 297010be9f26b2216acaf2c222626a442e7e1018,0000000000000000000000000000000000000000..a35cf7866dd6b8bdd254a4bc5cec65a8a4c34dab
mode 100644,000000..100644
--- /dev/null
@@@ -1,211 -1,0 +1,208 @@@
-       (void) simple_heap_insert(pgxcclassrel, htup);
-       CatalogUpdateIndexes(pgxcclassrel, htup);
 +/*-------------------------------------------------------------------------
 + *
 + * pgxc_class.c
 + *    routines to support manipulation of the pgxc_class relation
 + *
 + * Copyright (c) 1996-2010, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
 + *
 + *
 + *-------------------------------------------------------------------------
 + */
 +#include "postgres.h"
 +
 +#include "access/heapam.h"
 +#include "access/htup_details.h"
 +#include "catalog/dependency.h"
 +#include "catalog/indexing.h"
 +#include "catalog/namespace.h"
 +#include "catalog/pg_type.h"
 +#include "catalog/pgxc_class.h"
 +#include "utils/builtins.h"
 +#include "utils/rel.h"
 +#include "utils/syscache.h"
 +#include "pgxc/locator.h"
 +#include "utils/array.h"
 +
 +/*
 + * PgxcClassCreate
 + *            Create a pgxc_class entry
 + */
 +void
 +PgxcClassCreate(Oid pcrelid,
 +                              char pclocatortype,
 +                              int pcattnum,
 +                              int pchashalgorithm,
 +                              int pchashbuckets,
 +                              int numnodes,
 +                              Oid *nodes)
 +{
 +      Relation        pgxcclassrel;
 +      HeapTuple       htup;
 +      bool            nulls[Natts_pgxc_class];
 +      Datum           values[Natts_pgxc_class];
 +      int             i;
 +      oidvector       *nodes_array;
 +
 +      /* Build array of Oids to be inserted */
 +      nodes_array = buildoidvector(nodes, numnodes);
 +
 +      /* Iterate through attributes initializing nulls and values */
 +      for (i = 0; i < Natts_pgxc_class; i++)
 +      {
 +              nulls[i]  = false;
 +              values[i] = (Datum) 0;
 +      }
 +
 +      /* should not happen */
 +      if (pcrelid == InvalidOid)
 +      {
 +              elog(ERROR,"pgxc class relid invalid.");
 +              return;
 +      }
 +
 +      values[Anum_pgxc_class_pcrelid - 1]   = ObjectIdGetDatum(pcrelid);
 +      values[Anum_pgxc_class_pclocatortype - 1] = CharGetDatum(pclocatortype);
 +
 +      if (pclocatortype == LOCATOR_TYPE_HASH || pclocatortype == LOCATOR_TYPE_MODULO)
 +      {
 +              values[Anum_pgxc_class_pcattnum - 1] = UInt16GetDatum(pcattnum);
 +              values[Anum_pgxc_class_pchashalgorithm - 1] = UInt16GetDatum(pchashalgorithm);
 +              values[Anum_pgxc_class_pchashbuckets - 1] = UInt16GetDatum(pchashbuckets);
 +      }
 +
 +      /* Node information */
 +      values[Anum_pgxc_class_nodes - 1] = PointerGetDatum(nodes_array);
 +
 +      /* Open the relation for insertion */
 +      pgxcclassrel = heap_open(PgxcClassRelationId, RowExclusiveLock);
 +
 +      htup = heap_form_tuple(pgxcclassrel->rd_att, values, nulls);
 +
-       simple_heap_update(rel, &oldtup->t_self, newtup);
-       CatalogUpdateIndexes(rel, newtup);
++      CatalogTupleInsert(pgxcclassrel, htup);
 +
 +      heap_close(pgxcclassrel, RowExclusiveLock);
 +}
 +
 +
 +/*
 + * PgxcClassAlter
 + *            Modify a pgxc_class entry with given data
 + */
 +void
 +PgxcClassAlter(Oid pcrelid,
 +                         char pclocatortype,
 +                         int pcattnum,
 +                         int pchashalgorithm,
 +                         int pchashbuckets,
 +                         int numnodes,
 +                         Oid *nodes,
 +                         PgxcClassAlterType type)
 +{
 +      Relation        rel;
 +      HeapTuple       oldtup, newtup;
 +      oidvector  *nodes_array;
 +      Datum           new_record[Natts_pgxc_class];
 +      bool            new_record_nulls[Natts_pgxc_class];
 +      bool            new_record_repl[Natts_pgxc_class];
 +
 +      Assert(OidIsValid(pcrelid));
 +
 +      rel = heap_open(PgxcClassRelationId, RowExclusiveLock);
 +      oldtup = SearchSysCacheCopy1(PGXCCLASSRELID,
 +                                                               ObjectIdGetDatum(pcrelid));
 +
 +      if (!HeapTupleIsValid(oldtup)) /* should not happen */
 +              elog(ERROR, "cache lookup failed for pgxc_class %u", pcrelid);
 +
 +      /* Build array of Oids to be inserted */
 +      nodes_array = buildoidvector(nodes, numnodes);
 +
 +      /* Initialize fields */
 +      MemSet(new_record, 0, sizeof(new_record));
 +      MemSet(new_record_nulls, false, sizeof(new_record_nulls));
 +      MemSet(new_record_repl, false, sizeof(new_record_repl));
 +
 +      /* Fields are updated depending on operation type */
 +      switch (type)
 +      {
 +              case PGXC_CLASS_ALTER_DISTRIBUTION:
 +                      new_record_repl[Anum_pgxc_class_pclocatortype - 1] = true;
 +                      new_record_repl[Anum_pgxc_class_pcattnum - 1] = true;
 +                      new_record_repl[Anum_pgxc_class_pchashalgorithm - 1] = true;
 +                      new_record_repl[Anum_pgxc_class_pchashbuckets - 1] = true;
 +                      break;
 +              case PGXC_CLASS_ALTER_NODES:
 +                      new_record_repl[Anum_pgxc_class_nodes - 1] = true;
 +                      break;
 +              case PGXC_CLASS_ALTER_ALL:
 +              default:
 +                      new_record_repl[Anum_pgxc_class_pcrelid - 1] = true;
 +                      new_record_repl[Anum_pgxc_class_pclocatortype - 1] = true;
 +                      new_record_repl[Anum_pgxc_class_pcattnum - 1] = true;
 +                      new_record_repl[Anum_pgxc_class_pchashalgorithm - 1] = true;
 +                      new_record_repl[Anum_pgxc_class_pchashbuckets - 1] = true;
 +                      new_record_repl[Anum_pgxc_class_nodes - 1] = true;
 +      }
 +
 +      /* Set up new fields */
 +      /* Relation Oid */
 +      if (new_record_repl[Anum_pgxc_class_pcrelid - 1])
 +              new_record[Anum_pgxc_class_pcrelid - 1] = ObjectIdGetDatum(pcrelid);
 +
 +      /* Locator type */
 +      if (new_record_repl[Anum_pgxc_class_pclocatortype - 1])
 +              new_record[Anum_pgxc_class_pclocatortype - 1] = CharGetDatum(pclocatortype);
 +
 +      /* Attribute number of distribution column */
 +      if (new_record_repl[Anum_pgxc_class_pcattnum - 1])
 +              new_record[Anum_pgxc_class_pcattnum - 1] = UInt16GetDatum(pcattnum);
 +
 +      /* Hash algorithm type */
 +      if (new_record_repl[Anum_pgxc_class_pchashalgorithm - 1])
 +              new_record[Anum_pgxc_class_pchashalgorithm - 1] = UInt16GetDatum(pchashalgorithm);
 +
 +      /* Hash buckets */
 +      if (new_record_repl[Anum_pgxc_class_pchashbuckets - 1])
 +              new_record[Anum_pgxc_class_pchashbuckets - 1] = UInt16GetDatum(pchashbuckets);
 +
 +      /* Node information */
 +      if (new_record_repl[Anum_pgxc_class_nodes - 1])
 +              new_record[Anum_pgxc_class_nodes - 1] = PointerGetDatum(nodes_array);
 +
 +      /* Update relation */
 +      newtup = heap_modify_tuple(oldtup, RelationGetDescr(rel),
 +                                                         new_record,
 +                                                         new_record_nulls, new_record_repl);
++      CatalogTupleUpdate(rel, &oldtup->t_self, newtup);
 +
 +      heap_close(rel, RowExclusiveLock);
 +}
 +
 +/*
 + * RemovePGXCClass():
 + *            Remove extended PGXC information
 + */
 +void
 +RemovePgxcClass(Oid pcrelid)
 +{
 +      Relation  relation;
 +      HeapTuple tup;
 +
 +      /*
 +       * Delete the pgxc_class tuple.
 +       */
 +      relation = heap_open(PgxcClassRelationId, RowExclusiveLock);
 +      tup = SearchSysCache(PGXCCLASSRELID,
 +                                               ObjectIdGetDatum(pcrelid),
 +                                               0, 0, 0);
 +
 +      if (!HeapTupleIsValid(tup)) /* should not happen */
 +              elog(ERROR, "cache lookup failed for pgxc_class %u", pcrelid);
 +
 +      simple_heap_delete(relation, &tup->t_self);
 +
 +      ReleaseSysCache(tup);
 +
 +      heap_close(relation, RowExclusiveLock);
 +}
index a759e16c72df6ec6f217c12f68e0d8b9e76f1c09,f677916d0396f2f73ef77a63785e6f80e4172958..d5c4754d0168631f00ad28ce51162739a6d2d539
@@@ -3,8 -3,7 +3,8 @@@
   * storage.c
   *      code to create and destroy physical storage for relations
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
index 14aad4fd7c08274cf573e7a797adb77342dec0ef,ecdd8950ee02a547bf02acc2a74b85e8c685f707..67e4146c6cb7310ca93d828275ddcdda1941bd75
@@@ -3,8 -3,7 +3,8 @@@
   * analyze.c
   *      the Postgres statistics generator
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
Simple merge
index f45da2d9144bf375930e936dd117cce1f77ddb0c,1c17927c499d7d4a706f4578d9eb1b605b85066d..236b582f7c6dc3f8ef9e5b71632e3de68f45ec6e
@@@ -4,8 -4,7 +4,8 @@@
   *
   * PostgreSQL object comments utility code.
   *
-  * Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+  * Copyright (c) 1996-2017, PostgreSQL Global Development Group
   *
   * IDENTIFICATION
   *      src/backend/commands/comment.c
index 461e94ed0bdd606a534b1ccf0797cb2636dbc627,84b1a54cb9b4ed81015ef96a30f7c01179750d99..5d5e409c7dcd789b0018d9e0fb2c5c081cdaad7e
@@@ -3,8 -3,7 +3,8 @@@
   * copy.c
   *            Implements the COPY utility command
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
  #include "miscadmin.h"
  #include "optimizer/clauses.h"
  #include "optimizer/planner.h"
 +#ifdef PGXC
 +#include "pgxc/pgxc.h"
 +#include "pgxc/execRemote.h"
 +#include "pgxc/locator.h"
 +#include "pgxc/remotecopy.h"
 +#include "nodes/nodes.h"
 +#include "pgxc/poolmgr.h"
 +#include "catalog/pgxc_node.h"
 +#endif
  #include "nodes/makefuncs.h"
 +#include "optimizer/pgxcship.h"
+ #include "parser/parse_relation.h"
  #include "rewrite/rewriteHandler.h"
  #include "storage/fd.h"
  #include "tcop/tcopprot.h"
@@@ -79,10 -61,8 +80,11 @@@ typedef enum CopyDes
  {
        COPY_FILE,                                      /* to/from file (or a piped program) */
        COPY_OLD_FE,                            /* to/from frontend (2.0 protocol) */
-       COPY_NEW_FE                                     /* to/from frontend (3.0 protocol) */
 -      COPY_NEW_FE,                            /* to/from frontend (3.0 protocol) */
++      COPY_NEW_FE,                                    /* to/from frontend (3.0 protocol) */
 +#ifdef PGXC
-       ,COPY_BUFFER                            /* Do not send, just prepare */
++      COPY_BUFFER,                            /* Do not send, just prepare */
 +#endif
+       COPY_CALLBACK                           /* to/from callback function */
  } CopyDest;
  
  /*
@@@ -577,11 -531,9 +562,14 @@@ CopySendEndOfRow(CopyState cstate
                        /* Dump the accumulated row as one CopyData message */
                        (void) pq_putmessage('d', fe_msgbuf->data, fe_msgbuf->len);
                        break;
 +#ifdef PGXC
 +              case COPY_BUFFER:
 +                      /* Do not send yet anywhere, just return */
 +                      return;
 +#endif
+               case COPY_CALLBACK:
+                       Assert(false);          /* Not yet supported. */
+                       break;
        }
  
        resetStringInfo(fe_msgbuf);
@@@ -696,11 -648,9 +684,14 @@@ CopyGetData(CopyState cstate, void *dat
                                bytesread += avail;
                        }
                        break;
 +#ifdef PGXC
 +              case COPY_BUFFER:
 +                      elog(ERROR, "COPY_BUFFER not allowed in this context");
 +                      break;
 +#endif
+               case COPY_CALLBACK:
+                       bytesread = cstate->data_source_cb(databuf, minread, maxread);
+                       break;
        }
  
        return bytesread;
@@@ -976,19 -970,9 +1010,18 @@@ DoCopy(ParseState *pstate, const CopySt
                        PreventCommandIfReadOnly("COPY FROM");
                PreventCommandIfParallelMode("COPY FROM");
  
-               cstate = BeginCopyFrom(rel, stmt->filename, stmt->is_program,
-                                                          stmt->attlist, stmt->options);
-               cstate->range_table = range_table;
+               cstate = BeginCopyFrom(pstate, rel, stmt->filename, stmt->is_program,
+                                                          NULL, stmt->attlist, stmt->options);
                *processed = CopyFrom(cstate);  /* copy from file to database */
 +#ifdef XCP
 +              /*
 +               * We should record insert to distributed table.
 +               * Bulk inserts into local tables are recorded when heap tuples are
 +               * written.
 +               */
 +              if (IS_PGXC_COORDINATOR && rel->rd_locator_info)
 +                      pgstat_count_remote_insert(rel, (int) *processed);
 +#endif
                EndCopyFrom(cstate);
        }
        else
@@@ -1413,31 -1413,30 +1462,55 @@@ BeginCopy(ParseState *pstate
                                        (errcode(ERRCODE_UNDEFINED_COLUMN),
                                         errmsg("table \"%s\" does not have OIDs",
                                                        RelationGetRelationName(cstate->rel))));
+               /* Initialize state for CopyFrom tuple routing. */
+               if (is_from && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+               {
+                       PartitionDispatch *partition_dispatch_info;
+                       ResultRelInfo *partitions;
+                       TupleConversionMap **partition_tupconv_maps;
+                       TupleTableSlot *partition_tuple_slot;
+                       int                     num_parted,
+                                               num_partitions;
+                       ExecSetupPartitionTupleRouting(rel,
+                                                                                  &partition_dispatch_info,
+                                                                                  &partitions,
+                                                                                  &partition_tupconv_maps,
+                                                                                  &partition_tuple_slot,
+                                                                                  &num_parted, &num_partitions);
+                       cstate->partition_dispatch_info = partition_dispatch_info;
+                       cstate->num_dispatch = num_parted;
+                       cstate->partitions = partitions;
+                       cstate->num_partitions = num_partitions;
+                       cstate->partition_tupconv_maps = partition_tupconv_maps;
+                       cstate->partition_tuple_slot = partition_tuple_slot;
+               }
 +#ifdef PGXC
 +              /* Get copy statement and execution node information */
 +              if (IS_PGXC_COORDINATOR)
 +              {
 +                      RemoteCopyData *remoteCopyState = (RemoteCopyData *) palloc0(sizeof(RemoteCopyData));
 +                      List *attnums = CopyGetAttnums(tupDesc, cstate->rel, attnamelist);
 +
 +                      /* Setup correct COPY FROM/TO flag */
 +                      remoteCopyState->is_from = is_from;
 +
 +                      /* Get execution node list */
 +                      RemoteCopy_GetRelationLoc(remoteCopyState,
 +                                                                        cstate->rel,
 +                                                                        attnums);
 +                      /* Build remote query */
 +                      RemoteCopy_BuildStatement(remoteCopyState,
 +                                                                        cstate->rel,
 +                                                                        GetRemoteCopyOptions(cstate),
 +                                                                        attnamelist,
 +                                                                        attnums);
 +
 +                      /* Then assign built structure */
 +                      cstate->remoteCopyState = remoteCopyState;
 +              }
 +#endif
        }
        else
        {
                                         errmsg("multi-statement DO INSTEAD rules are not supported for COPY")));
                }
  
-               query = (Query *) linitial(rewritten);
+               query = linitial_node(Query, rewritten);
  
 -              /* The grammar allows SELECT INTO, but we don't support that */
 -              if (query->utilityStmt != NULL &&
 -                      IsA(query->utilityStmt, CreateTableAsStmt))
 -                      ereport(ERROR,
 -                                      (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 -                                       errmsg("COPY (SELECT INTO) is not supported")));
 -
                Assert(query->utilityStmt == NULL);
  
                /*
@@@ -2698,10 -2700,13 +2887,16 @@@ CopyFrom(CopyState cstate
                         * tuples inserted by an INSERT command.
                         */
                        processed++;
+                       if (saved_resultRelInfo)
+                       {
+                               resultRelInfo = saved_resultRelInfo;
+                               estate->es_result_relation_info = resultRelInfo;
+                       }
                }
 +#ifdef PGXC
 +              }
 +#endif
        }
  
        /* Flush any remaining buffered tuples */
@@@ -3499,17 -3419,9 +3746,17 @@@ NextCopyFrom(CopyState cstate, ExprCont
                Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
  
                values[defmap[i]] = ExecEvalExpr(defexprs[i], econtext,
-                                                                                &nulls[defmap[i]], NULL);
+                                                                                &nulls[defmap[i]]);
        }
  
 +#ifdef PGXC
 +      if (IS_PGXC_COORDINATOR)
 +      {
 +              /* Append default values to the data-row in output format. */
 +              append_defvals(values, cstate);
 +      }
 +#endif
 +
        return true;
  }
  
index d87945e4d3802409fed7aa4c06f65f64fec030be,11038f6764c02656d193f6e14fc6a5232dd0c60e..baeb8b591e457c331af9220f0430a34d0b469069
@@@ -1571,14 -1473,10 +1606,15 @@@ AlterDatabase(ParseState *pstate, Alter
                        ereport(ERROR,
                                        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                           errmsg("option \"%s\" cannot be specified with other options",
-                                         dtablespace->defname)));
+                                         dtablespace->defname),
+                                        parser_errposition(pstate, dtablespace->location)));
                /* this case isn't allowed within a transaction block */
 -              PreventTransactionChain(isTopLevel, "ALTER DATABASE SET TABLESPACE");
 +#ifdef PGXC
 +              /* ... but we allow it on remote nodes */
 +              if (IS_PGXC_LOCAL_COORDINATOR)
 +#endif
 +                      PreventTransactionChain(isTopLevel, "ALTER DATABASE SET TABLESPACE");
 +
                movedb(stmt->dbname, defGetString(dtablespace));
                return InvalidOid;
        }
index 0b58639229e9fde8501e9a729a2103b55abc3409,4cfab418a6f8cf91f55c2a9cfc6c92d5f02da285..51d8783fb6405aaf17fba9fbfc5093548713c9f1
@@@ -1168,17 -1178,21 +1178,26 @@@ EventTriggerSupportsObjectClass(ObjectC
                case OCLASS_USER_MAPPING:
                case OCLASS_DEFACL:
                case OCLASS_EXTENSION:
 +#ifdef PGXC
 +              case OCLASS_PGXC_CLASS:
 +              case OCLASS_PGXC_NODE:
 +              case OCLASS_PGXC_GROUP:
 +#endif
                case OCLASS_POLICY:
-               case OCLASS_AM:
+               case OCLASS_PUBLICATION:
+               case OCLASS_PUBLICATION_REL:
+               case OCLASS_SUBSCRIPTION:
+               case OCLASS_TRANSFORM:
                        return true;
+                       /*
+                        * There's intentionally no default: case here; we want the
+                        * compiler to warn if a new OCLASS hasn't been handled above.
+                        */
        }
  
-       return true;
+       /* Shouldn't get here, but if we do, say "no support" */
+       return false;
  }
  
  bool
index cdc0fe8f0c73323f1f9c305920dbc37fcfe4f1f2,9359d0a83ad7e19dd53674a186029307a25389c1..1bb5d7582ffbca144625b5bdf872a8ee732c5d04
@@@ -3,8 -3,7 +3,8 @@@
   * explain.c
   *      Explain query execution plans
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994-5, Regents of the University of California
   *
   * IDENTIFICATION
@@@ -356,22 -346,8 +369,22 @@@ ExplainOneQuery(Query *query, int curso
        /* planner will not cope with utility statements */
        if (query->commandType == CMD_UTILITY)
        {
 -              ExplainOneUtility(query->utilityStmt, into, es, queryString, params,
 -                                                queryEnv);
 +              /*
 +               * If we are running EXPLAIN ANALYZE, transform the CTAS such that the
 +               * target table is created first and select result is inserted into the
 +               * table. The EXPLAIN ANALYZE would really just show the plan for the
 +               * INSERT INTO generated by QueryRewriteCTAS, but that's OK.
 +               */
 +              if (es->analyze && IsA(query->utilityStmt, CreateTableAsStmt))
 +              {
 +                      List *rewritten = QueryRewriteCTAS(query);
 +                      Assert(list_length(rewritten) == 1);
-                       ExplainOneQuery((Query *) linitial(rewritten), into, es,
-                                       queryString, params);
++                      ExplainOneQuery((Query *) linitial(rewritten), cursorOptions,
++                                      into, es, queryString, params, queryEnv);
 +              }
 +              else
 +                      ExplainOneUtility(query->utilityStmt, into, es,
-                                       queryString, params);
++                                      queryString, params, queryEnv);
                return;
        }
  
index be8641b5731358a4766a9c2929efb0bc1ba10dcf,c3718b08c1711f7f275b0c44cf6aee01ffe78f46..fa79e719553b91f5a53ee18f2fd123d5450597c3
@@@ -700,17 -712,20 +713,21 @@@ execute_sql_string(const char *sql, con
         * parsetree.  We must fully execute each query before beginning parse
         * analysis on the next one, since there may be interdependencies.
         */
 -      foreach(lc1, raw_parsetree_list)
 +      forboth(lc1, raw_parsetree_list, lc3, querysource_list)
        {
-               Node       *parsetree = (Node *) lfirst(lc1);
+               RawStmt    *parsetree = lfirst_node(RawStmt, lc1);
 +              char       *querysource = (char *) lfirst(lc3);
                List       *stmt_list;
                ListCell   *lc2;
  
+               /* Be sure parser can see any DDL done so far */
+               CommandCounterIncrement();
                stmt_list = pg_analyze_and_rewrite(parsetree,
 -                                                                                 sql,
 +                                                                                 querysource,
                                                                                   NULL,
-                                                                                  0);
+                                                                                  0,
+                                                                                  NULL);
                stmt_list = pg_plan_queries(stmt_list, CURSOR_OPT_PARALLEL_OK, NULL);
  
                foreach(lc2, stmt_list)
                        {
                                QueryDesc  *qdesc;
  
-                               qdesc = CreateQueryDesc((PlannedStmt *) stmt,
+                               qdesc = CreateQueryDesc(stmt,
 -                                                                              sql,
 +                                                                              querysource,
                                                                                GetActiveSnapshot(), NULL,
-                                                                               dest, NULL, 0);
+                                                                               dest, NULL, NULL, 0);
  
                                ExecutorStart(qdesc, 0);
-                               ExecutorRun(qdesc, ForwardScanDirection, 0);
+                               ExecutorRun(qdesc, ForwardScanDirection, 0, true);
                                ExecutorFinish(qdesc);
                                ExecutorEnd(qdesc);
  
                        }
                        else
                        {
+                               if (IsA(stmt->utilityStmt, TransactionStmt))
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                                        errmsg("transaction control statements are not allowed within an extension script")));
                                ProcessUtility(stmt,
 -                                                         sql,
 +                                                         querysource,
                                                           PROCESS_UTILITY_QUERY,
                                                           NULL,
+                                                          NULL,
                                                           dest,
 +#ifdef PGXC
 +                                                         true,        /* this is created at remote node level */
 +#endif /* PGXC */
                                                           NULL);
                        }
  
@@@ -1363,11 -1440,8 +1445,8 @@@ CreateExtensionInternal(char *extension
                        csstmt->authrole = NULL;        /* will be created by current user */
                        csstmt->schemaElts = NIL;
                        csstmt->if_not_exists = false;
- #ifdef PGXC
-                       CreateSchemaCommand(csstmt, NULL, true);
- #else
-                       CreateSchemaCommand(csstmt, NULL);
- #endif
+                       CreateSchemaCommand(csstmt, "(generated CREATE SCHEMA command)",
 -                                                              -1, -1);
++                                                              true, -1, -1);
  
                        /*
                         * CreateSchemaCommand includes CommandCounterIncrement, so new
index 6963855373c7942a059f73e577e4503ef9eb0af6,ba85952baaef52b07463038c1d462337fb9ea983..554656b6ec9c976796f43df270634c60dcf46db3
@@@ -1593,15 -1612,19 +1612,22 @@@ ImportForeignSchema(ImportForeignSchema
                        /* Ensure creation schema is the one given in IMPORT statement */
                        cstmt->base.relation->schemaname = pstrdup(stmt->local_schema);
  
+                       /* No planning needed, just make a wrapper PlannedStmt */
+                       pstmt = makeNode(PlannedStmt);
+                       pstmt->commandType = CMD_UTILITY;
+                       pstmt->canSetTag = false;
+                       pstmt->utilityStmt = (Node *) cstmt;
+                       pstmt->stmt_location = rs->stmt_location;
+                       pstmt->stmt_len = rs->stmt_len;
                        /* Execute statement */
-                       ProcessUtility((Node *) cstmt,
+                       ProcessUtility(pstmt,
                                                   cmd,
 -                                                 PROCESS_UTILITY_SUBCOMMAND, NULL, NULL,
 -                                                 None_Receiver, NULL);
 +                                                 PROCESS_UTILITY_SUBCOMMAND, NULL,
++                                                 NULL,
 +                                                 None_Receiver,
- #ifdef XCP
 +                                                 false,
- #endif
 +                                                 NULL);
  
                        /* Be sure to advance the command counter between subcommands */
                        CommandCounterIncrement();
index 1587fb6e80522e6d0016551de4ee75d4ccd218ed,486179938c3e8537dc2cf631f4693a488168a53d..87ff7faf48b7c6ebaad7f772e718423f34a6c887
@@@ -3,10 -3,8 +3,10 @@@
   * indexcmds.c
   *      POSTGRES define and remove index code.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *
   * IDENTIFICATION
index 442ee236a1b2014692d334ef588d36797582ec37,9ffd91ea0e3284df0e9e55468b78cde189eb7996..2061568d7f850a94d9df7cc5799d0e5e54040249
  #include "executor/executor.h"
  #include "executor/spi.h"
  #include "miscadmin.h"
 +#ifdef PGXC
 +#include "nodes/makefuncs.h"
 +#endif
  #include "parser/parse_relation.h"
+ #include "pgstat.h"
  #include "rewrite/rewriteHandler.h"
  #include "storage/lmgr.h"
  #include "storage/smgr.h"
index bbd5ca54dcff6a2da92a5d95274176f3e126c6c9,167910fcb5b140d4227c0a8a3d06b3d5b11ff60b..619145998200b671b5a957a00b69adcf894e54b1
@@@ -9,8 -9,7 +9,8 @@@
   * storage management for portals (but doesn't run any queries in them).
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@@ -77,18 -96,9 +100,17 @@@ PerformCursorOpen(DeclareCursorStmt *cs
         */
        portal = CreatePortal(cstmt->portalname, false, false);
  
 +#ifdef PGXC
 +      /*
 +       * Consume the command id of the command creating the cursor
 +       */
 +      if (IS_PGXC_LOCAL_COORDINATOR)
 +              GetCurrentCommandId(true);
 +#endif
 +
        oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(portal));
  
-       stmt = copyObject(stmt);
-       stmt->utilityStmt = NULL;       /* make it look like plain SELECT */
+       plan = copyObject(plan);
  
        queryString = pstrdup(queryString);
  
index 316fc49109810936bdad9272e8ff197112001f1b,d265c77826f2b53b4c247901ad20d69dcead6302..287affa515011a5cbf43a3b0e53d3061c8101c3c
@@@ -89,10 -90,7 +105,10 @@@ PrepareQuery(PrepareStmt *stmt, const c
         * Create the CachedPlanSource before we do parse analysis, since it needs
         * to see the unmodified raw parse tree.
         */
-       plansource = CreateCachedPlan(stmt->query, queryString,
+       plansource = CreateCachedPlan(rawstmt, queryString,
 +#ifdef PGXC
 +                                                                stmt->name,
 +#endif
                                                                  CreateCommandTag(stmt->query));
  
        /* Transform list of TypeNames to array of type OIDs */
index 255ca89199d671c3cba92388de30f5f140a803ec,93425babbedb14b7a3c54e8c2d757210f9870a04..546b54bd9fb325d44f61d676c09e8f5891b000f7
@@@ -3,8 -3,7 +3,8 @@@
   * schemacmds.c
   *      schema creation/manipulation commands
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@@ -45,13 -40,16 +45,17 @@@ static void AlterSchemaOwner_internal(H
  
  /*
   * CREATE SCHEMA
+  *
+  * Note: caller should pass in location information for the whole
+  * CREATE SCHEMA statement, which in turn we pass down as the location
+  * of the component commands.  This comports with our general plan of
+  * reporting location/len for the whole command even when executing
+  * a subquery.
   */
  Oid
- #ifdef PGXC
- CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString, bool sentToRemote)
- #else
- CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString)
- #endif
+ CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString,
++                                      bool sentToRemote,
+                                       int stmt_location, int stmt_len)
  {
        const char *schemaName = stmt->schemaname;
        Oid                     namespaceId;
                                           queryString,
                                           PROCESS_UTILITY_SUBCOMMAND,
                                           NULL,
+                                          NULL,
                                           None_Receiver,
 +#ifdef PGXC
 +                                         true,
 +#endif /* PGXC */
                                           NULL);
                /* make sure later steps can see the object created here */
                CommandCounterIncrement();
        }
index e4a20294771d2d655c26758792264367ab0c5065,568b3022f2dfc7dcd5fcd2fa3691e8070705494f..0ccbe37a1b8e3d63134a61fcb6709a3aeefcf2e0
@@@ -3,10 -3,8 +3,10 @@@
   * sequence.c
   *      PostgreSQL sequences support code.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *
   * IDENTIFICATION
@@@ -15,7 -13,8 +15,9 @@@
   *-------------------------------------------------------------------------
   */
  #include "postgres.h"
++#include <math.h>
  
+ #include "access/bufmask.h"
  #include "access/htup_details.h"
  #include "access/multixact.h"
  #include "access/transam.h"
  #include "utils/lsyscache.h"
  #include "utils/resowner.h"
  #include "utils/syscache.h"
 +#include "commands/dbcommands.h"
 +
 +#ifdef PGXC
 +#include "pgxc/pgxc.h"
 +/* PGXC_COORD */
 +#include "access/gtm.h"
 +#include "utils/memutils.h"
 +#ifdef XCP
 +#include "utils/timestamp.h"
 +#endif
 +#endif
+ #include "utils/varlena.h"
  
 -
  /*
   * We don't want to log each fetching of a value from a sequence,
   * so we pre-log a few fetches in advance. In the event of
@@@ -92,11 -79,7 +98,11 @@@ typedef struct SeqTableDat
        int64           cached;                 /* last value already cached for nextval */
        /* if last != cached, we have not used up all the cached values */
        int64           increment;              /* copy of sequence's increment field */
-       /* note that increment is zero until we first do read_seq_tuple() */
+       /* note that increment is zero until we first do nextval_internal() */
 +#ifdef XCP
 +      TimestampTz last_call_time; /* the time when the last call as made */
 +      int64           range_multiplier; /* multiply this value with 2 next time */
 +#endif
  } SeqTableData;
  
  typedef SeqTableData *SeqTable;
@@@ -131,21 -93,19 +137,20 @@@ typedef struct rename_sequence_callback
  static SeqTableData *last_used_seq = NULL;
  
  static void fill_seq_with_data(Relation rel, HeapTuple tuple);
- static int64 nextval_internal(Oid relid);
- static Relation open_share_lock(SeqTable seq);
+ static Relation lock_and_open_sequence(SeqTable seq);
  static void create_seq_hashtable(void);
  static void init_sequence(Oid relid, SeqTable *p_elm, Relation *p_rel);
- static Form_pg_sequence read_seq_tuple(SeqTable elm, Relation rel,
-                          Buffer *buf, HeapTuple seqtuple);
- #ifdef PGXC
- static void init_params(List *options, bool isInit,
-                                               Form_pg_sequence new, List **owned_by, bool *is_restart);
- #else
- static void init_params(List *options, bool isInit,
-                       Form_pg_sequence new, List **owned_by);
- #endif
+ static Form_pg_sequence_data read_seq_tuple(Relation rel,
+                          Buffer *buf, HeapTuple seqdatatuple);
+ static LOCKMODE alter_sequence_get_lock_level(List *options);
+ static void init_params(ParseState *pstate, List *options, bool for_identity,
+                       bool isInit,
+                       Form_pg_sequence seqform,
+                       bool *changed_seqform,
 -                      Form_pg_sequence_data seqdataform, List **owned_by);
++                      Form_pg_sequence_data seqdataform, List **owned_by,
++                      bool *is_restart);
  static void do_setval(Oid relid, int64 next, bool iscalled);
- static void process_owned_by(Relation seqrel, List *owned_by);
+ static void process_owned_by(Relation seqrel, List *owned_by, bool for_identity);
  
  
  /*
@@@ -165,16 -127,9 +172,17 @@@ DefineSequence(ParseState *pstate, Crea
        TupleDesc       tupDesc;
        Datum           value[SEQ_COL_LASTCOL];
        bool            null[SEQ_COL_LASTCOL];
+       Datum           pgs_values[Natts_pg_sequence];
+       bool            pgs_nulls[Natts_pg_sequence];
        int                     i;
-       NameData        name;
 +#ifdef PGXC /* PGXC_COORD */
 +      GTM_Sequence    start_value = 1;
 +      GTM_Sequence    min_value = 1;
 +      GTM_Sequence    max_value = InvalidSequenceValue;
 +      GTM_Sequence    increment = 1;
 +      bool            cycle = false;
 +      bool            is_restart;
 +#endif
  
        /* Unlogged sequences are not implemented -- not clear if useful. */
        if (seq->sequence->relpersistence == RELPERSISTENCE_UNLOGGED)
        }
  
        /* Check and set all option values */
- #ifdef PGXC
-       init_params(seq->options, true, &new, &owned_by, &is_restart);
- #else
-       init_params(seq->options, true, &new, &owned_by);
- #endif
 -      init_params(pstate, seq->options, seq->for_identity, true, &seqform, &changed_seqform, &seqdataform, &owned_by);
++      init_params(pstate, seq->options, seq->for_identity, true, &seqform,
++                      &changed_seqform, &seqdataform, &owned_by, &is_restart);
  
        /*
         * Create relation (and fill value[] and null[] for the tuple)
  
        heap_close(rel, NoLock);
  
+       /* fill in pg_sequence */
+       rel = heap_open(SequenceRelationId, RowExclusiveLock);
+       tupDesc = RelationGetDescr(rel);
+       memset(pgs_nulls, 0, sizeof(pgs_nulls));
+       pgs_values[Anum_pg_sequence_seqrelid - 1] = ObjectIdGetDatum(seqoid);
+       pgs_values[Anum_pg_sequence_seqtypid - 1] = ObjectIdGetDatum(seqform.seqtypid);
+       pgs_values[Anum_pg_sequence_seqstart - 1] = Int64GetDatumFast(seqform.seqstart);
+       pgs_values[Anum_pg_sequence_seqincrement - 1] = Int64GetDatumFast(seqform.seqincrement);
+       pgs_values[Anum_pg_sequence_seqmax - 1] = Int64GetDatumFast(seqform.seqmax);
+       pgs_values[Anum_pg_sequence_seqmin - 1] = Int64GetDatumFast(seqform.seqmin);
+       pgs_values[Anum_pg_sequence_seqcache - 1] = Int64GetDatumFast(seqform.seqcache);
+       pgs_values[Anum_pg_sequence_seqcycle - 1] = BoolGetDatum(seqform.seqcycle);
+       tuple = heap_form_tuple(tupDesc, pgs_values, pgs_nulls);
+       CatalogTupleInsert(rel, tuple);
+       heap_freetuple(tuple);
+       heap_close(rel, RowExclusiveLock);
 +#ifdef PGXC  /* PGXC_COORD */
 +      /*
 +       * Remote Coordinator is in charge of creating sequence in GTM.
 +       * If sequence is temporary, it is not necessary to create it on GTM.
 +       */
 +      if (IS_PGXC_LOCAL_COORDINATOR)
 +      {
 +              char *seqname = GetGlobalSeqName(rel, NULL, NULL);
 +
 +              /* We also need to create it on the GTM */
 +              if (CreateSequenceGTM(seqname,
 +                                                        increment,
 +                                                        min_value,
 +                                                        max_value,
 +                              start_value, cycle) < 0)
 +              {
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_CONNECTION_FAILURE),
 +                                       errmsg("GTM error, could not create sequence")));
 +              }
 +
 +
 +              pfree(seqname);
 +      }
 +#endif
        return address;
  }
  
@@@ -510,23 -417,20 +496,29 @@@ AlterSequence(ParseState *pstate, Alter
        SeqTable        elm;
        Relation        seqrel;
        Buffer          buf;
-       HeapTupleData seqtuple;
-       Form_pg_sequence seq;
-       FormData_pg_sequence new;
+       HeapTupleData seqdatatuple;
+       Form_pg_sequence seqform;
+       Form_pg_sequence_data seqdata;
+       FormData_pg_sequence_data newseqdata;
+       bool            changed_seqform = false;
        List       *owned_by;
 +#ifdef PGXC
 +      GTM_Sequence    start_value;
 +      GTM_Sequence    last_value;
 +      GTM_Sequence    min_value;
 +      GTM_Sequence    max_value;
 +      GTM_Sequence    increment;
 +      bool                    cycle;
 +      bool                    is_restart;
 +#endif
        ObjectAddress address;
+       Relation        rel;
+       HeapTuple       tuple;
  
        /* Open and lock sequence. */
-       relid = RangeVarGetRelid(stmt->sequence, AccessShareLock, stmt->missing_ok);
+       relid = RangeVarGetRelid(stmt->sequence,
+                                                        alter_sequence_get_lock_level(stmt->options),
+                                                        stmt->missing_ok);
        if (relid == InvalidOid)
        {
                ereport(NOTICE,
                aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS,
                                           stmt->sequence->relname);
  
-       /* lock page' buffer and read tuple into new sequence structure */
-       seq = read_seq_tuple(elm, seqrel, &buf, &seqtuple);
+       rel = heap_open(SequenceRelationId, RowExclusiveLock);
+       tuple = SearchSysCacheCopy1(SEQRELID,
+                                                               ObjectIdGetDatum(relid));
+       if (!HeapTupleIsValid(tuple))
+               elog(ERROR, "cache lookup failed for sequence %u",
+                        relid);
+       seqform = (Form_pg_sequence) GETSTRUCT(tuple);
  
-       /* Copy old values of options into workspace */
-       memcpy(&new, seq, sizeof(FormData_pg_sequence));
+       /* lock page's buffer and read tuple into new sequence structure */
+       seqdata = read_seq_tuple(seqrel, &buf, &seqdatatuple);
+       /* Copy old sequence data into workspace */
+       memcpy(&newseqdata, seqdata, sizeof(FormData_pg_sequence_data));
  
        /* Check and set new values */
- #ifdef PGXC
-       init_params(stmt->options, false, &new, &owned_by, &is_restart);
- #else
-       init_params(stmt->options, false, &new, &owned_by);
- #endif
 -      init_params(pstate, stmt->options, stmt->for_identity, false, seqform, &changed_seqform, &newseqdata, &owned_by);
++      init_params(pstate, stmt->options, stmt->for_identity, false, seqform,
++                      &changed_seqform, &newseqdata, &owned_by, &is_restart);
  
        /* Clear local cache so that we don't think we have cached numbers */
        /* Note that we do not change the currval() state */
                GetTopTransactionId();
  
        /* Now okay to update the on-disk tuple */
-       increment = new.increment_by;
-       min_value = new.min_value;
-       max_value = new.max_value;
-       start_value = new.start_value;
-       last_value = new.last_value;
-       cycle = new.is_cycled;
 +#ifdef PGXC
++      increment = seqform->seqincrement;
++      min_value = seqform->seqmin;
++      max_value = seqform->seqmax;
++      start_value = seqform->seqstart;
++      last_value = elm->last;
++      cycle = seqform->seqcycle;
 +#endif
 +
        START_CRIT_SECTION();
  
-       memcpy(seq, &new, sizeof(FormData_pg_sequence));
+       memcpy(seqdata, &newseqdata, sizeof(FormData_pg_sequence_data));
  
        MarkBufferDirty(buf);
  
  
        ObjectAddressSet(address, RelationRelationId, relid);
  
+       if (changed_seqform)
+               CatalogTupleUpdate(rel, &tuple->t_self, tuple);
+       heap_close(rel, RowExclusiveLock);
        relation_close(seqrel, NoLock);
  
 +#ifdef PGXC
 +      /*
 +       * Remote Coordinator is in charge of create sequence in GTM
 +       * If sequence is temporary, no need to go through GTM.
 +       */
 +      if (IS_PGXC_LOCAL_COORDINATOR && seqrel->rd_backend != MyBackendId)
 +      {
 +              char *seqname = GetGlobalSeqName(seqrel, NULL, NULL);
 +
 +              /* We also need to create it on the GTM */
 +              if (AlterSequenceGTM(seqname,
 +                                                       increment,
 +                                                       min_value,
 +                                                       max_value,
 +                                                       start_value,
 +                                                       last_value,
 +                                                       cycle,
 +                                                       is_restart) < 0)
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_CONNECTION_FAILURE),
 +                                       errmsg("GTM error, could not alter sequence")));
 +              pfree(seqname);
 +      }
 +#endif
        return address;
  }
  
@@@ -728,91 -629,24 +751,104 @@@ nextval_internal(Oid relid, bool check_
                return elm->last;
        }
  
+       pgstuple = SearchSysCache1(SEQRELID, ObjectIdGetDatum(relid));
+       if (!HeapTupleIsValid(pgstuple))
+               elog(ERROR, "cache lookup failed for sequence %u", relid);
+       pgsform = (Form_pg_sequence) GETSTRUCT(pgstuple);
+       incby = pgsform->seqincrement;
+       maxv = pgsform->seqmax;
+       minv = pgsform->seqmin;
+       cache = pgsform->seqcache;
+       cycle = pgsform->seqcycle;
+       ReleaseSysCache(pgstuple);
        /* lock page' buffer and read tuple */
-       seq = read_seq_tuple(elm, seqrel, &buf, &seqtuple);
+       seq = read_seq_tuple(seqrel, &buf, &seqdatatuple);
        page = BufferGetPage(buf);
  
-               int64 range = seq->cache_value; /* how many values to ask from GTM? */
 +      {
-       fetch = cache = seq->cache_value;
++              int64 range = cache; /* how many values to ask from GTM? */
 +              int64 rangemax; /* the max value returned from the GTM for our request */
 +              char *seqname = GetGlobalSeqName(seqrel, NULL, NULL);
 +
 +              /*
 +               * Above, we still use the page as a locking mechanism to handle
 +               * concurrency
 +               *
 +               * If the user has set a CACHE parameter, we use that. Else we pass in
 +               * the SequenceRangeVal value
 +               */
 +              if (range == DEFAULT_CACHEVAL && SequenceRangeVal > range)
 +              {
 +                      TimestampTz curtime = GetCurrentTimestamp();
 +
 +                      if (!TimestampDifferenceExceeds(elm->last_call_time,
 +                                                                                                      curtime, 1000))
 +                      {
 +                              /*
 +                               * The previous GetNextValGTM call was made just a while back.
 +                               * Request double the range of what was requested in the
 +                               * earlier call. Honor the SequenceRangeVal boundary
 +                               * value to limit very large range requests!
 +                               */
 +                              elm->range_multiplier *= 2;
 +                              if (elm->range_multiplier < SequenceRangeVal)
 +                                      range = elm->range_multiplier;
 +                              else
 +                                      elm->range_multiplier = range = SequenceRangeVal;
 +
 +                              elog(DEBUG1, "increase sequence range %ld", range);
 +                      }
 +                      else if (TimestampDifferenceExceeds(elm->last_call_time,
 +                                                                                              curtime, 5000))
 +                      {
 +                              /* The previous GetNextValGTM call was pretty old */
 +                              range = elm->range_multiplier = DEFAULT_CACHEVAL;
 +                              elog(DEBUG1, "reset sequence range %ld", range);
 +                      }
 +                      else if (TimestampDifferenceExceeds(elm->last_call_time,
 +                                                                                              curtime, 3000))
 +                      {
 +                              /*
 +                               * The previous GetNextValGTM call was made quite some time
 +                               * ago. Try to reduce the range request to reduce the gap
 +                               */
 +                              if (elm->range_multiplier != DEFAULT_CACHEVAL)
 +                              {
 +                                      range = elm->range_multiplier =
 +                                                              rint(elm->range_multiplier/2);
 +                                      elog(DEBUG1, "decrease sequence range %ld", range);
 +                              }
 +                      }
 +                      else
 +                      {
 +                              /*
 +                               * Current range_multiplier alllows to cache sequence values
 +                               * for 1-3 seconds of work. Keep that rate.
 +                               */
 +                              range = elm->range_multiplier;
 +                      }
 +                      elm->last_call_time = curtime;
 +              }
 +
 +              result = (int64) GetNextValGTM(seqname, range, &rangemax);
 +              pfree(seqname);
 +
 +              /* Update the on-disk data */
 +              seq->last_value = result; /* last fetched number */
 +              seq->is_called = true;
 +
 +              /* save info in local cache */
 +              elm->last = result;                     /* last returned number */
 +              elm->cached = rangemax;         /* last fetched range max limit */
 +              elm->last_valid = true;
 +
 +              last_used_seq = elm;
 +      }
 +
+       elm->increment = incby;
+       last = next = result = seq->last_value;
+       fetch = cache;
        log = seq->log_cnt;
  
        if (!seq->is_called)
@@@ -1010,28 -963,50 +1172,59 @@@ do_setval(Oid relid, int64 next, bool i
                                                bufm, bufx)));
        }
  
 -      /* Set the currval() state only if iscalled = true */
 -      if (iscalled)
        {
 -              elm->last = next;               /* last returned number */
 -              elm->last_valid = true;
 -      }
 +              char *seqname = GetGlobalSeqName(seqrel, NULL, NULL);
  
 -              Page            page = BufferGetPage(buf);
 +              if (SetValGTM(seqname, next, iscalled) < 0)
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_CONNECTION_FAILURE),
 +                                       errmsg("GTM error, could not obtain sequence value")));
 +              pfree(seqname);
 +              /* Update the on-disk data */
 +              seq->last_value = next; /* last fetched number */
 +              seq->is_called = iscalled;
 +              seq->log_cnt = (iscalled) ? 0 : 1;
 +
 +              if (iscalled)
 +              {
 +                      elm->last = next;               /* last returned number */
 +                      elm->last_valid = true;
 +              }
++      }
+       /* In any case, forget any future cached numbers */
+       elm->cached = elm->last;
+       /* check the comment above nextval_internal()'s equivalent call. */
+       if (RelationNeedsWAL(seqrel))
+               GetTopTransactionId();
+       /* ready to change the on-disk (or really, in-buffer) tuple */
+       START_CRIT_SECTION();
+       seq->last_value = next;         /* last fetched number */
+       seq->is_called = iscalled;
+       seq->log_cnt = 0;
+       MarkBufferDirty(buf);
+       /* XLOG stuff */
+       if (RelationNeedsWAL(seqrel))
+       {
+               xl_seq_rec      xlrec;
+               XLogRecPtr      recptr;
+               XLogBeginInsert();
+               XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
+               xlrec.node = seqrel->rd_node;
+               XLogRegisterData((char *) &xlrec, sizeof(xl_seq_rec));
+               XLogRegisterData((char *) seqdatatuple.t_data, seqdatatuple.t_len);
+               recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG);
  
 -              PageSetLSN(page, recptr);
 +              elm->cached = elm->last;
        }
  
 -      END_CRIT_SECTION();
 -
        UnlockReleaseBuffer(buf);
  
        relation_close(seqrel, NoLock);
@@@ -1157,11 -1132,7 +1350,11 @@@ init_sequence(Oid relid, SeqTable *p_el
                elm->filenode = InvalidOid;
                elm->lxid = InvalidLocalTransactionId;
                elm->last_valid = false;
-               elm->last = elm->cached = elm->increment = 0;
 +#ifdef XCP
 +              elm->last_call_time = 0;
 +              elm->range_multiplier = DEFAULT_CACHEVAL;
 +#endif
+               elm->last = elm->cached = 0;
        }
  
        /*
@@@ -1260,14 -1255,14 +1477,15 @@@ alter_sequence_get_lock_level(List *opt
   * otherwise, do not change existing options that aren't explicitly overridden.
   */
  static void
- #ifdef PGXC
- init_params(List *options, bool isInit,
-                       Form_pg_sequence new, List **owned_by, bool *is_restart)
- #else
- init_params(List *options, bool isInit,
-                       Form_pg_sequence new, List **owned_by)
- #endif
+ init_params(ParseState *pstate, List *options, bool for_identity,
+                       bool isInit,
+                       Form_pg_sequence seqform,
+                       bool *changed_seqform,
+                       Form_pg_sequence_data seqdataform,
 -                      List **owned_by)
++                      List **owned_by,
++                      bool *is_restart)
  {
+       DefElem    *as_type = NULL;
        DefElem    *start_value = NULL;
        DefElem    *restart_value = NULL;
        DefElem    *increment_by = NULL;
        DefElem    *cache_value = NULL;
        DefElem    *is_cycled = NULL;
        ListCell   *option;
+       bool            reset_max_value = false;
+       bool            reset_min_value = false;
  
 +#ifdef PGXC
 +      *is_restart = false;
 +#endif
 +
        *owned_by = NIL;
  
        foreach(option, options)
        if (restart_value != NULL)
        {
                if (restart_value->arg != NULL)
-                       new->last_value = defGetInt64(restart_value);
+                       seqdataform->last_value = defGetInt64(restart_value);
                else
-                       new->last_value = new->start_value;
+                       seqdataform->last_value = seqform->seqstart;
 +#ifdef PGXC
 +              *is_restart = true;
 +#endif
-               new->is_called = false;
-               new->log_cnt = 0;
++              seqdataform->last_value = seqform->seqstart;
+               seqdataform->is_called = false;
+               seqdataform->log_cnt = 0;
        }
        else if (isInit)
        {
                                         errmsg("CACHE (%s) must be greater than zero",
                                                        buf)));
                }
-               new->log_cnt = 0;
+               seqdataform->log_cnt = 0;
        }
        else if (isInit)
-               new->cache_value = 1;
+       {
+               seqform->seqcache = 1;
+               *changed_seqform = true;
+       }
  }
  
 +#ifdef PGXC
 +/*
 + * GetGlobalSeqName
 + *
 + * Returns a global sequence name adapted to GTM
 + * Name format is dbname.schemaname.seqname
 + * so as to identify in a unique way in the whole cluster each sequence
 + */
 +char *
 +GetGlobalSeqName(Relation seqrel, const char *new_seqname, const char *new_schemaname)
 +{
 +      char *seqname, *dbname, *relname;
 +      char namespace[NAMEDATALEN * 2];
 +      int charlen;
 +      bool is_temp = seqrel->rd_backend == MyBackendId;
 +      /* Get all the necessary relation names */
 +      dbname = get_database_name(seqrel->rd_node.dbNode);
 +
 +      if (new_seqname)
 +              relname = (char *) new_seqname;
 +      else
 +              relname = RelationGetRelationName(seqrel);
 +
 +      if (!is_temp)
 +      {
 +              /*
 +               * For a permanent sequence, use schema qualified name. That can
 +               * uniquely identify the sequences.
 +               */
 +              char *schema = get_namespace_name(RelationGetNamespace(seqrel));
 +              sprintf(namespace, "%s", new_schemaname ? new_schemaname : schema);
 +              pfree(schema);
 +      }
 +      else
 +      {
 +              /*
 +               * For temporary sequences, we use originating coordinator name and
 +               * originating coordinator PID to qualify the sequence name. If we are
 +               * running on the local coordinator, we can readily fetch that
 +               * information from PGXCNodeName and MyProcPid, but when running on
 +               * remote datanode, we must consult MyCoordName and MyProcPid to get
 +               * the correct information.
 +               */
 +              if (IS_PGXC_LOCAL_COORDINATOR)
 +                      sprintf(namespace, "%s.%d", PGXCNodeName, MyProcPid);
 +              else
 +                      sprintf(namespace, "%s.%d", MyCoordName, MyCoordPid);
 +      }
 +
 +      /* Calculate the global name size including the dots and \0 */
 +      charlen = strlen(dbname) + strlen(namespace) + strlen(relname) + 3;
 +      seqname = (char *) palloc(charlen);
 +
 +      /* Form a unique sequence name with schema and database name for GTM */
 +      snprintf(seqname,
 +                       charlen,
 +                       "%s.%s.%s",
 +                       dbname,
 +                       namespace,
 +                       relname);
 +
 +      if (dbname)
 +              pfree(dbname);
 +
 +      return seqname;
 +}
 +
 +/*
 + * IsTempSequence
 + *
 + * Determine if given sequence is temporary or not.
 + */
 +bool
 +IsTempSequence(Oid relid)
 +{
 +      Relation seqrel;
 +      bool res;
 +      SeqTable        elm;
 +
 +      /* open and AccessShareLock sequence */
 +      init_sequence(relid, &elm, &seqrel);
 +
 +      res = seqrel->rd_backend == MyBackendId;
 +      relation_close(seqrel, NoLock);
 +      return res;
 +}
 +#endif
 +
  /*
   * Process an OWNED BY option for CREATE/ALTER SEQUENCE
   *
index b48f6e529dcb433b1a800b8a41383f420583c9fc,7959120f53eb3a17210a55a124f2f301982f3eee..5be449648ce1107d4371398a30922746e537298c
@@@ -3,10 -3,8 +3,10 @@@
   * tablecmds.c
   *      Commands for creating and altering table structures and settings
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *
   * IDENTIFICATION
@@@ -3553,16 -3709,12 +3888,22 @@@ ATPrepCmd(List **wqueue, Relation rel, 
                        /* No command-specific prep needed */
                        pass = AT_PASS_MISC;
                        break;
 +#ifdef PGXC
 +              case AT_DistributeBy:
 +              case AT_SubCluster:
 +              case AT_AddNodeList:
 +              case AT_DeleteNodeList:
 +                      ATSimplePermissions(rel, ATT_TABLE);
 +                      /* No command-specific prep needed */
 +                      pass = AT_PASS_DISTRIB;
 +                      break;
 +#endif
+               case AT_AttachPartition:
+               case AT_DetachPartition:
+                       ATSimplePermissions(rel, ATT_TABLE);
+                       /* No command-specific prep needed */
+                       pass = AT_PASS_MISC;
+                       break;
                default:                                /* oops */
                        elog(ERROR, "unrecognized alter table type: %d",
                                 (int) cmd->subtype);
@@@ -3882,20 -4050,12 +4239,24 @@@ ATExecCmd(List **wqueue, AlteredTableIn
                case AT_GenericOptions:
                        ATExecGenericOptions(rel, (List *) cmd->def);
                        break;
- #ifdef PGXC
 +              case AT_DistributeBy:
 +                      AtExecDistributeBy(rel, (DistributeBy *) cmd->def);
 +                      break;
 +              case AT_SubCluster:
 +                      AtExecSubCluster(rel, (PGXCSubCluster *) cmd->def);
 +                      break;
 +              case AT_AddNodeList:
 +                      AtExecAddNode(rel, (List *) cmd->def);
 +                      break;
 +              case AT_DeleteNodeList:
 +                      AtExecDeleteNode(rel, (List *) cmd->def);
 +                      break;
- #endif
+               case AT_AttachPartition:
+                       ATExecAttachPartition(wqueue, rel, (PartitionCmd *) cmd->def);
+                       break;
+               case AT_DetachPartition:
+                       ATExecDetachPartition(rel, ((PartitionCmd *) cmd->def)->name);
+                       break;
                default:                                /* oops */
                        elog(ERROR, "unrecognized alter table type: %d",
                                 (int) cmd->subtype);
@@@ -3927,18 -4087,9 +4288,19 @@@ ATRewriteTables(AlterTableStmt *parsetr
        {
                AlteredTableInfo *tab = (AlteredTableInfo *) lfirst(ltab);
  
-               /* Foreign tables have no storage. */
-               if (tab->relkind == RELKIND_FOREIGN_TABLE)
 +#ifdef PGXC
 +              /* Forbid table rewrite operations with online data redistribution */
 +              if (tab->rewrite &&
 +                      list_length(tab->subcmds[AT_PASS_DISTRIB]) > 0 &&
 +                      IS_PGXC_LOCAL_COORDINATOR)
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
 +                                       errmsg("Incompatible operation with data redistribution")));
 +#endif
 +
+               /* Foreign tables have no storage, nor do partitioned tables. */
+               if (tab->relkind == RELKIND_FOREIGN_TABLE ||
+                       tab->relkind == RELKIND_PARTITIONED_TABLE)
                        continue;
  
                /*
@@@ -12448,13 -12800,7 +13502,13 @@@ PreCommit_on_commit_actions(void
                                 * relations, we can skip truncating ON COMMIT DELETE ROWS
                                 * tables, as they must still be empty.
                                 */
-                               if (MyXactAccessedTempRel)
 +#ifndef XCP
 +                              /*
 +                               * This optimization does not work in XL since temporary tables
 +                               * are handled differently in XL.
 +                               */
+                               if ((MyXactFlags & XACT_FLAGS_ACCESSEDTEMPREL))
 +#endif
                                        oids_to_truncate = lappend_oid(oids_to_truncate, oc->relid);
                                break;
                        case ONCOMMIT_DROP:
@@@ -12770,119 -13117,727 +13825,844 @@@ RangeVarCallbackForAlterRelation(const 
        ReleaseSysCache(tuple);
  }
  
 +#ifdef PGXC
 +/*
 + * IsTempTable
 + *
 + * Check if given table Oid is temporary.
 + */
 +bool
 +IsTempTable(Oid relid)
 +{
 +      Relation        rel;
 +      bool            res;
 +      /*
 +       * PGXCTODO: Is it correct to open without locks?
 +       * we just check if this table is temporary though...
 +       */
 +      rel = relation_open(relid, NoLock);
 +      res = rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP;
 +      relation_close(rel, NoLock);
 +      return res;
 +}
 +
 +bool
 +IsLocalTempTable(Oid relid)
 +{
 +      Relation        rel;
 +      bool            res;
 +      rel = relation_open(relid, NoLock);
 +      res = (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP &&
 +                      rel->rd_locator_info == NULL);
 +      relation_close(rel, NoLock);
 +      return res;
 +}
 +
 +/*
 + * IsIndexUsingTemp
 + *
 + * Check if given index relation uses temporary tables.
 + */
 +bool
 +IsIndexUsingTempTable(Oid relid)
 +{
 +      bool res = false;
 +      HeapTuple   tuple;
 +      Oid parent_id = InvalidOid;
 +
 +      tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(relid));
 +      if (HeapTupleIsValid(tuple))
 +      {
 +              Form_pg_index index = (Form_pg_index) GETSTRUCT(tuple);
 +              parent_id = index->indrelid;
 +
 +              /* Release system cache BEFORE looking at the parent table */
 +              ReleaseSysCache(tuple);
 +
 +              res = IsTempTable(parent_id);
 +      }
 +      else
 +              res = false; /* Default case */
 +
 +      return res;
 +}
 +
 +/*
 + * IsOnCommitActions
 + *
 + * Check if there are any on-commit actions activated.
 + */
 +bool
 +IsOnCommitActions(void)
 +{
 +      return list_length(on_commits) > 0;
 +}
 +
 +/*
 + * DropTableThrowErrorExternal
 + *
 + * Error interface for DROP when looking for execution node type.
 + */
 +void
 +DropTableThrowErrorExternal(RangeVar *relation, ObjectType removeType, bool missing_ok)
 +{
 +      char relkind;
 +
 +      /* Determine required relkind */
 +      switch (removeType)
 +      {
 +              case OBJECT_TABLE:
 +                      relkind = RELKIND_RELATION;
 +                      break;
 +
 +              case OBJECT_INDEX:
 +                      relkind = RELKIND_INDEX;
 +                      break;
 +
 +              case OBJECT_SEQUENCE:
 +                      relkind = RELKIND_SEQUENCE;
 +                      break;
 +
 +              case OBJECT_VIEW:
 +                      relkind = RELKIND_VIEW;
 +                      break;
 +
 +              case OBJECT_FOREIGN_TABLE:
 +                      relkind = RELKIND_FOREIGN_TABLE;
 +                      break;
 +
 +              default:
 +                      elog(ERROR, "unrecognized drop object type: %d",
 +                               (int) removeType);
 +                      relkind = 0;            /* keep compiler quiet */
 +                      break;
 +      }
 +
 +      DropErrorMsgNonExistent(relation, relkind, missing_ok);
 +}
 +#endif
++
+ /*
+  * Transform any expressions present in the partition key
+  *
+  * Returns a transformed PartitionSpec, as well as the strategy code
+  */
+ static PartitionSpec *
+ transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy)
+ {
+       PartitionSpec *newspec;
+       ParseState *pstate;
+       RangeTblEntry *rte;
+       ListCell   *l;
+       newspec = makeNode(PartitionSpec);
+       newspec->strategy = partspec->strategy;
+       newspec->partParams = NIL;
+       newspec->location = partspec->location;
+       /* Parse partitioning strategy name */
+       if (pg_strcasecmp(partspec->strategy, "list") == 0)
+               *strategy = PARTITION_STRATEGY_LIST;
+       else if (pg_strcasecmp(partspec->strategy, "range") == 0)
+               *strategy = PARTITION_STRATEGY_RANGE;
+       else
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("unrecognized partitioning strategy \"%s\"",
+                                               partspec->strategy)));
+       /* Check valid number of columns for strategy */
+       if (*strategy == PARTITION_STRATEGY_LIST &&
+               list_length(partspec->partParams) != 1)
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                                errmsg("cannot use \"list\" partition strategy with more than one column")));
+       /*
+        * Create a dummy ParseState and insert the target relation as its sole
+        * rangetable entry.  We need a ParseState for transformExpr.
+        */
+       pstate = make_parsestate(NULL);
+       rte = addRangeTableEntryForRelation(pstate, rel, NULL, false, true);
+       addRTEtoQuery(pstate, rte, true, true, true);
+       /* take care of any partition expressions */
+       foreach(l, partspec->partParams)
+       {
+               PartitionElem *pelem = castNode(PartitionElem, lfirst(l));
+               ListCell   *lc;
+               /* Check for PARTITION BY ... (foo, foo) */
+               foreach(lc, newspec->partParams)
+               {
+                       PartitionElem *pparam = castNode(PartitionElem, lfirst(lc));
+                       if (pelem->name && pparam->name &&
+                               strcmp(pelem->name, pparam->name) == 0)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_DUPLICATE_COLUMN),
+                                                errmsg("column \"%s\" appears more than once in partition key",
+                                                               pelem->name),
+                                                parser_errposition(pstate, pelem->location)));
+               }
+               if (pelem->expr)
+               {
+                       /* Copy, to avoid scribbling on the input */
+                       pelem = copyObject(pelem);
+                       /* Now do parse transformation of the expression */
+                       pelem->expr = transformExpr(pstate, pelem->expr,
+                                                                               EXPR_KIND_PARTITION_EXPRESSION);
+                       /* we have to fix its collations too */
+                       assign_expr_collations(pstate, pelem->expr);
+               }
+               newspec->partParams = lappend(newspec->partParams, pelem);
+       }
+       return newspec;
+ }
+ /*
+  * Compute per-partition-column information from a list of PartitionElems.
+  * Expressions in the PartitionElems must be parse-analyzed already.
+  *
+  * Outputs: for each of the list_length(partParams) partition key columns,
+  * fills in partattrs[], partopclass[] and partcollation[].  An expression
+  * column gets partattrs[] = 0 and its expression appended to *partexprs.
+  */
+ static void
+ ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs,
+                                         List **partexprs, Oid *partopclass, Oid *partcollation)
+ {
+       int                     attn;
+       ListCell   *lc;
+       /* Process the partition key columns in the order they were given */
+       attn = 0;
+       foreach(lc, partParams)
+       {
+               PartitionElem *pelem = castNode(PartitionElem, lfirst(lc));
+               Oid                     atttype;
+               Oid                     attcollation;
+               if (pelem->name != NULL)
+               {
+                       /* Simple attribute reference */
+                       HeapTuple       atttuple;
+                       Form_pg_attribute attform;
+                       atttuple = SearchSysCacheAttName(RelationGetRelid(rel),
+                                                                                        pelem->name);
+                       if (!HeapTupleIsValid(atttuple))
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_UNDEFINED_COLUMN),
+                               errmsg("column \"%s\" named in partition key does not exist",
+                                          pelem->name)));
+                       attform = (Form_pg_attribute) GETSTRUCT(atttuple);
+                       /* attnum <= 0 means a system column; those are disallowed */
+                       if (attform->attnum <= 0)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                                  errmsg("cannot use system column \"%s\" in partition key",
+                                                 pelem->name)));
+                       partattrs[attn] = attform->attnum;
+                       atttype = attform->atttypid;
+                       attcollation = attform->attcollation;
+                       ReleaseSysCache(atttuple);
+               }
+               else
+               {
+                       /* Expression */
+                       Node       *expr = pelem->expr;
+                       Assert(expr != NULL);
+                       atttype = exprType(expr);
+                       attcollation = exprCollation(expr);
+                       /*
+                        * Strip any top-level COLLATE clause.  This ensures that we treat
+                        * "x COLLATE y" and "(x COLLATE y)" alike.
+                        */
+                       while (IsA(expr, CollateExpr))
+                               expr = (Node *) ((CollateExpr *) expr)->arg;
+                       if (IsA(expr, Var) &&
+                               ((Var *) expr)->varattno > 0)
+                       {
+                               /*
+                                * User wrote "(column)" or "(column COLLATE something)".
+                                * Treat it like simple attribute anyway.
+                                */
+                               partattrs[attn] = ((Var *) expr)->varattno;
+                       }
+                       else
+                       {
+                               Bitmapset  *expr_attrs = NULL;
+                               int                     i;
+                               partattrs[attn] = 0;    /* marks the column as expression */
+                               *partexprs = lappend(*partexprs, expr);
+                               /*
+                                * Try to simplify the expression before checking for
+                                * mutability.  The main practical value of doing it in this
+                                * order is that an inline-able SQL-language function will be
+                                * accepted if its expansion is immutable, whether or not the
+                                * function itself is marked immutable.
+                                *
+                                * Note that expression_planner does not change the passed in
+                                * expression destructively and we have already saved the
+                                * expression to be stored into the catalog above.
+                                */
+                               expr = (Node *) expression_planner((Expr *) expr);
+                               /*
+                                * Partition expression cannot contain mutable functions,
+                                * because a given row must always map to the same partition
+                                * as long as there is no change in the partition boundary
+                                * structure.
+                                */
+                               if (contain_mutable_functions(expr))
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                                                        errmsg("functions in partition key expression must be marked IMMUTABLE")));
+                               /*
+                                * transformPartitionSpec() should have already rejected
+                                * subqueries, aggregates, window functions, and SRFs, based
+                                * on the EXPR_KIND_ for partition expressions.
+                                */
+                               /*
+                                * Cannot have expressions containing whole-row references or
+                                * system column references.
+                                */
+                               pull_varattnos(expr, 1, &expr_attrs);
+                               if (bms_is_member(0 - FirstLowInvalidHeapAttributeNumber,
+                                                                 expr_attrs))
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                                                        errmsg("partition key expressions cannot contain whole-row references")));
+                               /* Negative attnos (offset in the bitmapset) are system columns */
+                               for (i = FirstLowInvalidHeapAttributeNumber; i < 0; i++)
+                               {
+                                       if (bms_is_member(i - FirstLowInvalidHeapAttributeNumber,
+                                                                         expr_attrs))
+                                               ereport(ERROR,
+                                                               (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                                                                errmsg("partition key expressions cannot contain system column references")));
+                               }
+                               /*
+                                * While it is not exactly *wrong* for a partition expression
+                                * to be a constant, it seems better to reject such keys.
+                                */
+                               if (IsA(expr, Const))
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                                                        errmsg("cannot use constant expression as partition key")));
+                       }
+               }
+               /*
+                * Apply collation override if any
+                */
+               if (pelem->collation)
+                       attcollation = get_collation_oid(pelem->collation, false);
+               /*
+                * Check we have a collation iff it's a collatable type.  The only
+                * expected failures here are (1) COLLATE applied to a noncollatable
+                * type, or (2) partition expression had an unresolved collation. But
+                * we might as well code this to be a complete consistency check.
+                */
+               if (type_is_collatable(atttype))
+               {
+                       if (!OidIsValid(attcollation))
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INDETERMINATE_COLLATION),
+                                                errmsg("could not determine which collation to use for partition expression"),
+                                                errhint("Use the COLLATE clause to set the collation explicitly.")));
+               }
+               else
+               {
+                       if (OidIsValid(attcollation))
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_DATATYPE_MISMATCH),
+                                                errmsg("collations are not supported by type %s",
+                                                               format_type_be(atttype))));
+               }
+               partcollation[attn] = attcollation;
+               /*
+                * Identify a btree opclass to use. Currently, we use only btree
+                * operators, which seems enough for list and range partitioning.
+                */
+               if (!pelem->opclass)
+               {
+                       partopclass[attn] = GetDefaultOpClass(atttype, BTREE_AM_OID);
+                       if (!OidIsValid(partopclass[attn]))
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_UNDEFINED_OBJECT),
+                                  errmsg("data type %s has no default btree operator class",
+                                                 format_type_be(atttype)),
+                                                errhint("You must specify a btree operator class or define a default btree operator class for the data type.")));
+               }
+               else
+                       partopclass[attn] = ResolveOpClass(pelem->opclass,
+                                                                                          atttype,
+                                                                                          "btree",
+                                                                                          BTREE_AM_OID);
+               attn++;
+       }
+ }
+ /*
+  * ALTER TABLE <name> ATTACH PARTITION <partition-name> FOR VALUES
+  *
+  * Checks that the table being attached is compatible with the parent
+  * (ownership, not already a partition or part of an inheritance tree,
+  * matching persistence, OIDs and column set), creates the inheritance
+  * link, stores the partition bound, and schedules a validation scan of
+  * the new partition (or its leaf partitions) unless the partition
+  * constraint is provably implied by the table's existing constraints.
+  *
+  * Return the address of the newly attached partition.
+  */
+ static ObjectAddress
+ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd)
+ {
+       PartitionKey key = RelationGetPartitionKey(rel);
+       Relation        attachRel,
+                               catalog;
+       List       *childrels;
+       TupleConstr *attachRel_constr;
+       List       *partConstraint,
+                          *existConstraint;
+       SysScanDesc scan;
+       ScanKeyData skey;
+       AttrNumber      attno;
+       int                     natts;
+       TupleDesc       tupleDesc;
+       bool            skip_validate = false;
+       ObjectAddress address;
+       /* Take the strongest lock on the table being attached */
+       attachRel = heap_openrv(cmd->name, AccessExclusiveLock);
+       /*
+        * Must be owner of both parent and source table -- parent was checked by
+        * ATSimplePermissions call in ATPrepCmd
+        */
+       ATSimplePermissions(attachRel, ATT_TABLE | ATT_FOREIGN_TABLE);
+       /* A partition can only have one parent */
+       if (attachRel->rd_rel->relispartition)
+               ereport(ERROR,
+                               (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                                errmsg("\"%s\" is already a partition",
+                                               RelationGetRelationName(attachRel))));
+       if (OidIsValid(attachRel->rd_rel->reloftype))
+               ereport(ERROR,
+                               (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                                errmsg("cannot attach a typed table as partition")));
+       /*
+        * Table being attached should not already be part of inheritance; either
+        * as a child table...
+        */
+       catalog = heap_open(InheritsRelationId, AccessShareLock);
+       ScanKeyInit(&skey,
+                               Anum_pg_inherits_inhrelid,
+                               BTEqualStrategyNumber, F_OIDEQ,
+                               ObjectIdGetDatum(RelationGetRelid(attachRel)));
+       scan = systable_beginscan(catalog, InheritsRelidSeqnoIndexId, true,
+                                                         NULL, 1, &skey);
+       if (HeapTupleIsValid(systable_getnext(scan)))
+               ereport(ERROR,
+                               (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                                errmsg("cannot attach inheritance child as partition")));
+       systable_endscan(scan);
+       /* ...or as a parent table (except the case when it is partitioned) */
+       ScanKeyInit(&skey,
+                               Anum_pg_inherits_inhparent,
+                               BTEqualStrategyNumber, F_OIDEQ,
+                               ObjectIdGetDatum(RelationGetRelid(attachRel)));
+       scan = systable_beginscan(catalog, InheritsParentIndexId, true, NULL,
+                                                         1, &skey);
+       if (HeapTupleIsValid(systable_getnext(scan)) &&
+               attachRel->rd_rel->relkind == RELKIND_RELATION)
+               ereport(ERROR,
+                               (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                                errmsg("cannot attach inheritance parent as partition")));
+       systable_endscan(scan);
+       heap_close(catalog, AccessShareLock);
+       /*
+        * Prevent circularity by seeing if rel is a partition of attachRel. (In
+        * particular, this disallows making a rel a partition of itself.)
+        */
+       childrels = find_all_inheritors(RelationGetRelid(attachRel),
+                                                                       AccessShareLock, NULL);
+       if (list_member_oid(childrels, RelationGetRelid(rel)))
+               ereport(ERROR,
+                               (errcode(ERRCODE_DUPLICATE_TABLE),
+                                errmsg("circular inheritance not allowed"),
+                                errdetail("\"%s\" is already a child of \"%s\".",
+                                                  RelationGetRelationName(rel),
+                                                  RelationGetRelationName(attachRel))));
+       /* Temp parent cannot have a partition that is itself not a temp */
+       if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP &&
+               attachRel->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
+               ereport(ERROR,
+                               (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                                errmsg("cannot attach a permanent relation as partition of temporary relation \"%s\"",
+                                               RelationGetRelationName(rel))));
+       /* If the parent is temp, it must belong to this session */
+       if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP &&
+               !rel->rd_islocaltemp)
+               ereport(ERROR,
+                               (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                                errmsg("cannot attach as partition of temporary relation of another session")));
+       /* Ditto for the partition */
+       if (attachRel->rd_rel->relpersistence == RELPERSISTENCE_TEMP &&
+               !attachRel->rd_islocaltemp)
+               ereport(ERROR,
+                               (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                                errmsg("cannot attach temporary relation of another session as partition")));
+       /* If parent has OIDs then child must have OIDs */
+       if (rel->rd_rel->relhasoids && !attachRel->rd_rel->relhasoids)
+               ereport(ERROR,
+                               (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                        errmsg("cannot attach table \"%s\" without OIDs as partition of"
+                          " table \"%s\" with OIDs", RelationGetRelationName(attachRel),
+                                       RelationGetRelationName(rel))));
+       /* OTOH, if parent doesn't have them, do not allow in attachRel either */
+       if (attachRel->rd_rel->relhasoids && !rel->rd_rel->relhasoids)
+               ereport(ERROR,
+                               (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                 errmsg("cannot attach table \"%s\" with OIDs as partition of table"
+                                " \"%s\" without OIDs", RelationGetRelationName(attachRel),
+                                RelationGetRelationName(rel))));
+       /* Check if there are any columns in attachRel that aren't in the parent */
+       tupleDesc = RelationGetDescr(attachRel);
+       natts = tupleDesc->natts;
+       for (attno = 1; attno <= natts; attno++)
+       {
+               Form_pg_attribute attribute = tupleDesc->attrs[attno - 1];
+               char       *attributeName = NameStr(attribute->attname);
+               /* Ignore dropped */
+               if (attribute->attisdropped)
+                       continue;
+               /* Try to find the column in parent (matching on column name) */
+               if (!SearchSysCacheExists2(ATTNAME,
+                                                                  ObjectIdGetDatum(RelationGetRelid(rel)),
+                                                                  CStringGetDatum(attributeName)))
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_DATATYPE_MISMATCH),
+                                        errmsg("table \"%s\" contains column \"%s\" not found in parent \"%s\"",
+                                                       RelationGetRelationName(attachRel), attributeName,
+                                                       RelationGetRelationName(rel)),
+                                        errdetail("New partition should contain only the columns present in parent.")));
+       }
+       /* OK to create inheritance.  Rest of the checks performed there */
+       CreateInheritance(attachRel, rel);
+       /*
+        * Check that the new partition's bound is valid and does not overlap any
+        * of existing partitions of the parent - note that it does not return on
+        * error.
+        */
+       check_new_partition_bound(RelationGetRelationName(attachRel), rel,
+                                                         cmd->bound);
+       /* Update the pg_class entry. */
+       StorePartitionBound(attachRel, rel, cmd->bound);
+       /*
+        * Generate partition constraint from the partition bound specification.
+        * If the parent itself is a partition, make sure to include its
+        * constraint as well.
+        */
+       partConstraint = list_concat(get_qual_from_partbound(attachRel, rel,
+                                                                                                                cmd->bound),
+                                                                RelationGetPartitionQual(rel));
+       partConstraint = (List *) eval_const_expressions(NULL,
+                                                                                                        (Node *) partConstraint);
+       partConstraint = (List *) canonicalize_qual((Expr *) partConstraint);
+       partConstraint = list_make1(make_ands_explicit(partConstraint));
+       /*
+        * Check if we can do away with having to scan the table being attached to
+        * validate the partition constraint, by *proving* that the existing
+        * constraints of the table *imply* the partition predicate.  We include
+        * the table's check constraints and NOT NULL constraints in the list of
+        * clauses passed to predicate_implied_by().
+        *
+        * There is a case in which we cannot rely on just the result of the
+        * proof.
+        */
+       attachRel_constr = tupleDesc->constr;
+       existConstraint = NIL;
+       if (attachRel_constr != NULL)
+       {
+               int                     num_check = attachRel_constr->num_check;
+               int                     i;
+               Bitmapset  *not_null_attrs = NULL;
+               List       *part_constr;
+               ListCell   *lc;
+               bool            partition_accepts_null = true;
+               int                     partnatts;
+               if (attachRel_constr->has_not_null)
+               {
+                       int                     natts = attachRel->rd_att->natts;
+                       /* Turn each NOT NULL column into an explicit IS NOT NULL clause */
+                       for (i = 1; i <= natts; i++)
+                       {
+                               Form_pg_attribute att = attachRel->rd_att->attrs[i - 1];
+                               if (att->attnotnull && !att->attisdropped)
+                               {
+                                       NullTest   *ntest = makeNode(NullTest);
+                                       ntest->arg = (Expr *) makeVar(1,
+                                                                                                 i,
+                                                                                                 att->atttypid,
+                                                                                                 att->atttypmod,
+                                                                                                 att->attcollation,
+                                                                                                 0);
+                                       ntest->nulltesttype = IS_NOT_NULL;
+                                       /*
+                                        * argisrow=false is correct even for a composite column,
+                                        * because attnotnull does not represent a SQL-spec IS NOT
+                                        * NULL test in such a case, just IS DISTINCT FROM NULL.
+                                        */
+                                       ntest->argisrow = false;
+                                       ntest->location = -1;
+                                       existConstraint = lappend(existConstraint, ntest);
+                                       not_null_attrs = bms_add_member(not_null_attrs, i);
+                               }
+                       }
+               }
+               for (i = 0; i < num_check; i++)
+               {
+                       Node       *cexpr;
+                       /*
+                        * If this constraint hasn't been fully validated yet, we must
+                        * ignore it here.
+                        */
+                       if (!attachRel_constr->check[i].ccvalid)
+                               continue;
+                       cexpr = stringToNode(attachRel_constr->check[i].ccbin);
+                       /*
+                        * Run each expression through const-simplification and
+                        * canonicalization.  It is necessary, because we will be
+                        * comparing it to similarly-processed qual clauses, and may fail
+                        * to detect valid matches without this.
+                        */
+                       cexpr = eval_const_expressions(NULL, cexpr);
+                       cexpr = (Node *) canonicalize_qual((Expr *) cexpr);
+                       existConstraint = list_concat(existConstraint,
+                                                                                 make_ands_implicit((Expr *) cexpr));
+               }
+               existConstraint = list_make1(make_ands_explicit(existConstraint));
+               /* And away we go ... */
+               if (predicate_implied_by(partConstraint, existConstraint))
+                       skip_validate = true;
+               /*
+                * We choose to err on the safer side, i.e., give up on skipping the
+                * validation scan, if the partition key column doesn't have the NOT
+                * NULL constraint and the table is to become a list partition that
+                * does not accept nulls.  In this case, the partition predicate
+                * (partConstraint) does include a 'key IS NOT NULL' expression;
+                * however, because of the way predicate_implied_by_simple_clause() is
+                * designed to handle IS NOT NULL predicates in the absence of an IS
+                * NOT NULL clause, we cannot rely on just the above proof.
+                *
+                * That is not an issue in case of a range partition, because if there
+                * were no NOT NULL constraint defined on the key columns, an error
+                * would be thrown before we get here anyway.  That is not true,
+                * however, if any of the partition keys is an expression, which is
+                * handled below.
+                */
+               part_constr = linitial(partConstraint);
+               part_constr = make_ands_implicit((Expr *) part_constr);
+               /*
+                * part_constr contains an IS NOT NULL expression, if this is a list
+                * partition that does not accept nulls (in fact, also if this is a
+                * range partition and some partition key is an expression, but we
+                * never skip validation in that case anyway; see below)
+                */
+               foreach(lc, part_constr)
+               {
+                       Node       *expr = lfirst(lc);
+                       if (IsA(expr, NullTest) &&
+                               ((NullTest *) expr)->nulltesttype == IS_NOT_NULL)
+                       {
+                               partition_accepts_null = false;
+                               break;
+                       }
+               }
+               partnatts = get_partition_natts(key);
+               for (i = 0; i < partnatts; i++)
+               {
+                       AttrNumber      partattno;
+                       partattno = get_partition_col_attnum(key, i);
+                       /* If partition key is an expression, must not skip validation */
+                       if (!partition_accepts_null &&
+                               (partattno == 0 ||
+                                !bms_is_member(partattno, not_null_attrs)))
+                               skip_validate = false;
+               }
+       }
+       /* It's safe to skip the validation scan after all */
+       if (skip_validate)
+               ereport(INFO,
+                               (errmsg("partition constraint for table \"%s\" is implied by existing constraints",
+                                               RelationGetRelationName(attachRel))));
+       /*
+        * Set up to have the table be scanned to validate the partition
+        * constraint (see partConstraint above).  If it's a partitioned table, we
+        * instead schedule its leaf partitions to be scanned.
+        */
+       if (!skip_validate)
+       {
+               List       *all_parts;
+               ListCell   *lc;
+               /* Take an exclusive lock on the partitions to be checked */
+               if (attachRel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+                       all_parts = find_all_inheritors(RelationGetRelid(attachRel),
+                                                                                       AccessExclusiveLock, NULL);
+               else
+                       all_parts = list_make1_oid(RelationGetRelid(attachRel));
+               foreach(lc, all_parts)
+               {
+                       AlteredTableInfo *tab;
+                       Oid                     part_relid = lfirst_oid(lc);
+                       Relation        part_rel;
+                       Expr       *constr;
+                       /* Lock already taken */
+                       if (part_relid != RelationGetRelid(attachRel))
+                               part_rel = heap_open(part_relid, NoLock);
+                       else
+                               part_rel = attachRel;
+                       /*
+                        * Skip if it's a partitioned table.  Only RELKIND_RELATION
+                        * relations (ie, leaf partitions) need to be scanned.
+                        */
+                       if (part_rel != attachRel &&
+                               part_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+                       {
+                               heap_close(part_rel, NoLock);
+                               continue;
+                       }
+                       /* Grab a work queue entry */
+                       tab = ATGetQueueEntry(wqueue, part_rel);
+                       /* Adjust constraint to match this partition */
+                       constr = linitial(partConstraint);
+                       tab->partition_constraint = (Expr *)
+                               map_partition_varattnos((List *) constr, 1,
+                                                                               part_rel, rel);
+                       /* keep our lock until commit */
+                       if (part_rel != attachRel)
+                               heap_close(part_rel, NoLock);
+               }
+       }
+       ObjectAddressSet(address, RelationRelationId, RelationGetRelid(attachRel));
+       /* keep our lock until commit */
+       heap_close(attachRel, NoLock);
+       return address;
+ }
+ /*
+  * ALTER TABLE DETACH PARTITION
+  *
+  * Removes the inheritance link and clears relpartbound/relispartition in
+  * the detached partition's pg_class entry, then invalidates the parent's
+  * relcache entry.
+  *
+  * Return the address of the relation that is no longer a partition of rel.
+  */
+ static ObjectAddress
+ ATExecDetachPartition(Relation rel, RangeVar *name)
+ {
+       Relation        partRel,
+                               classRel;
+       HeapTuple       tuple,
+                               newtuple;
+       Datum           new_val[Natts_pg_class];
+       bool            isnull,
+                               new_null[Natts_pg_class],
+                               new_repl[Natts_pg_class];
+       ObjectAddress address;
+       /*
+        * NOTE(review): only AccessShareLock is taken on the partition being
+        * detached, even though its pg_class row is updated below -- confirm
+        * this lock level is intended.
+        */
+       partRel = heap_openrv(name, AccessShareLock);
+       /* All inheritance related checks are performed within the function */
+       RemoveInheritance(partRel, rel);
+       /* Update pg_class tuple */
+       classRel = heap_open(RelationRelationId, RowExclusiveLock);
+       tuple = SearchSysCacheCopy1(RELOID,
+                                                               ObjectIdGetDatum(RelationGetRelid(partRel)));
+       Assert(((Form_pg_class) GETSTRUCT(tuple))->relispartition);
+       /* relpartbound must be set on a relation marked as a partition */
+       (void) SysCacheGetAttr(RELOID, tuple, Anum_pg_class_relpartbound,
+                                                  &isnull);
+       Assert(!isnull);
+       /* Clear relpartbound and reset relispartition */
+       memset(new_val, 0, sizeof(new_val));
+       memset(new_null, false, sizeof(new_null));
+       memset(new_repl, false, sizeof(new_repl));
+       new_val[Anum_pg_class_relpartbound - 1] = (Datum) 0;
+       new_null[Anum_pg_class_relpartbound - 1] = true;
+       new_repl[Anum_pg_class_relpartbound - 1] = true;
+       newtuple = heap_modify_tuple(tuple, RelationGetDescr(classRel),
+                                                                new_val, new_null, new_repl);
+       ((Form_pg_class) GETSTRUCT(newtuple))->relispartition = false;
+       CatalogTupleUpdate(classRel, &newtuple->t_self, newtuple);
+       heap_freetuple(newtuple);
+       heap_close(classRel, RowExclusiveLock);
+       /*
+        * Invalidate the parent's relcache so that the partition is no longer
+        * included in its partition descriptor.
+        */
+       CacheInvalidateRelcache(rel);
+       ObjectAddressSet(address, RelationRelationId, RelationGetRelid(partRel));
+       /* keep our lock until commit */
+       heap_close(partRel, NoLock);
+       return address;
+ }
index b43d61075a304b2da919b126cca58a5ca0f4a972,f9c26201d982d758b548f8ff7e15930e230d93bc..dc4d3ab02de69bbd7dd1097f6158304fdf19c2f1
  #include "utils/memutils.h"
  #include "utils/rel.h"
  #include "utils/tqual.h"
 +#ifdef PGXC
 +#include "pgxc/execRemote.h"
 +#include "pgxc/nodemgr.h"
 +#include "pgxc/pgxc.h"
 +#endif
+ #include "utils/varlena.h"
  
  
  /* GUC variables */
index 3dec3365567ed4d46543e17ee0dab56f9e2fbbf9,0271788bf9908c66041daf97a14a0e3baf5660a5..619d422e62797838ca704cd41f0f5d5602ae0e8f
@@@ -3,8 -3,7 +3,8 @@@
   * trigger.c
   *      PostgreSQL TRIGGERs support code.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
@@@ -1068,15 -1189,18 +1196,20 @@@ ConvertTriggerToFK(CreateTrigStmt *stmt
                fkcon->skip_validation = false;
                fkcon->initially_valid = true;
  
+               /* finally, wrap it in a dummy PlannedStmt */
+               wrapper->commandType = CMD_UTILITY;
+               wrapper->canSetTag = false;
+               wrapper->utilityStmt = (Node *) atstmt;
+               wrapper->stmt_location = -1;
+               wrapper->stmt_len = -1;
                /* ... and execute it */
-               ProcessUtility((Node *) atstmt,
+               ProcessUtility(wrapper,
                                           "(generated ALTER TABLE ADD FOREIGN KEY command)",
-                                          PROCESS_UTILITY_SUBCOMMAND, NULL,
+                                          PROCESS_UTILITY_SUBCOMMAND, NULL, NULL,
 -                                         None_Receiver, NULL);
 +                                         None_Receiver,
- #ifdef PGXC
 +                                         false,
- #endif /* PGXC */
 +                                         NULL);
  
                /* Remove the matched item from the list */
                info_list = list_delete_ptr(info_list, info);
index 4181dfd167cb2fed45a88eb48a221ef461f68471,9fbb0eb4eb8e070943190d2bc8e003be47f8da50..24edf48b68ac61e852b93eb884dfc560aefe04de
@@@ -9,10 -9,8 +9,10 @@@
   * in cluster.c.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *
   * IDENTIFICATION
index aafa7485957f788cf4555061a8799067e75fedfb,d75bddd87b26cad79ece963350cc2738e190d964..ed3b2484ae53c55ec153df7ab498eebc633ca60a
@@@ -4,8 -4,7 +4,8 @@@
   *            Routines for handling specialized SET variables.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
index a809a203e9d0540aba18f8c4d55830d07187230b,a5d6574eaf37a5c370209821e95998c662c5a868..2ca5b5cfd2ef9c16fbe728670a456e92bbf6e1f2
@@@ -3,8 -3,7 +3,8 @@@
   * view.c
   *      use rewrite rules to construct views
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
index 6625d56b9715b5de4445edd0792bcc2b14f27a04,083b20f3fee0cb957f59d754e5128e153952a58b..fef60fb4c968ecc387e444de46f8db13025b5d54
@@@ -12,19 -12,23 +12,23 @@@ subdir = src/backend/executo
  top_builddir = ../../..
  include $(top_builddir)/src/Makefile.global
  
- OBJS = execAmi.o execCurrent.o execGrouping.o execIndexing.o execJunk.o \
-        execMain.o execParallel.o execProcnode.o execQual.o \
-        execScan.o execTuples.o \
+ OBJS = execAmi.o execCurrent.o execExpr.o execExprInterp.o \
+        execGrouping.o execIndexing.o execJunk.o \
+        execMain.o execParallel.o execProcnode.o \
+        execReplication.o execScan.o execSRF.o execTuples.o \
         execUtils.o functions.o instrument.o nodeAppend.o nodeAgg.o \
         nodeBitmapAnd.o nodeBitmapOr.o \
-        nodeBitmapHeapscan.o nodeBitmapIndexscan.o nodeCustom.o nodeGather.o \
+        nodeBitmapHeapscan.o nodeBitmapIndexscan.o \
+        nodeCustom.o nodeFunctionscan.o nodeGather.o \
         nodeHash.o nodeHashjoin.o nodeIndexscan.o nodeIndexonlyscan.o \
-        nodeLimit.o nodeLockRows.o \
+        nodeLimit.o nodeLockRows.o nodeGatherMerge.o \
         nodeMaterial.o nodeMergeAppend.o nodeMergejoin.o nodeModifyTable.o \
-        nodeNestloop.o nodeFunctionscan.o nodeRecursiveunion.o nodeResult.o \
+        nodeNestloop.o nodeProjectSet.o nodeRecursiveunion.o nodeResult.o \
         nodeSamplescan.o nodeSeqscan.o nodeSetOp.o nodeSort.o nodeUnique.o \
-        nodeValuesscan.o nodeCtescan.o nodeWorktablescan.o \
+        nodeValuesscan.o \
+        nodeCtescan.o nodeNamedtuplestorescan.o nodeWorktablescan.o \
         nodeGroup.o nodeSubplan.o nodeSubqueryscan.o nodeTidscan.o \
-        nodeForeignscan.o nodeWindowAgg.o producerReceiver.o tstoreReceiver.o tqueue.o spi.o
 -       nodeForeignscan.o nodeWindowAgg.o tstoreReceiver.o tqueue.o spi.o \
++       nodeForeignscan.o nodeWindowAgg.o producerReceiver.o tstoreReceiver.o tqueue.o spi.o \
+        nodeTableFuncscan.o
  
  include $(top_srcdir)/src/backend/common.mk
index 2cb83d75a7e71095dd9c4e8b4c43395ed9f8ed1c,7337d21d7d2f5b5980d269e69d9c3123356c87f4..b802ad6956d0dc3d74797872a1e96643315b2898
@@@ -3,8 -3,7 +3,8 @@@
   * execAmi.c
   *      miscellaneous executor access method routines
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *    src/backend/executor/execAmi.c
  #include "nodes/relation.h"
  #include "utils/rel.h"
  #include "utils/syscache.h"
 +#ifdef PGXC
 +#include "pgxc/execRemote.h"
 +#endif
  
  
- static bool TargetListSupportsBackwardScan(List *targetlist);
  static bool IndexSupportsBackwardScan(Oid indexid);
  
  
index 757ea8dddc9183eb7c27ce613466720ccb7a53d1,3af4a90b515e262193bf4cc0da93ddef33e4fb54..0224b9e4af11c434f5969d3edcd6551e1082c2b5
@@@ -3,8 -3,7 +3,8 @@@
   * execCurrent.c
   *      executor support for WHERE CURRENT OF cursor
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *    src/backend/executor/execCurrent.c
index 2eaa33455c9c5952e54f499ac14d8d62b45d5347,4a899f1eb567c74d2e8b73b2912b9b31b3c154d7..7232b0911f037a7d4dbbdc3f730f7b399b6734a1
@@@ -26,8 -26,7 +26,8 @@@
   *    before ExecutorEnd.  This can be omitted only in case of EXPLAIN,
   *    which should also omit ExecutorRun.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
  #include "utils/lsyscache.h"
  #include "utils/memutils.h"
  #include "utils/rls.h"
+ #include "utils/ruleutils.h"
  #include "utils/snapmgr.h"
  #include "utils/tqual.h"
 -
 +#ifdef PGXC
 +#include "pgxc/pgxc.h"
 +#include "commands/copy.h"
 +#endif
 +#ifdef XCP
 +#include "access/gtm.h"
 +#include "pgxc/execRemote.h"
 +#include "pgxc/poolmgr.h"
 +#endif
  
  /* Hooks for plugins to get control in ExecutorStart/Run/Finish/End */
  ExecutorStart_hook_type ExecutorStart_hook = NULL;
@@@ -199,41 -193,16 +208,48 @@@ standard_ExecutorStart(QueryDesc *query
        estate->es_param_list_info = queryDesc->params;
  
        if (queryDesc->plannedstmt->nParamExec > 0)
 +#ifdef XCP
 +      {
 +              estate->es_param_exec_vals = (ParamExecData *)
 +                      palloc0(queryDesc->plannedstmt->nParamExec * sizeof(ParamExecData));
 +              if (queryDesc->plannedstmt->nParamRemote > 0)
 +              {
 +                      ParamListInfo extparams = estate->es_param_list_info;
 +                      int i = queryDesc->plannedstmt->nParamRemote;
 +                      while (--i >= 0 &&
 +                              queryDesc->plannedstmt->remoteparams[i].paramkind == PARAM_EXEC)
 +                      {
 +                              int paramno = queryDesc->plannedstmt->remoteparams[i].paramid;
 +                              ParamExecData *prmdata;
 +
 +                              Assert(paramno >= 0 &&
 +                                         paramno < queryDesc->plannedstmt->nParamExec);
 +                              prmdata = &(estate->es_param_exec_vals[paramno]);
 +                              prmdata->value = extparams->params[i].value;
 +                              prmdata->isnull = extparams->params[i].isnull;
 +                              prmdata->ptype = extparams->params[i].ptype;
 +                              prmdata->done = true;
 +                      }
 +                      /*
 +                       * Truncate exec parameters from the list of received parameters
 +                       * to avoid sending down duplicates if there are multiple levels
 +                       * of RemoteSubplan statements
 +                       */
 +                      extparams->numParams = i + 1;
 +              }
 +      }
 +#else
                estate->es_param_exec_vals = (ParamExecData *)
                        palloc0(queryDesc->plannedstmt->nParamExec * sizeof(ParamExecData));
 +#endif
  
+       estate->es_sourceText = queryDesc->sourceText;
+       /*
+        * Fill in the query environment, if any, from queryDesc.
+        */
+       estate->es_queryEnv = queryDesc->queryEnv;
        /*
         * If non-read-only query, set the command ID to mark output tuples with
         */
index fa7bdfc92363e6ea4f644ec7ef9a0909e4b71c85,5469cde1e00c25f7e90c1322d602009bda6b3780..f2d9ccb1309763cd3ba6a3fc6273a0e390b6b77d
@@@ -7,8 -7,7 +7,8 @@@
   *     ExecProcNode, or ExecEndNode on its subnodes and do the appropriate
   *     processing.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
index 63375dc82583ed199de698dbd91bfa0e50908d65,c4a955332f7c3d2b267287847b4b36fac7fa0add..489ca5edb97a1b4b39665b2fc27695ba3baee1e8
@@@ -12,8 -12,7 +12,8 @@@
   *      This information is needed by routines manipulating tuples
   *      (getattribute, formtuple, etc.).
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
index 4bdf76cf5620b6630ef2e0123979ef7d87fa721c,cb2596cb317e293cb525c7a4f0d5fb7c0f87b2e1..b1178552e526846b69b77ab7979563ea7f32f5d6
@@@ -3,8 -3,7 +3,8 @@@
   * execUtils.c
   *      miscellaneous executor utility routines
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
index 06b4a57656d4b1a1a27e0cfbde8a721d03d6d039,a35ba32e6dd973fcaa61f79563f2b19b9fb6711e..3f40fa65ef50241cd1b08c5e81e904645185e598
@@@ -70,9 -66,8 +70,9 @@@ typedef struct execution_stat
        ExecStatus      status;
        bool            setsResult;             /* true if this query produces func's result */
        bool            lazyEval;               /* true if should fetch one row at a time */
-       Node       *stmt;                       /* PlannedStmt or utility statement */
+       PlannedStmt *stmt;                      /* plan for this query */
        QueryDesc  *qd;                         /* null unless status == RUN */
 +      char            *src;                   /* source query resulting in this state */
  } execution_state;
  
  
@@@ -482,15 -475,14 +482,15 @@@ init_execution_state(List *queryTree_li
  {
        List       *eslist = NIL;
        execution_state *lasttages = NULL;
 -      ListCell   *lc1;
 +      ListCell   *lc1, *lc3;
  
 -      foreach(lc1, queryTree_list)
 +      forboth(lc1, queryTree_list, lc3, querySource_list)
        {
-               List       *qtlist = (List *) lfirst(lc1);
+               List       *qtlist = lfirst_node(List, lc1);
 +              char       *querysource = (char *) lfirst(lc3);
                execution_state *firstes = NULL;
                execution_state *preves = NULL;
 -              ListCell   *lc2;
 +              ListCell   *lc2, *lc4;
  
                foreach(lc2, qtlist)
                {
                                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                /* translator: %s is a SQL statement name */
                                           errmsg("%s is not allowed in a non-volatile function",
-                                                         CreateCommandTag(stmt))));
+                                                         CreateCommandTag((Node *) stmt))));
  
 +#ifdef PGXC
 +                      if (IS_PGXC_LOCAL_COORDINATOR)
 +                      {
 +                              if (queryTree->commandType != CMD_UTILITY)
 +                              {
 +                                      /*
 +                                      * The parameterised queries in RemoteQuery nodes will be prepared
 +                                      * on the Datanode, and need parameter types for the same. Set the
 +                                      * parameter types and their number in all RemoteQuery nodes in the
 +                                      * plan
 +                                      */
 +                                      SetRemoteStatementName(((PlannedStmt *)stmt)->planTree, NULL,
 +                                                                                      fcache->pinfo->nargs,
 +                                                                                      fcache->pinfo->argtypes, 0);
 +                              }
 +                      }
 +#endif /* PGXC */
 +
                        if (IsInParallelMode() && !CommandIsReadOnly(stmt))
-                               PreventCommandIfParallelMode(CreateCommandTag(stmt));
+                               PreventCommandIfParallelMode(CreateCommandTag((Node *) stmt));
  
                        /* OK, build the execution_state for this query */
                        newes = (execution_state *) palloc(sizeof(execution_state));
@@@ -723,16 -705,16 +733,17 @@@ init_sql_fcache(FmgrInfo *finfo, Oid co
  
        queryTree_list = NIL;
        flat_query_list = NIL;
 -      foreach(lc, raw_parsetree_list)
 +      forboth(lc, raw_parsetree_list, lc2, querysource_list)
        {
-               Node       *parsetree = (Node *) lfirst(lc);
+               RawStmt    *parsetree = lfirst_node(RawStmt, lc);
 +              char       *querysource = (char *) lfirst(lc2);
                List       *queryTree_sublist;
  
                queryTree_sublist = pg_analyze_and_rewrite_params(parsetree,
 -                                                                                                                fcache->src,
 +                                                                                                                querysource,
                                                                           (ParserSetupHook) sql_fn_parser_setup,
-                                                                                                                 fcache->pinfo);
+                                                                                                                 fcache->pinfo,
+                                                                                                                 NULL);
                queryTree_list = lappend(queryTree_list, queryTree_sublist);
                flat_query_list = list_concat(flat_query_list,
                                                                          list_copy(queryTree_sublist));
@@@ -824,22 -805,17 +835,17 @@@ postquel_start(execution_state *es, SQL
        else
                dest = None_Receiver;
  
-       if (IsA(es->stmt, PlannedStmt))
-               es->qd = CreateQueryDesc((PlannedStmt *) es->stmt,
-                                                                es->src,
-                                                                GetActiveSnapshot(),
-                                                                InvalidSnapshot,
-                                                                dest,
-                                                                fcache->paramLI, 0);
-       else
-               es->qd = CreateUtilityQueryDesc(es->stmt,
-                                                                               es->src,
-                                                                               GetActiveSnapshot(),
-                                                                               dest,
-                                                                               fcache->paramLI);
+       es->qd = CreateQueryDesc(es->stmt,
 -                                                       fcache->src,
++                                                       es->src,
+                                                        GetActiveSnapshot(),
+                                                        InvalidSnapshot,
+                                                        dest,
+                                                        fcache->paramLI,
+                                                        es->qd ? es->qd->queryEnv : NULL,
+                                                        0);
  
        /* Utility commands don't need Executor. */
-       if (es->qd->utilitystmt == NULL)
+       if (es->qd->operation != CMD_UTILITY)
        {
                /*
                 * In lazyEval mode, do not let the executor set up an AfterTrigger
@@@ -867,19 -843,14 +873,17 @@@ postquel_getnext(execution_state *es, S
  {
        bool            result;
  
-       if (es->qd->utilitystmt)
+       if (es->qd->operation == CMD_UTILITY)
        {
-               /* ProcessUtility needs the PlannedStmt for DECLARE CURSOR */
-               ProcessUtility((es->qd->plannedstmt ?
-                                               (Node *) es->qd->plannedstmt :
-                                               es->qd->utilitystmt),
+               ProcessUtility(es->qd->plannedstmt,
 -                                         fcache->src,
 +                                         es->src,
                                           PROCESS_UTILITY_QUERY,
                                           es->qd->params,
+                                          es->qd->queryEnv,
                                           es->qd->dest,
 +#ifdef PGXC
 +                                         false,
 +#endif /* PGXC */
                                           NULL);
                result = true;                  /* never stops early */
        }
index 8154522de410225979a69754d5bfed462d413ac2,7eeda95af752b992de48733b80faff09590cece4..1b94a664846a5f785478f024a853da94e0050337
   *      sensitive to the grouping set for which the aggregate function is
   *      currently being called.
   *
-  *      TODO: AGG_HASHED doesn't support multiple grouping sets yet.
+  *      Plan structure:
+  *
+  *      What we get from the planner is actually one "real" Agg node which is
+  *      part of the plan tree proper, but which optionally has an additional list
+  *      of Agg nodes hung off the side via the "chain" field.  This is because an
+  *      Agg node happens to be a convenient representation of all the data we
+  *      need for grouping sets.
+  *
+  *      For many purposes, we treat the "real" node as if it were just the first
+  *      node in the chain.  The chain must be ordered such that hashed entries
+  *      come before sorted/plain entries; the real node is marked AGG_MIXED if
+  *      there are both types present (in which case the real node describes one
+  *      of the hashed groupings, other AGG_HASHED nodes may optionally follow in
+  *      the chain, followed in turn by AGG_SORTED or (one) AGG_PLAIN node).  If
+  *      the real node is marked AGG_HASHED or AGG_SORTED, then all the chained
+  *      nodes must be of the same type; if it is AGG_PLAIN, there can be no
+  *      chained nodes.
+  *
+  *      We collect all hashed nodes into a single "phase", numbered 0, and create
+  *      a sorted phase (numbered 1..n) for each AGG_SORTED or AGG_PLAIN node.
+  *      Phase 0 is allocated even if there are no hashes, but remains unused in
+  *      that case.
+  *
+  *      AGG_HASHED nodes actually refer to only a single grouping set each,
+  *      because for each hashed grouping we need a separate grpColIdx and
+  *      numGroups estimate.  AGG_SORTED nodes represent a "rollup", a list of
+  *      grouping sets that share a sort order.  Each AGG_SORTED node other than
+  *      the first one has an associated Sort node which describes the sort order
+  *      to be used; the first sorted node takes its input from the outer subtree,
+  *      which the planner has already arranged to provide ordered data.
+  *
+  *      Memory and ExprContext usage:
+  *
+  *      Because we're accumulating aggregate values across input rows, we need to
+  *      use more memory contexts than just simple input/output tuple contexts.
+  *      In fact, for a rollup, we need a separate context for each grouping set
+  *      so that we can reset the inner (finer-grained) aggregates on their group
+  *      boundaries while continuing to accumulate values for outer
+  *      (coarser-grained) groupings.  On top of this, we might be simultaneously
+  *      populating hashtables; however, we only need one context for all the
+  *      hashtables.
+  *
+  *      So we create an array, aggcontexts, with an ExprContext for each grouping
+  *      set in the largest rollup that we're going to process, and use the
+  *      per-tuple memory context of those ExprContexts to store the aggregate
+  *      transition values.  hashcontext is the single context created to support
+  *      all hash tables.
+  *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
index 439e36ee3adda81e271f66bbba79c6d7468f6d19,cf555fe78d91b38f39f8f38594074bf55c4b7d00..0ee82e3add5c0e2d7afc9d5825cd1e2ee030ed05
@@@ -3,8 -3,7 +3,8 @@@
   * nodeModifyTable.c
   *      routines to handle ModifyTable nodes.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@@ -258,7 -275,69 +276,68 @@@ ExecInsert(ModifyTableState *mtstate
         * get information on the (current) result relation
         */
        resultRelInfo = estate->es_result_relation_info;
+       /* Determine the partition to heap_insert the tuple into */
+       if (mtstate->mt_partition_dispatch_info)
+       {
+               int                     leaf_part_index;
+               TupleConversionMap *map;
+               /*
+                * Away we go ... If we end up not finding a partition after all,
+                * ExecFindPartition() does not return and errors out instead.
+                * Otherwise, the returned value is to be used as an index into arrays
+                * mt_partitions[] and mt_partition_tupconv_maps[] that will get us
+                * the ResultRelInfo and TupleConversionMap for the partition,
+                * respectively.
+                */
+               leaf_part_index = ExecFindPartition(resultRelInfo,
+                                                                                mtstate->mt_partition_dispatch_info,
+                                                                                       slot,
+                                                                                       estate);
+               Assert(leaf_part_index >= 0 &&
+                          leaf_part_index < mtstate->mt_num_partitions);
+               /*
+                * Save the old ResultRelInfo and switch to the one corresponding to
+                * the selected partition.
+                */
+               saved_resultRelInfo = resultRelInfo;
+               resultRelInfo = mtstate->mt_partitions + leaf_part_index;
+               /* We do not yet have a way to insert into a foreign partition */
+               if (resultRelInfo->ri_FdwRoutine)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("cannot route inserted tuples to a foreign table")));
+               /* For ExecInsertIndexTuples() to work on the partition's indexes */
+               estate->es_result_relation_info = resultRelInfo;
+               /*
+                * We might need to convert from the parent rowtype to the partition
+                * rowtype.
+                */
+               map = mtstate->mt_partition_tupconv_maps[leaf_part_index];
+               if (map)
+               {
+                       Relation        partrel = resultRelInfo->ri_RelationDesc;
+                       tuple = do_convert_tuple(tuple, map);
+                       /*
+                        * We must use the partition's tuple descriptor from this point
+                        * on, until we're finished dealing with the partition. Use the
+                        * dedicated slot for that.
+                        */
+                       slot = mtstate->mt_partition_tuple_slot;
+                       Assert(slot != NULL);
+                       ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
+                       ExecStoreTuple(tuple, slot, InvalidBuffer, true);
+               }
+       }
        resultRelationDesc = resultRelInfo->ri_RelationDesc;
 -
        /*
         * If the result relation has OIDs, force the tuple's OID to zero so that
         * heap_insert will assign a fresh OID.  Usually the OID already will be
Simple merge
index fb7461c96c0c5a1ad0a899856be837772571f255,e8fa4c8547ce36d2a4f602ff7fa18581362f8ad5..85e5acf75a14aa1090f8595911033130a54a177b
@@@ -288,9 -274,7 +277,8 @@@ ExecScanSubPlan(SubPlanState *node
  
                prm->value = ExecEvalExprSwitchContext((ExprState *) lfirst(pvar),
                                                                                           econtext,
-                                                                                          &(prm->isnull),
-                                                                                          NULL);
+                                                                                          &(prm->isnull));
 +              prm->done = true;
                planstate->chgParam = bms_add_member(planstate->chgParam, paramid);
        }
  
@@@ -712,14 -697,9 +701,14 @@@ ExecInitSubPlan(SubPlan *subplan, PlanS
        /* ... and to its parent's state */
        sstate->parent = parent;
  
 +#ifdef XCP
 +      /* subplan is referenced on local node, finish initialization */
 +      ExecFinishInitProcNode(sstate->planstate);
 +#endif
 +
        /* Initialize subexpressions */
        sstate->testexpr = ExecInitExpr((Expr *) subplan->testexpr, parent);
-       sstate->args = (List *) ExecInitExpr((Expr *) subplan->args, parent);
+       sstate->args = ExecInitExprList(subplan->args, parent);
  
        /*
         * initialize my state
@@@ -996,9 -952,7 +961,8 @@@ ExecSetParamPlan(SubPlanState *node, Ex
  
                prm->value = ExecEvalExprSwitchContext((ExprState *) lfirst(pvar),
                                                                                           econtext,
-                                                                                          &(prm->isnull),
-                                                                                          NULL);
+                                                                                          &(prm->isnull));
 +              prm->done = true;
                planstate->chgParam = bms_add_member(planstate->chgParam, paramid);
        }
  
index 0d512543d973df79a3893c8e6741d3690b1485eb,628bc9f00b70197c8c2f522970e6000a7bf512ab..124db68b1f5e76e4f64886eab91ae6afae88eb9d
@@@ -23,8 -23,7 +23,8 @@@
   * aggregate function over all rows in the current row's window frame.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
index 0a3c65e6f7955093c3267c97762932b0032fc3d0,97c39258741f65cea88891f7b239070bcde6b854..ac0870c22d62f14a5456beaf69cd8b5cac25dfb2
@@@ -322,101 -299,6 +304,54 @@@ AtEOSubXact_SPI(bool isCommit, SubTrans
  }
  
  
- /* Pushes SPI stack to allow recursive SPI calls */
- void
- SPI_push(void)
- {
-       _SPI_curid++;
- }
- /* Pops SPI stack to allow recursive SPI calls */
- void
- SPI_pop(void)
- {
-       _SPI_curid--;
- }
- /* Conditional push: push only if we're inside a SPI procedure */
- bool
- SPI_push_conditional(void)
- {
-       bool            pushed = (_SPI_curid != _SPI_connected);
-       if (pushed)
-       {
-               _SPI_curid++;
-               /* We should now be in a state where SPI_connect would succeed */
-               Assert(_SPI_curid == _SPI_connected);
-       }
-       return pushed;
- }
- /* Conditional pop: pop only if SPI_push_conditional pushed */
- void
- SPI_pop_conditional(bool pushed)
- {
-       /* We should be in a state where SPI_connect would succeed */
-       Assert(_SPI_curid == _SPI_connected);
-       if (pushed)
-               _SPI_curid--;
- }
- /* Restore state of SPI stack after aborting a subtransaction */
- void
- SPI_restore_connection(void)
- {
-       Assert(_SPI_connected >= 0);
-       _SPI_curid = _SPI_connected - 1;
- }
 +#ifdef PGXC
 +/* SPI_execute_direct:
 + * Runs the 'remote_sql' query string on the node 'nodename'
 + * Create the ExecDirectStmt parse tree node using remote_sql, and then prepare
 + * and execute it using SPI interface.
 + * This function is essentially used for making internal exec-direct operations;
 + * and this should not require super-user privileges. We cannot run EXEC-DIRECT
 + * query because it is meant only for superusers. So this function needs to
 + * bypass the parse stage. This is achieved here by calling
 + * _SPI_pgxc_prepare_plan which accepts a parse tree.
 + */
 +int
 +SPI_execute_direct(const char *remote_sql, char *nodename)
 +{
 +      _SPI_plan       plan;
 +      int                     res;
 +      ExecDirectStmt *stmt = makeNode(ExecDirectStmt);
 +      StringInfoData execdirect;
 +
 +      initStringInfo(&execdirect);
 +
 +      /* This string is never used. It is just passed to fill up spierrcontext.arg */
 +      appendStringInfo(&execdirect, "EXECUTE DIRECT ON (%s) '%s'",
 +                                     nodename, remote_sql);
 +
 +      stmt->node_names = list_make1(makeString(nodename));
 +      stmt->query = strdup(remote_sql);
 +
 +      res = _SPI_begin_call(true);
 +      if (res < 0)
 +              return res;
 +
 +      memset(&plan, 0, sizeof(_SPI_plan));
 +      plan.magic = _SPI_PLAN_MAGIC;
 +      plan.cursor_options = 0;
 +
 +      /* Now pass the ExecDirectStmt parsetree node */
 +      _SPI_pgxc_prepare_plan(execdirect.data, list_make1(stmt),
 +                      list_make1(execdirect.data), &plan);
 +
 +      res = _SPI_execute_plan(&plan, NULL,
 +                                                      InvalidSnapshot, InvalidSnapshot, false, true, 0);
 +
 +      _SPI_end_call(true);
 +      return res;
 +}
 +#endif
 +
  /* Parse, plan, and execute a query string */
  int
  SPI_execute(const char *src, bool read_only, long tcount)
@@@ -1937,10 -1768,9 +1849,10 @@@ _SPI_pgxc_prepare_plan(const char *src
         */
        plancache_list = NIL;
  
 -      foreach(list_item, raw_parsetree_list)
 +      forboth(list_item, raw_parsetree_list, list_item2, querysource_list)
        {
-               Node       *parsetree = (Node *) lfirst(list_item);
+               RawStmt    *parsetree = lfirst_node(RawStmt, list_item);
 +              char       *querysource = (char *) lfirst (list_item2);
                List       *stmt_list;
                CachedPlanSource *plansource;
  
                 * needs to see the unmodified raw parse tree.
                 */
                plansource = CreateCachedPlan(parsetree,
 -                                                                        src,
 +                                                                        querysource,
 +#ifdef PGXC
 +                                                                        NULL,
 +#endif
-                                                                         CreateCommandTag(parsetree));
+                                                                         CreateCommandTag(parsetree->stmt));
  
                /*
                 * Parameter datatypes are driven by parserSetup hook if provided,
                {
                        Assert(plan->nargs == 0);
                        stmt_list = pg_analyze_and_rewrite_params(parsetree,
 -                                                                                                        src,
 +                                                                                                        querysource,
                                                                                                          plan->parserSetup,
-                                                                                                         plan->parserSetupArg);
+                                                                                                         plan->parserSetupArg,
+                                                                                                         _SPI_current->queryEnv);
                }
                else
                {
                        stmt_list = pg_analyze_and_rewrite(parsetree,
 -                                                                                         src,
 +                                                                                         querysource,
                                                                                           plan->argtypes,
-                                                                                          plan->nargs);
+                                                                                          plan->nargs,
+                                                                                          _SPI_current->queryEnv);
                }
  
                /* Finish filling in the CachedPlanSource */
@@@ -2044,15 -1872,14 +1958,16 @@@ _SPI_prepare_oneshot_plan(const char *s
         */
        plancache_list = NIL;
  
 -      foreach(list_item, raw_parsetree_list)
 +      forboth(list_item, raw_parsetree_list, list_item2, querysource_list)
        {
-               Node       *parsetree = (Node *) lfirst(list_item);
+               RawStmt    *parsetree = lfirst_node(RawStmt, list_item);
 +              char       *querysource = (char *) lfirst (list_item2);
                CachedPlanSource *plansource;
  
++
                plansource = CreateOneShotCachedPlan(parsetree,
 -                                                                                       src,
 +                                                                                       querysource,
-                                                                                        CreateCommandTag(parsetree));
+                                                                                 CreateCommandTag(parsetree->stmt));
  
                plancache_list = lappend(plancache_list, plansource);
        }
@@@ -2290,10 -2112,8 +2200,11 @@@ _SPI_execute_plan(SPIPlanPtr plan, Para
                                                           plansource->query_string,
                                                           PROCESS_UTILITY_QUERY,
                                                           paramLI,
+                                                          _SPI_current->queryEnv,
                                                           dest,
 +#ifdef PGXC
 +                                                         false,
 +#endif /* PGXC */
                                                           completionTag);
  
                                /* Update "processed" if stmt returned tuples */
index 75fc3b4845b464d15fd9ac746fb9ae52d599d2af,2cb60393852b93cda0a614bdb4dc69dc0c1aa47f..4773b18df966f68ef0a0f9b716ea5af62eda592e
@@@ -475,15 -403,8 +473,14 @@@ be_lowrite(PG_FUNCTION_ARGS
        int                     bytestowrite;
        int                     totalwritten;
  
-       bytestowrite = VARSIZE(wbuf) - VARHDRSZ;
-       totalwritten = lo_write(fd, VARDATA(wbuf), bytestowrite);
 +#ifdef PGXC
 +      ereport(ERROR,
 +                      (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 +                       errmsg("Postgres-XL does not yet support large objects"),
 +                       errdetail("The feature is not currently supported")));
 +#endif
+       bytestowrite = VARSIZE_ANY_EXHDR(wbuf);
+       totalwritten = lo_write(fd, VARDATA_ANY(wbuf), bytestowrite);
        PG_RETURN_INT32(totalwritten);
  }
  
Simple merge
Simple merge
index 0cdd6559d0f8f695eebea57908ab74395e55b6d8,36bf1dc92bbe3c90de58741c1a1d9c6c69d3c884..fc21909ea39dc34a688611ac9467e4d55529af8e
   * be handled easily in a simple depth-first traversal.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * IDENTIFICATION
   *      src/backend/nodes/copyfuncs.c
@@@ -106,16 -98,9 +107,19 @@@ _copyPlannedStmt(const PlannedStmt *fro
        COPY_NODE_FIELD(relationOids);
        COPY_NODE_FIELD(invalItems);
        COPY_SCALAR_FIELD(nParamExec);
 +#ifdef XCP
 +      COPY_SCALAR_FIELD(nParamRemote);
 +      COPY_POINTER_FIELD(remoteparams,
 +                                         newnode->nParamRemote * sizeof(RemoteParam));
 +      COPY_STRING_FIELD(pname);
 +      COPY_SCALAR_FIELD(distributionType);
 +      COPY_SCALAR_FIELD(distributionKey);
 +      COPY_NODE_FIELD(distributionNodes);
 +      COPY_NODE_FIELD(distributionRestrict);
 +#endif
+       COPY_NODE_FIELD(utilityStmt);
+       COPY_LOCATION_FIELD(stmt_location);
+       COPY_LOCATION_FIELD(stmt_len);
  
        return newnode;
  }
@@@ -4561,27 -4654,9 +4948,27 @@@ _copyForeignKeyCacheInfo(const ForeignK
        return newnode;
  }
  
 +/* ****************************************************************
 + *                                    poolutils.h copy functions
 + * ****************************************************************
 + */
 +static CleanConnStmt *
 +_copyCleanConnStmt(const CleanConnStmt *from)
 +{
 +      CleanConnStmt *newnode = makeNode(CleanConnStmt);
 +
 +      COPY_NODE_FIELD(nodes);
 +      COPY_STRING_FIELD(dbname);
 +      COPY_STRING_FIELD(username);
 +      COPY_SCALAR_FIELD(is_coord);
 +      COPY_SCALAR_FIELD(is_force);
 +
 +      return newnode;
 +}
 +#endif
  
  /*
-  * copyObject
+  * copyObjectImpl -- implementation of copyObject(); see nodes/nodes.h
   *
   * Create a copy of a Node tree or list.  This is a "deep" copy: all
   * substructure is copied too, recursively.
index e6f44f1cf833ec93074ccf11ec30392d11e31899,5bcf0317dc8fb01000634d2dbf9e56f669df3b9d..c644aba4c106efc3ec5add74730a9967c0024708
   * "x" to be considered equal() to another reference to "x" in the query.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * IDENTIFICATION
   *      src/backend/nodes/equalfuncs.c
Simple merge
index d05332c10abd3e250069da35e45d924cc69de7dc,41f3408cfcf9bf2de4baaf51558fe70bd9fc89e4..eb3e1ce1c141d5c10dc533b075cabab2647c3198
@@@ -3083,9 -3154,7 +3159,10 @@@ range_table_mutator(List *rtable
                                /* we don't bother to copy eref, aliases, etc; OK? */
                                break;
                        case RTE_CTE:
 +#ifdef PGXC
 +                      case RTE_REMOTE_DUMMY:
 +#endif /* PGXC */
+                       case RTE_NAMEDTUPLESTORE:
                                /* nothing to do */
                                break;
                        case RTE_SUBQUERY:
index 522387a3f8cf673831db10aefd26785b77b28e8c,9189c8d43f8716eac6398b79f1976cf34e3195c5..d5165ddd4eefd5a5483feebd652d2a1c87176b62
@@@ -3,8 -3,7 +3,8 @@@
   * outfuncs.c
   *      Output functions for Postgres tree nodes.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@@ -134,125 -95,8 +134,125 @@@ set_portable_output(bool value
  /* Write a bitmapset field */
  #define WRITE_BITMAPSET_FIELD(fldname) \
        (appendStringInfo(str, " :" CppAsString(fldname) " "), \
-        _outBitmapset(str, node->fldname))
+        outBitmapset(str, node->fldname))
  
-       (_outToken(str, OidIsValid((relid)) ? NSP_NAME(get_rel_namespace((relid))) : NULL), \
 +#ifdef XCP
 +#define NSP_NAME(oid) \
 +      isTempNamespace(oid) ? "pg_temp" : get_namespace_name(oid)
 +/*
 + * Macros to encode OIDs to send to other nodes. Objects on other nodes may have
 + * different OIDs, so send instead an unique identifier allowing to lookup
 + * the OID on target node. The identifier depends on object type.
 + */
 +
 +#define WRITE_RELID_INTERNAL(relid) \
-        _outToken(str, OidIsValid((relid)) ? get_rel_name((relid)) : NULL))
++      (outToken(str, OidIsValid((relid)) ? NSP_NAME(get_rel_namespace((relid))) : NULL), \
 +       appendStringInfoChar(str, ' '), \
-        _outToken(str, OidIsValid(node->fldname) ? NSP_NAME(get_typ_namespace(node->fldname)) : NULL), \
++       outToken(str, OidIsValid((relid)) ? get_rel_name((relid)) : NULL))
 +
 +/* write an OID which is a relation OID */
 +#define WRITE_RELID_FIELD(fldname) \
 +      (appendStringInfo(str, " :" CppAsString(fldname) " "), \
 +       WRITE_RELID_INTERNAL(node->fldname))
 +
 +#define WRITE_RELID_LIST_FIELD(fldname) \
 +      do { \
 +              ListCell *lc; \
 +              char *sep = ""; \
 +              appendStringInfo(str, " :" CppAsString(fldname) " "); \
 +              if (node->fldname == NIL || list_length(node->fldname) == 0) \
 +                      appendStringInfoString(str, "<>"); \
 +              else \
 +              { \
 +                      appendStringInfoChar(str, '('); \
 +                      foreach (lc, node->fldname) \
 +                      { \
 +                              Oid relid = lfirst_oid(lc); \
 +                              appendStringInfoString(str, sep); \
 +                              WRITE_RELID_INTERNAL(relid); \
 +                              sep = ","; \
 +                      } \
 +                      appendStringInfoChar(str, ')'); \
 +              } \
 +      }  while (0)
 +
 +/* write an OID which is a data type OID */
 +#define WRITE_TYPID_FIELD(fldname) \
 +      (appendStringInfo(str, " :" CppAsString(fldname) " "), \
-        _outToken(str, OidIsValid(node->fldname) ? get_typ_name(node->fldname) : NULL))
++       outToken(str, OidIsValid(node->fldname) ? NSP_NAME(get_typ_namespace(node->fldname)) : NULL), \
 +       appendStringInfoChar(str, ' '), \
-                       _outToken(str, NSP_NAME(get_func_namespace(node->fldname))); \
++       outToken(str, OidIsValid(node->fldname) ? get_typ_name(node->fldname) : NULL))
 +
 +/* write an OID which is a function OID */
 +#define WRITE_FUNCID_FIELD(fldname) \
 +      do { \
 +              appendStringInfo(str, " :" CppAsString(fldname) " "); \
 +              if (OidIsValid(node->fldname)) \
 +              { \
 +                      Oid *argtypes; \
 +                      int i, nargs; \
-                       _outToken(str, get_func_name(node->fldname)); \
++                      outToken(str, NSP_NAME(get_func_namespace(node->fldname))); \
 +                      appendStringInfoChar(str, ' '); \
-                               _outToken(str, NSP_NAME(get_typ_namespace(argtypes[i]))); \
++                      outToken(str, get_func_name(node->fldname)); \
 +                      appendStringInfoChar(str, ' '); \
 +                      get_func_signature(node->fldname, &argtypes, &nargs); \
 +                      appendStringInfo(str, "%d", nargs); \
 +                      for (i = 0; i < nargs; i++) \
 +                      { \
 +                              appendStringInfoChar(str, ' '); \
-                               _outToken(str, get_typ_name(argtypes[i])); \
++                              outToken(str, NSP_NAME(get_typ_namespace(argtypes[i]))); \
 +                              appendStringInfoChar(str, ' '); \
-                       _outToken(str, NSP_NAME(get_opnamespace(node->fldname))); \
++                              outToken(str, get_typ_name(argtypes[i])); \
 +                      } \
 +              } \
 +              else \
 +                      appendStringInfo(str, "<> <> 0"); \
 +      } while (0)
 +
 +/* write an OID which is an operator OID */
 +#define WRITE_OPERID_FIELD(fldname) \
 +      do { \
 +              appendStringInfo(str, " :" CppAsString(fldname) " "); \
 +              if (OidIsValid(node->fldname)) \
 +              { \
 +                      Oid oprleft, oprright; \
-                       _outToken(str, get_opname(node->fldname)); \
++                      outToken(str, NSP_NAME(get_opnamespace(node->fldname))); \
 +                      appendStringInfoChar(str, ' '); \
-                       _outToken(str, OidIsValid(oprleft) ? \
++                      outToken(str, get_opname(node->fldname)); \
 +                      appendStringInfoChar(str, ' '); \
 +                      op_input_types(node->fldname, &oprleft, &oprright); \
-                       _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); \
++                      outToken(str, OidIsValid(oprleft) ? \
 +                                      NSP_NAME(get_typ_namespace(oprleft)) : NULL); \
 +                      appendStringInfoChar(str, ' '); \
-                       _outToken(str, OidIsValid(oprright) ? \
++                      outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); \
 +                      appendStringInfoChar(str, ' '); \
-                       _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL); \
++                      outToken(str, OidIsValid(oprright) ? \
 +                                      NSP_NAME(get_typ_namespace(oprright)) : NULL); \
 +                      appendStringInfoChar(str, ' '); \
-                       _outToken(str, NSP_NAME(get_collation_namespace(node->fldname))); \
++                      outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL); \
 +                      appendStringInfoChar(str, ' '); \
 +              } \
 +              else \
 +                      appendStringInfo(str, "<> <> <> <> <> <>"); \
 +      } while (0)
 +
 +/* write an OID which is a collation OID */
 +#define WRITE_COLLID_FIELD(fldname) \
 +      do { \
 +              appendStringInfo(str, " :" CppAsString(fldname) " "); \
 +              if (OidIsValid(node->fldname)) \
 +              { \
-                       _outToken(str, get_collation_name(node->fldname)); \
++                      outToken(str, NSP_NAME(get_collation_namespace(node->fldname))); \
 +                      appendStringInfoChar(str, ' '); \
++                      outToken(str, get_collation_name(node->fldname)); \
 +                      appendStringInfo(str, " %d", get_collation_encoding(node->fldname)); \
 +              } \
 +              else \
 +                      appendStringInfo(str, "<> <> -1"); \
 +      } while (0)
 +
 +
 +#endif
  
  #define booltostr(x)  ((x) ? "true" : "false")
  
@@@ -402,48 -232,6 +388,48 @@@ outDatum(StringInfo str, Datum value, i
  }
  
  
-       _outToken(str, textvalue);
 +#ifdef XCP
 +/*
 + * Output value in text format
 + */
 +static void
 +_printDatum(StringInfo str, Datum value, Oid typid)
 +{
 +      Oid             typOutput;
 +      bool            typIsVarlena;
 +      FmgrInfo    finfo;
 +      Datum           tmpval;
 +      char       *textvalue;
 +      int                     saveDateStyle;
 +
 +      /* Get output function for the type */
 +      getTypeOutputInfo(typid, &typOutput, &typIsVarlena);
 +      fmgr_info(typOutput, &finfo);
 +
 +      /* Detoast value if needed */
 +      if (typIsVarlena)
 +              tmpval = PointerGetDatum(PG_DETOAST_DATUM(value));
 +      else
 +              tmpval = value;
 +
 +      /*
 +       * It was found that if configuration setting for date style is
 +       * "postgres,ymd" the output dates have format DD-MM-YYYY and they can not
 +       * be parsed correctly by receiving party. So force ISO format YYYY-MM-DD
 +       * in internal cluster communications, these values are always parsed
 +       * correctly.
 +       */
 +      saveDateStyle = DateStyle;
 +      DateStyle = USE_ISO_DATES;
 +
 +      textvalue = DatumGetCString(FunctionCall1(&finfo, tmpval));
++      outToken(str, textvalue);
 +
 +      DateStyle = saveDateStyle;
 +}
 +#endif
 +
 +
  /*
   *    Stuff from plannodes.h
   */
@@@ -601,52 -398,10 +605,52 @@@ _outMergeAppend(StringInfo str, const M
  
        appendStringInfoString(str, " :sortOperators");
        for (i = 0; i < node->numCols; i++)
-                       _outToken(str, NSP_NAME(get_opnamespace(oper)));
 +#ifdef XCP
 +              if (portable_output)
 +              {
 +                      Oid oper = node->sortOperators[i];
 +                      Oid oprleft, oprright;
 +                      /* Sort operator is always valid */
 +                      Assert(OidIsValid(oper));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, get_opname(oper));
++                      outToken(str, NSP_NAME(get_opnamespace(oper)));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprleft) ?
++                      outToken(str, get_opname(oper));
 +                      appendStringInfoChar(str, ' ');
 +                      op_input_types(oper, &oprleft, &oprright);
-                       _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++                      outToken(str, OidIsValid(oprleft) ?
 +                                      NSP_NAME(get_typ_namespace(oprleft)) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ?
++                      outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++                      outToken(str, OidIsValid(oprright) ?
 +                                      NSP_NAME(get_typ_namespace(oprright)) : NULL);
 +                      appendStringInfoChar(str, ' ');
++                      outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
 +              }
 +              else
 +#endif
                appendStringInfo(str, " %u", node->sortOperators[i]);
  
        appendStringInfoString(str, " :collations");
        for (i = 0; i < node->numCols; i++)
-                               _outToken(str, NSP_NAME(get_collation_namespace(coll)));
 +#ifdef XCP
 +              if (portable_output)
 +              {
 +                      Oid coll = node->collations[i];
 +                      if (OidIsValid(coll))
 +                      {
 +                              appendStringInfoChar(str, ' ');
-                               _outToken(str, get_collation_name(coll));
++                              outToken(str, NSP_NAME(get_collation_namespace(coll)));
 +                              appendStringInfoChar(str, ' ');
++                              outToken(str, get_collation_name(coll));
 +                              appendStringInfo(str, " %d", get_collation_encoding(coll));
 +                      }
 +                      else
 +                              appendStringInfo(str, " <> <> -1");
 +              }
 +              else
 +#endif
                appendStringInfo(str, " %u", node->collations[i]);
  
        appendStringInfoString(str, " :nullsFirst");
@@@ -672,32 -427,6 +676,32 @@@ _outRecursiveUnion(StringInfo str, cons
  
        appendStringInfoString(str, " :dupOperators");
        for (i = 0; i < node->numCols; i++)
-                       _outToken(str, NSP_NAME(get_opnamespace(oper)));
 +#ifdef XCP
 +              if (portable_output)
 +              {
 +                      Oid oper = node->dupOperators[i];
 +                      Oid oprleft, oprright;
 +                      /* Unique operator is always valid */
 +                      Assert(OidIsValid(oper));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, get_opname(oper));
++                      outToken(str, NSP_NAME(get_opnamespace(oper)));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprleft) ?
++                      outToken(str, get_opname(oper));
 +                      appendStringInfoChar(str, ' ');
 +                      op_input_types(oper, &oprleft, &oprright);
-                       _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++                      outToken(str, OidIsValid(oprleft) ?
 +                                      NSP_NAME(get_typ_namespace(oprleft)) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ?
++                      outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++                      outToken(str, OidIsValid(oprright) ?
 +                                      NSP_NAME(get_typ_namespace(oprright)) : NULL);
 +                      appendStringInfoChar(str, ' ');
++                      outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
 +                      appendStringInfoChar(str, ' ');
 +              }
 +              else
 +#endif
                appendStringInfo(str, " %u", node->dupOperators[i]);
  
        WRITE_LONG_FIELD(numGroups);
@@@ -851,12 -557,8 +885,13 @@@ _outBitmapIndexScan(StringInfo str, con
  
        _outScanInfo(str, (const Scan *) node);
  
 +#ifdef XCP
 +      if (portable_output)
 +              WRITE_RELID_FIELD(indexid);
 +      else
 +#endif
        WRITE_OID_FIELD(indexid);
+       WRITE_BOOL_FIELD(isshared);
        WRITE_NODE_FIELD(indexqual);
        WRITE_NODE_FIELD(indexqualorig);
  }
@@@ -1006,23 -729,6 +1062,23 @@@ _outMergeJoin(StringInfo str, const Mer
  
        appendStringInfoString(str, " :mergeCollations");
        for (i = 0; i < numCols; i++)
-                               _outToken(str, NSP_NAME(get_collation_namespace(coll)));
 +#ifdef XCP
 +              if (portable_output)
 +              {
 +                      Oid coll = node->mergeCollations[i];
 +                      if (OidIsValid(coll))
 +                      {
 +                              appendStringInfoChar(str, ' ');
-                               _outToken(str, get_collation_name(coll));
++                              outToken(str, NSP_NAME(get_collation_namespace(coll)));
 +                              appendStringInfoChar(str, ' ');
++                              outToken(str, get_collation_name(coll));
 +                              appendStringInfo(str, " %d", get_collation_encoding(coll));
 +                      }
 +                      else
 +                              appendStringInfo(str, " <> <> -1");
 +              }
 +              else
 +#endif
                appendStringInfo(str, " %u", node->mergeCollations[i]);
  
        appendStringInfoString(str, " :mergeStrategies");
@@@ -1063,32 -769,6 +1119,32 @@@ _outAgg(StringInfo str, const Agg *node
  
        appendStringInfoString(str, " :grpOperators");
        for (i = 0; i < node->numCols; i++)
-                       _outToken(str, NSP_NAME(get_opnamespace(oper)));
 +#ifdef XCP
 +              if (portable_output)
 +              {
 +                      Oid oper = node->grpOperators[i];
 +                      Oid oprleft, oprright;
 +                      /* Group operator is always valid */
 +                      Assert(OidIsValid(oper));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, get_opname(oper));
++                      outToken(str, NSP_NAME(get_opnamespace(oper)));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprleft) ?
++                      outToken(str, get_opname(oper));
 +                      appendStringInfoChar(str, ' ');
 +                      op_input_types(oper, &oprleft, &oprright);
-                       _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++                      outToken(str, OidIsValid(oprleft) ?
 +                                      NSP_NAME(get_typ_namespace(oprleft)) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ?
++                      outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++                      outToken(str, OidIsValid(oprright) ?
 +                                      NSP_NAME(get_typ_namespace(oprright)) : NULL);
 +                      appendStringInfoChar(str, ' ');
++                      outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
 +                      appendStringInfoChar(str, ' ');
 +              }
 +              else
 +#endif
                appendStringInfo(str, " %u", node->grpOperators[i]);
  
        WRITE_LONG_FIELD(numGroups);
@@@ -1112,34 -793,8 +1169,34 @@@ _outWindowAgg(StringInfo str, const Win
        for (i = 0; i < node->partNumCols; i++)
                appendStringInfo(str, " %d", node->partColIdx[i]);
  
 -      appendStringInfoString(str, " :partOperations");
 +      appendStringInfoString(str, " :partOperators");
        for (i = 0; i < node->partNumCols; i++)
-                       _outToken(str, NSP_NAME(get_opnamespace(oper)));
 +#ifdef XCP
 +              if (portable_output)
 +              {
 +                      Oid oper = node->partOperators[i];
 +                      Oid oprleft, oprright;
 +                      /* The operator is always valid */
 +                      Assert(OidIsValid(oper));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, get_opname(oper));
++                      outToken(str, NSP_NAME(get_opnamespace(oper)));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprleft) ?
++                      outToken(str, get_opname(oper));
 +                      appendStringInfoChar(str, ' ');
 +                      op_input_types(oper, &oprleft, &oprright);
-                       _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++                      outToken(str, OidIsValid(oprleft) ?
 +                                      NSP_NAME(get_typ_namespace(oprleft)) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ?
++                      outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++                      outToken(str, OidIsValid(oprright) ?
 +                                      NSP_NAME(get_typ_namespace(oprright)) : NULL);
 +                      appendStringInfoChar(str, ' ');
++                      outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
 +                      appendStringInfoChar(str, ' ');
 +              }
 +              else
 +#endif
                appendStringInfo(str, " %u", node->partOperators[i]);
  
        WRITE_INT_FIELD(ordNumCols);
        for (i = 0; i < node->ordNumCols; i++)
                appendStringInfo(str, " %d", node->ordColIdx[i]);
  
 -      appendStringInfoString(str, " :ordOperations");
 +      appendStringInfoString(str, " :ordOperators");
        for (i = 0; i < node->ordNumCols; i++)
-                       _outToken(str, NSP_NAME(get_opnamespace(oper)));
 +#ifdef XCP
 +              if (portable_output)
 +              {
 +                      Oid oper = node->ordOperators[i];
 +                      Oid oprleft, oprright;
 +                      /* Group operator is always valid */
 +                      Assert(OidIsValid(oper));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, get_opname(oper));
++                      outToken(str, NSP_NAME(get_opnamespace(oper)));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprleft) ?
++                      outToken(str, get_opname(oper));
 +                      appendStringInfoChar(str, ' ');
 +                      op_input_types(oper, &oprleft, &oprright);
-                       _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++                      outToken(str, OidIsValid(oprleft) ?
 +                                      NSP_NAME(get_typ_namespace(oprleft)) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ?
++                      outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++                      outToken(str, OidIsValid(oprright) ?
 +                                      NSP_NAME(get_typ_namespace(oprright)) : NULL);
 +                      appendStringInfoChar(str, ' ');
++                      outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
 +                      appendStringInfoChar(str, ' ');
 +              }
 +              else
 +#endif
                appendStringInfo(str, " %u", node->ordOperators[i]);
  
        WRITE_INT_FIELD(frameOptions);
@@@ -1200,32 -829,6 +1257,32 @@@ _outGroup(StringInfo str, const Group *
  
        appendStringInfoString(str, " :grpOperators");
        for (i = 0; i < node->numCols; i++)
-                       _outToken(str, NSP_NAME(get_opnamespace(oper)));
 +#ifdef XCP
 +              if (portable_output)
 +              {
 +                      Oid oper = node->grpOperators[i];
 +                      Oid oprleft, oprright;
 +                      /* Group operator is always valid */
 +                      Assert(OidIsValid(oper));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, get_opname(oper));
++                      outToken(str, NSP_NAME(get_opnamespace(oper)));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprleft) ?
++                      outToken(str, get_opname(oper));
 +                      appendStringInfoChar(str, ' ');
 +                      op_input_types(oper, &oprleft, &oprright);
-                       _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++                      outToken(str, OidIsValid(oprleft) ?
 +                                      NSP_NAME(get_typ_namespace(oprleft)) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ?
++                      outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++                      outToken(str, OidIsValid(oprright) ?
 +                                      NSP_NAME(get_typ_namespace(oprright)) : NULL);
 +                      appendStringInfoChar(str, ' ');
++                      outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
 +                      appendStringInfoChar(str, ' ');
 +              }
 +              else
 +#endif
                appendStringInfo(str, " %u", node->grpOperators[i]);
  }
  
@@@ -1254,52 -857,10 +1311,52 @@@ _outSort(StringInfo str, const Sort *no
  
        appendStringInfoString(str, " :sortOperators");
        for (i = 0; i < node->numCols; i++)
-                       _outToken(str, NSP_NAME(get_opnamespace(oper)));
 +#ifdef XCP
 +              if (portable_output)
 +              {
 +                      Oid oper = node->sortOperators[i];
 +                      Oid oprleft, oprright;
 +                      /* Sort operator is always valid */
 +                      Assert(OidIsValid(oper));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, get_opname(oper));
++                      outToken(str, NSP_NAME(get_opnamespace(oper)));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprleft) ?
++                      outToken(str, get_opname(oper));
 +                      appendStringInfoChar(str, ' ');
 +                      op_input_types(oper, &oprleft, &oprright);
-                       _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++                      outToken(str, OidIsValid(oprleft) ?
 +                                      NSP_NAME(get_typ_namespace(oprleft)) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ?
++                      outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++                      outToken(str, OidIsValid(oprright) ?
 +                                      NSP_NAME(get_typ_namespace(oprright)) : NULL);
 +                      appendStringInfoChar(str, ' ');
++                      outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
 +              }
 +              else
 +#endif
                appendStringInfo(str, " %u", node->sortOperators[i]);
  
        appendStringInfoString(str, " :collations");
        for (i = 0; i < node->numCols; i++)
-                               _outToken(str, NSP_NAME(get_collation_namespace(coll)));
 +#ifdef XCP
 +              if (portable_output)
 +              {
 +                      Oid coll = node->collations[i];
 +                      if (OidIsValid(coll))
 +                      {
 +                              appendStringInfoChar(str, ' ');
-                               _outToken(str, get_collation_name(coll));
++                              outToken(str, NSP_NAME(get_collation_namespace(coll)));
 +                              appendStringInfoChar(str, ' ');
++                              outToken(str, get_collation_name(coll));
 +                              appendStringInfo(str, " %d", get_collation_encoding(coll));
 +                      }
 +                      else
 +                              appendStringInfo(str, " <> <> -1");
 +              }
 +              else
 +#endif
                appendStringInfo(str, " %u", node->collations[i]);
  
        appendStringInfoString(str, " :nullsFirst");
@@@ -1324,32 -885,6 +1381,32 @@@ _outUnique(StringInfo str, const Uniqu
  
        appendStringInfoString(str, " :uniqOperators");
        for (i = 0; i < node->numCols; i++)
-                       _outToken(str, NSP_NAME(get_opnamespace(oper)));
 +#ifdef XCP
 +              if (portable_output)
 +              {
 +                      Oid oper = node->uniqOperators[i];
 +                      Oid oprleft, oprright;
 +                      /* Unique operator is always valid */
 +                      Assert(OidIsValid(oper));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, get_opname(oper));
++                      outToken(str, NSP_NAME(get_opnamespace(oper)));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprleft) ?
++                      outToken(str, get_opname(oper));
 +                      appendStringInfoChar(str, ' ');
 +                      op_input_types(oper, &oprleft, &oprright);
-                       _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++                      outToken(str, OidIsValid(oprleft) ?
 +                                      NSP_NAME(get_typ_namespace(oprleft)) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ?
++                      outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++                      outToken(str, OidIsValid(oprright) ?
 +                                      NSP_NAME(get_typ_namespace(oprright)) : NULL);
 +                      appendStringInfoChar(str, ' ');
++                      outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
 +                      appendStringInfoChar(str, ' ');
 +              }
 +              else
 +#endif
                appendStringInfo(str, " %u", node->uniqOperators[i]);
  }
  
@@@ -1396,31 -919,6 +1446,31 @@@ _outSetOp(StringInfo str, const SetOp *
  
        appendStringInfoString(str, " :dupOperators");
        for (i = 0; i < node->numCols; i++)
-                       _outToken(str, NSP_NAME(get_opnamespace(oper)));
 +#ifdef XCP
 +              if (portable_output)
 +              {
 +                      Oid oper = node->dupOperators[i];
 +                      Oid oprleft, oprright;
 +                      /* Unique operator is always valid */
 +                      Assert(OidIsValid(oper));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, get_opname(oper));
++                      outToken(str, NSP_NAME(get_opnamespace(oper)));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprleft) ?
++                      outToken(str, get_opname(oper));
 +                      appendStringInfoChar(str, ' ');
 +                      op_input_types(oper, &oprleft, &oprright);
-                       _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++                      outToken(str, OidIsValid(oprleft) ?
 +                                      NSP_NAME(get_typ_namespace(oprleft)) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ?
++                      outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++                      outToken(str, OidIsValid(oprright) ?
 +                                      NSP_NAME(get_typ_namespace(oprright)) : NULL);
 +                      appendStringInfoChar(str, ' ');
++                      outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
 +              }
 +              else
 +#endif
                appendStringInfo(str, " %u", node->dupOperators[i]);
  
        WRITE_INT_FIELD(flagColIdx);
@@@ -1450,138 -948,6 +1500,138 @@@ _outLimit(StringInfo str, const Limit *
        WRITE_NODE_FIELD(limitCount);
  }
  
-                       _outToken(str, NSP_NAME(get_typ_namespace(ptype)));
 +#ifdef XCP
 +static void
 +_outRemoteSubplan(StringInfo str, const RemoteSubplan *node)
 +{
 +      WRITE_NODE_TYPE("REMOTESUBPLAN");
 +
 +      _outScanInfo(str, (Scan *) node);
 +
 +      WRITE_CHAR_FIELD(distributionType);
 +      WRITE_INT_FIELD(distributionKey);
 +      WRITE_NODE_FIELD(distributionNodes);
 +      WRITE_NODE_FIELD(distributionRestrict);
 +      WRITE_NODE_FIELD(nodeList);
 +      WRITE_BOOL_FIELD(execOnAll);
 +      WRITE_NODE_FIELD(sort);
 +      WRITE_STRING_FIELD(cursor);
 +      WRITE_INT_FIELD(unique);
 +}
 +
 +static void
 +_outRemoteStmt(StringInfo str, const RemoteStmt *node)
 +{
 +      int i;
 +
 +      WRITE_NODE_TYPE("REMOTESTMT");
 +
 +      WRITE_ENUM_FIELD(commandType, CmdType);
 +      WRITE_BOOL_FIELD(hasReturning);
 +      WRITE_NODE_FIELD(planTree);
 +      WRITE_NODE_FIELD(rtable);
 +      WRITE_NODE_FIELD(resultRelations);
 +      WRITE_NODE_FIELD(subplans);
 +      WRITE_INT_FIELD(nParamExec);
 +      WRITE_INT_FIELD(nParamRemote);
 +
 +      for (i = 0; i < node->nParamRemote; i++)
 +      {
 +              RemoteParam *rparam = &(node->remoteparams[i]);
 +              appendStringInfo(str, " :paramkind");
 +              appendStringInfo(str, " %d", (int) rparam->paramkind);
 +
 +              appendStringInfo(str, " :paramid");
 +              appendStringInfo(str, " %d", rparam->paramid);
 +
 +              appendStringInfo(str, " :paramused");
 +              appendStringInfo(str, " %d", rparam->paramused);
 +
 +              appendStringInfo(str, " :paramtype");
 +              if (portable_output)
 +              {
 +                      Oid ptype = rparam->paramtype;
 +                      Assert(OidIsValid(ptype));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, get_typ_name(ptype));
++                      outToken(str, NSP_NAME(get_typ_namespace(ptype)));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, NSP_NAME(get_opnamespace(oper)));
++                      outToken(str, get_typ_name(ptype));
 +              }
 +              else
 +                      appendStringInfo(str, " %u", rparam->paramtype);
 +      }
 +      WRITE_NODE_FIELD(rowMarks);
 +      WRITE_CHAR_FIELD(distributionType);
 +      WRITE_INT_FIELD(distributionKey);
 +      WRITE_NODE_FIELD(distributionNodes);
 +      WRITE_NODE_FIELD(distributionRestrict);
 +}
 +
 +static void
 +_outSimpleSort(StringInfo str, const SimpleSort *node)
 +{
 +      int                     i;
 +
 +      WRITE_NODE_TYPE("SIMPLESORT");
 +
 +      WRITE_INT_FIELD(numCols);
 +
 +      appendStringInfo(str, " :sortColIdx");
 +      for (i = 0; i < node->numCols; i++)
 +              appendStringInfo(str, " %d", node->sortColIdx[i]);
 +
 +      appendStringInfo(str, " :sortOperators");
 +      for (i = 0; i < node->numCols; i++)
 +              if (portable_output)
 +              {
 +                      Oid oper = node->sortOperators[i];
 +                      Oid oprleft, oprright;
 +                      /* Sort operator is always valid */
 +                      Assert(OidIsValid(oper));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, get_opname(oper));
++                      outToken(str, NSP_NAME(get_opnamespace(oper)));
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprleft) ?
++                      outToken(str, get_opname(oper));
 +                      appendStringInfoChar(str, ' ');
 +                      op_input_types(oper, &oprleft, &oprright);
-                       _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++                      outToken(str, OidIsValid(oprleft) ?
 +                                      NSP_NAME(get_typ_namespace(oprleft)) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ?
++                      outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                       _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++                      outToken(str, OidIsValid(oprright) ?
 +                                      NSP_NAME(get_typ_namespace(oprright)) : NULL);
 +                      appendStringInfoChar(str, ' ');
-                               _outToken(str, NSP_NAME(get_collation_namespace(coll)));
++                      outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
 +              }
 +              else
 +                      appendStringInfo(str, " %u", node->sortOperators[i]);
 +
 +      appendStringInfo(str, " :sortCollations");
 +      for (i = 0; i < node->numCols; i++)
 +              if (portable_output)
 +              {
 +                      Oid coll = node->sortCollations[i];
 +                      if (OidIsValid(coll))
 +                      {
 +                              appendStringInfoChar(str, ' ');
-                               _outToken(str, get_collation_name(coll));
++                              outToken(str, NSP_NAME(get_collation_namespace(coll)));
 +                              appendStringInfoChar(str, ' ');
++                              outToken(str, get_collation_name(coll));
 +                              appendStringInfo(str, " %d", get_collation_encoding(coll));
 +                      }
 +                      else
 +                              appendStringInfo(str, " <> <> -1");
 +              }
 +              else
 +                      appendStringInfo(str, " %u", node->sortCollations[i]);
 +
 +      appendStringInfo(str, " :nullsFirst");
 +      for (i = 0; i < node->numCols; i++)
 +              appendStringInfo(str, " %s", booltostr(node->nullsFirst[i]));
 +}
 +#endif
 +
  static void
  _outNestLoopParam(StringInfo str, const NestLoopParam *node)
  {
@@@ -3924,15 -3041,17 +4126,22 @@@ _outRangeTblEntry(StringInfo str, cons
                        WRITE_STRING_FIELD(ctename);
                        WRITE_UINT_FIELD(ctelevelsup);
                        WRITE_BOOL_FIELD(self_reference);
-                       WRITE_NODE_FIELD(ctecoltypes);
-                       WRITE_NODE_FIELD(ctecoltypmods);
-                       WRITE_NODE_FIELD(ctecolcollations);
+                       WRITE_NODE_FIELD(coltypes);
+                       WRITE_NODE_FIELD(coltypmods);
+                       WRITE_NODE_FIELD(colcollations);
+                       break;
+               case RTE_NAMEDTUPLESTORE:
+                       WRITE_STRING_FIELD(enrname);
+                       WRITE_OID_FIELD(relid);
+                       WRITE_NODE_FIELD(coltypes);
+                       WRITE_NODE_FIELD(coltypmods);
+                       WRITE_NODE_FIELD(colcollations);
                        break;
 +#ifdef PGXC
 +              case RTE_REMOTE_DUMMY:
 +                      /* Everything relevant already copied */
 +                      break;
 +#endif /* PGXC */
                default:
                        elog(ERROR, "unrecognized RTE kind: %d", (int) node->rtekind);
                        break;
@@@ -4976,11 -4183,21 +5306,26 @@@ outNode(StringInfo str, const void *obj
                        case T_ForeignKeyCacheInfo:
                                _outForeignKeyCacheInfo(str, obj);
                                break;
 +#ifdef PGXC
 +                      case T_ExecNodes:
 +                              _outExecNodes(str, obj);
 +                              break;
 +#endif
+                       case T_TriggerTransition:
+                               _outTriggerTransition(str, obj);
+                               break;
+                       case T_PartitionElem:
+                               _outPartitionElem(str, obj);
+                               break;
+                       case T_PartitionSpec:
+                               _outPartitionSpec(str, obj);
+                               break;
+                       case T_PartitionBoundSpec:
+                               _outPartitionBoundSpec(str, obj);
+                               break;
+                       case T_PartitionRangeDatum:
+                               _outPartitionRangeDatum(str, obj);
+                               break;
  
                        default:
  
Simple merge
index 933825cd74cc6980bc2b4f046e4469da8a9012fc,b59ebd63ecb379bd16bb0b10bbb5b8ffd214c6a1..23091c2bcca7539a3689807b097f6b45580fbb00
@@@ -3,10 -3,8 +3,10 @@@
   * readfuncs.c
   *      Reader functions for Postgres tree nodes.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *
   * IDENTIFICATION
@@@ -1991,15 -1352,17 +2040,22 @@@ _readRangeTblEntry(void
                        READ_STRING_FIELD(ctename);
                        READ_UINT_FIELD(ctelevelsup);
                        READ_BOOL_FIELD(self_reference);
-                       READ_NODE_FIELD(ctecoltypes);
-                       READ_NODE_FIELD(ctecoltypmods);
-                       READ_NODE_FIELD(ctecolcollations);
+                       READ_NODE_FIELD(coltypes);
+                       READ_NODE_FIELD(coltypmods);
+                       READ_NODE_FIELD(colcollations);
+                       break;
+               case RTE_NAMEDTUPLESTORE:
+                       READ_STRING_FIELD(enrname);
+                       READ_OID_FIELD(relid);
+                       READ_NODE_FIELD(coltypes);
+                       READ_NODE_FIELD(coltypmods);
+                       READ_NODE_FIELD(colcollations);
                        break;
 +#ifdef PGXC
 +              case RTE_REMOTE_DUMMY:
 +                      /* Nothing to do */
 +                      break;
 +#endif /* PGXC */
                default:
                        elog(ERROR, "unrecognized RTE kind: %d",
                                 (int) local_node->rtekind);
@@@ -2470,10 -1759,8 +2549,11 @@@ _readBitmapIndexScan(void
  
        ReadCommonScan(&local_node->scan);
  
 -      READ_OID_FIELD(indexid);
 +      if (portable_input)
 +              READ_RELID_FIELD(indexid);
 +      else
 +              READ_OID_FIELD(indexid);
+       READ_BOOL_FIELD(isshared);
        READ_NODE_FIELD(indexqual);
        READ_NODE_FIELD(indexqualorig);
  
@@@ -2949,58 -2087,9 +3046,59 @@@ _readAgg(void
        READ_ENUM_FIELD(aggsplit, AggSplit);
        READ_INT_FIELD(numCols);
        READ_ATTRNUMBER_ARRAY(grpColIdx, local_node->numCols);
 +
 +#ifdef PGXC
 +      token = pg_strtok(&length);             /* skip :grpOperators */
 +      local_node->grpOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *oprname; /* operator name */
 +                      char       *leftnspname; /* left type namespace */
 +                      char       *leftname; /* left type name */
 +                      Oid                     oprleft; /* left type */
 +                      char       *rightnspname; /* right type namespace */
 +                      char       *rightname; /* right type name */
 +                      Oid                     oprright; /* right type */
 +                      /* token is already set to nspname */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get operator name */
 +                      oprname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type namespace */
 +                      leftnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type name */
 +                      leftname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type namespace */
 +                      rightnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type name */
 +                      rightname = nullable_string(token, length);
 +                      if (leftname)
 +                              oprleft = get_typname_typid(leftname,
 +                                                                                      NSP_OID(leftnspname));
 +                      else
 +                              oprleft = InvalidOid;
 +                      if (rightname)
 +                              oprright = get_typname_typid(rightname,
 +                                                                                       NSP_OID(rightnspname));
 +                      else
 +                              oprright = InvalidOid;
 +                      local_node->grpOperators[i] = get_operid(oprname,
 +                                                                                                       oprleft,
 +                                                                                                       oprright,
 +                                                                                                       NSP_OID(nspname));
 +              }
 +              else
 +                      local_node->grpOperators[i] = atooid(token);
 +      }
 +#else
        READ_OID_ARRAY(grpOperators, local_node->numCols);
 +#endif
 +
        READ_LONG_FIELD(numGroups);
+       READ_BITMAPSET_FIELD(aggParams);
        READ_NODE_FIELD(groupingSets);
        READ_NODE_FIELD(chain);
  
@@@ -3227,17 -2184,9 +3345,12 @@@ _readHash(void
  
        ReadCommonPlan(&local_node->plan);
  
 -      READ_OID_FIELD(skewTable);
 +      if (portable_input)
 +              READ_RELID_FIELD(skewTable);
 +      else
 +              READ_OID_FIELD(skewTable);
        READ_INT_FIELD(skewColumn);
        READ_BOOL_FIELD(skewInherit);
-       if (portable_input)
-               READ_TYPID_FIELD(skewColType);
-       else
-               READ_OID_FIELD(skewColType);
-       READ_INT_FIELD(skewColTypmod);
  
        READ_DONE();
  }
@@@ -3423,17 -2306,12 +3536,18 @@@ _readSubPlan(void
        READ_NODE_FIELD(paramIds);
        READ_INT_FIELD(plan_id);
        READ_STRING_FIELD(plan_name);
 -      READ_OID_FIELD(firstColType);
 +      if (portable_input)
 +              READ_TYPID_FIELD(firstColType);
 +      else
 +              READ_OID_FIELD(firstColType);
        READ_INT_FIELD(firstColTypmod);
 -      READ_OID_FIELD(firstColCollation);
 +      if (portable_input)
 +              READ_COLLID_FIELD(firstColCollation);
 +      else
 +              READ_OID_FIELD(firstColCollation);
        READ_BOOL_FIELD(useHashTable);
        READ_BOOL_FIELD(unknownEqFalse);
+       READ_BOOL_FIELD(parallel_safe);
        READ_NODE_FIELD(setParam);
        READ_NODE_FIELD(parParam);
        READ_NODE_FIELD(args);
@@@ -3486,204 -2364,40 +3600,238 @@@ _readExtensibleNode(void
        READ_DONE();
  }
  
 +
 +/*
 + * _readRemoteSubplan
 + */
 +static RemoteSubplan *
 +_readRemoteSubplan(void)
 +{
 +      READ_SCAN_FIELDS(RemoteSubplan);
 +
 +      READ_CHAR_FIELD(distributionType);
 +      READ_INT_FIELD(distributionKey);
 +      READ_NODE_FIELD(distributionNodes);
 +      READ_NODE_FIELD(distributionRestrict);
 +      READ_NODE_FIELD(nodeList);
 +      READ_BOOL_FIELD(execOnAll);
 +      READ_NODE_FIELD(sort);
 +      READ_STRING_FIELD(cursor);
 +      READ_INT_FIELD(unique);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readRemoteStmt
 + */
 +static RemoteStmt *
 +_readRemoteStmt(void)
 +{
 +      int i;
 +      READ_LOCALS(RemoteStmt);
 +
 +      READ_ENUM_FIELD(commandType, CmdType);
 +      READ_BOOL_FIELD(hasReturning);
 +      READ_NODE_FIELD(planTree);
 +      READ_NODE_FIELD(rtable);
 +      READ_NODE_FIELD(resultRelations);
 +      READ_NODE_FIELD(subplans);
 +      READ_INT_FIELD(nParamExec);
 +      READ_INT_FIELD(nParamRemote);
 +      if (local_node->nParamRemote > 0)
 +      {
 +              local_node->remoteparams = (RemoteParam *) palloc(
 +                              local_node->nParamRemote * sizeof(RemoteParam));
 +              for (i = 0; i < local_node->nParamRemote; i++)
 +              {
 +                      RemoteParam *rparam = &(local_node->remoteparams[i]);
 +                      token = pg_strtok(&length); /* skip  :paramkind */
 +                      token = pg_strtok(&length);
 +                      rparam->paramkind = (ParamKind) atoi(token);
 +
 +                      token = pg_strtok(&length); /* skip  :paramid */
 +                      token = pg_strtok(&length);
 +                      rparam->paramid = atoi(token);
 +
 +                      token = pg_strtok(&length); /* skip  :paramused */
 +                      token = pg_strtok(&length);
 +                      rparam->paramused = atoi(token);
 +
 +                      token = pg_strtok(&length); /* skip  :paramtype */
 +                      if (portable_input)
 +                      {
 +                              char       *nspname; /* namespace name */
 +                              char       *typname; /* data type name */
 +                              token = pg_strtok(&length); /* get nspname */
 +                              nspname = nullable_string(token, length);
 +                              token = pg_strtok(&length); /* get typname */
 +                              typname = nullable_string(token, length);
 +                              if (typname)
 +                                      rparam->paramtype = get_typname_typid(typname,
 +                                                                                                                NSP_OID(nspname));
 +                              else
 +                                      rparam->paramtype = InvalidOid;
 +                      }
 +                      else
 +                      {
 +                              token = pg_strtok(&length);
 +                              rparam->paramtype = atooid(token);
 +                      }
 +              }
 +      }
 +      else
 +              local_node->remoteparams = NULL;
 +
 +      READ_NODE_FIELD(rowMarks);
 +      READ_CHAR_FIELD(distributionType);
 +      READ_INT_FIELD(distributionKey);
 +      READ_NODE_FIELD(distributionNodes);
 +      READ_NODE_FIELD(distributionRestrict);
 +
 +      READ_DONE();
 +}
 +
 +
 +/*
 + * _readSimpleSort
 + */
 +static SimpleSort *
 +_readSimpleSort(void)
 +{
 +      int i;
 +      READ_LOCALS(SimpleSort);
 +
 +      READ_INT_FIELD(numCols);
 +
 +      token = pg_strtok(&length);             /* skip :sortColIdx */
 +      local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->sortColIdx[i] = atoi(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :sortOperators */
 +      local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *oprname; /* operator name */
 +                      char       *leftnspname; /* left type namespace */
 +                      char       *leftname; /* left type name */
 +                      Oid                     oprleft; /* left type */
 +                      char       *rightnspname; /* right type namespace */
 +                      char       *rightname; /* right type name */
 +                      Oid                     oprright; /* right type */
 +                      /* token is already set to nspname */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get operator name */
 +                      oprname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type namespace */
 +                      leftnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* left type name */
 +                      leftname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type namespace */
 +                      rightnspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* right type name */
 +                      rightname = nullable_string(token, length);
 +                      if (leftname)
 +                              oprleft = get_typname_typid(leftname,
 +                                                                                      NSP_OID(leftnspname));
 +                      else
 +                              oprleft = InvalidOid;
 +                      if (rightname)
 +                              oprright = get_typname_typid(rightname,
 +                                                                                       NSP_OID(rightnspname));
 +                      else
 +                              oprright = InvalidOid;
 +                      local_node->sortOperators[i] = get_operid(oprname,
 +                                                                                                        oprleft,
 +                                                                                                        oprright,
 +                                                                                                        NSP_OID(nspname));
 +              }
 +              else
 +                      local_node->sortOperators[i] = atooid(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :sortCollations */
 +      local_node->sortCollations = (Oid *) palloc(local_node->numCols * sizeof(Oid));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              if (portable_input)
 +              {
 +                      char       *nspname; /* namespace name */
 +                      char       *collname; /* collation name */
 +                      int             collencoding; /* collation encoding */
 +                      /* the token is already read */
 +                      nspname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get collname */
 +                      collname = nullable_string(token, length);
 +                      token = pg_strtok(&length); /* get nargs */
 +                      collencoding = atoi(token);
 +                      if (collname)
 +                              local_node->sortCollations[i] = get_collid(collname,
 +                                                                                                         collencoding,
 +                                                                                                         NSP_OID(nspname));
 +                      else
 +                              local_node->sortCollations[i] = InvalidOid;
 +              }
 +              else
 +                      local_node->sortCollations[i] = atooid(token);
 +      }
 +
 +      token = pg_strtok(&length);             /* skip :nullsFirst */
 +      local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool));
 +      for (i = 0; i < local_node->numCols; i++)
 +      {
 +              token = pg_strtok(&length);
 +              local_node->nullsFirst[i] = strtobool(token);
 +      }
 +
 +      READ_DONE();
 +}
 +
 +
+ /*
+  * _readPartitionBoundSpec
+  */
+ static PartitionBoundSpec *
+ _readPartitionBoundSpec(void)
+ {
+       READ_LOCALS(PartitionBoundSpec);
+       READ_CHAR_FIELD(strategy);
+       READ_NODE_FIELD(listdatums);
+       READ_NODE_FIELD(lowerdatums);
+       READ_NODE_FIELD(upperdatums);
+       /* XXX somebody forgot location field; too late to change for v10 */
+       local_node->location = -1;
+       READ_DONE();
+ }
+ /*
+  * _readPartitionRangeDatum
+  */
+ static PartitionRangeDatum *
+ _readPartitionRangeDatum(void)
+ {
+       READ_LOCALS(PartitionRangeDatum);
+       READ_BOOL_FIELD(infinite);
+       READ_NODE_FIELD(value);
+       /* XXX somebody forgot location field; too late to change for v10 */
+       local_node->location = -1;
+       READ_DONE();
+ }
  /*
   * parseNodeString
   *
@@@ -3914,12 -2638,10 +4072,16 @@@ parseNodeString(void
                return_value = _readAlternativeSubPlan();
        else if (MATCH("EXTENSIBLENODE", 14))
                return_value = _readExtensibleNode();
 +      else if (MATCH("REMOTESUBPLAN", 13))
 +              return_value = _readRemoteSubplan();
 +      else if (MATCH("REMOTESTMT", 10))
 +              return_value = _readRemoteStmt();
 +      else if (MATCH("SIMPLESORT", 10))
 +              return_value = _readSimpleSort();
+       else if (MATCH("PARTITIONBOUND", 14))
+               return_value = _readPartitionBoundSpec();
+       else if (MATCH("PARTRANGEDATUM", 14))
+               return_value = _readPartitionRangeDatum();
        else
        {
                elog(ERROR, "badly formatted node string \"%.32s\"...", token);
index 34bc42b19689b4379fb3b19416f9f11e0a3e24ab,78ca55bbd6dc1049f624bb0c401fee823dab1168..196c6194cb39de7efbab5bec49596e679f4c2240
@@@ -3,8 -3,7 +3,8 @@@
   * allpaths.c
   *      Routines to find possible search paths for processing a query
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@@ -625,6 -646,14 +653,17 @@@ set_rel_consider_parallel(PlannerInfo *
                         * executed only once.
                         */
                        return;
+               case RTE_NAMEDTUPLESTORE:
+                       /*
+                        * tuplestore cannot be shared, at least without more
+                        * infrastructure to support that.
+                        */
+                       return;
++
++              case RTE_REMOTE_DUMMY:
++                      return;
        }
  
        /*
@@@ -1325,14 -1423,8 +1433,14 @@@ add_paths_to_append_rel(PlannerInfo *ro
  
                /* Generate a partial append path. */
                appendpath = create_append_path(rel, partial_subpaths, NULL,
-                                                                               parallel_workers);
+                                                                               parallel_workers, partitioned_rels);
 -              add_partial_path(rel, (Path *) appendpath);
 +
 +              /*
 +               * XL: In case we had to re-distribute the child relations, don't
 +               * do anything. Otherwise create_gather_path hits an Assert etc.
 +               */
 +              if (appendpath->path.parallel_safe)
 +                      add_partial_path(rel, (Path *) appendpath);
        }
  
        /*
index 485717accea120654a9f56fa6d5c4c1d5e5f9e68,cdb18d978db4e96a90f5330dca723000a5426243..6e4808d51bbc7e59197b4b316adeb2c4e60e0136
@@@ -60,8 -60,7 +60,8 @@@
   * values.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
@@@ -131,7 -126,7 +131,8 @@@ bool               enable_nestloop = true
  bool          enable_material = true;
  bool          enable_mergejoin = true;
  bool          enable_hashjoin = true;
 +bool          enable_fast_query_shipping = true;
+ bool          enable_gathermerge = true;
  
  typedef struct
  {
@@@ -4781,29 -5083,96 +5098,122 @@@ page_size(double tuples, int width
        return ceil(relation_byte_size(tuples, width) / BLCKSZ);
  }
  
 +#ifdef XCP
 +void
 +cost_remote_subplan(Path *path,
 +                        Cost input_startup_cost, Cost input_total_cost,
 +                        double tuples, int width, int replication)
 +{
 +      Cost            startup_cost = input_startup_cost + remote_query_cost;
 +      Cost            run_cost = input_total_cost - input_startup_cost;
 +
 +      path->rows = tuples;
 +
 +      /*
 +       * Charge 2x cpu_operator_cost per tuple to reflect bookkeeping overhead.
 +       */
 +      run_cost += 2 * cpu_operator_cost * tuples;
 +
 +      /*
 +       * Estimate cost of sending data over network
 +       */
 +      run_cost += network_byte_cost * tuples * width * replication;
 +
 +      path->startup_cost = startup_cost;
 +      path->total_cost = startup_cost + run_cost;
 +}
 +#endif
++
+ /*
+  * Estimate the fraction of the work that each worker will do given the
+  * number of workers budgeted for the path.
+  */
+ static double
+ get_parallel_divisor(Path *path)
+ {
+       double          parallel_divisor = path->parallel_workers;
+       double          leader_contribution;
+       /*
+        * Early experience with parallel query suggests that when there is only
+        * one worker, the leader often makes a very substantial contribution to
+        * executing the parallel portion of the plan, but as more workers are
+        * added, it does less and less, because it's busy reading tuples from the
+        * workers and doing whatever non-parallel post-processing is needed.  By
+        * the time we reach 4 workers, the leader no longer makes a meaningful
+        * contribution.  Thus, for now, estimate that the leader spends 30% of
+        * its time servicing each worker, and the remainder executing the
+        * parallel plan.
+        */
+       leader_contribution = 1.0 - (0.3 * path->parallel_workers);
+       if (leader_contribution > 0)
+               parallel_divisor += leader_contribution;
+       return parallel_divisor;
+ }
+ /*
+  * compute_bitmap_pages
+  *
+  * compute number of pages fetched from heap in bitmap heap scan.
+  */
+ double
+ compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual,
+                                        int loop_count, Cost *cost, double *tuple)
+ {
+       Cost            indexTotalCost;
+       Selectivity indexSelectivity;
+       double          T;
+       double          pages_fetched;
+       double          tuples_fetched;
+       /*
+        * Fetch total cost of obtaining the bitmap, as well as its total
+        * selectivity.
+        */
+       cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity);
+       /*
+        * Estimate number of main-table pages fetched.
+        */
+       tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
+       T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
+       if (loop_count > 1)
+       {
+               /*
+                * For repeated bitmap scans, scale up the number of tuples fetched in
+                * the Mackert and Lohman formula by the number of scans, so that we
+                * estimate the number of pages fetched by all the scans. Then
+                * pro-rate for one scan.
+                */
+               pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
+                                                                                       baserel->pages,
+                                                                                       get_indexpath_pages(bitmapqual),
+                                                                                       root);
+               pages_fetched /= loop_count;
+       }
+       else
+       {
+               /*
+                * For a single scan, the number of heap pages that need to be fetched
+                * is the same as the Mackert and Lohman formula for the case T <= b
+                * (ie, no re-reads needed).
+                */
+               pages_fetched =
+                       (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
+       }
+       if (pages_fetched >= T)
+               pages_fetched = T;
+       else
+               pages_fetched = ceil(pages_fetched);
+       if (cost)
+               *cost = indexTotalCost;
+       if (tuple)
+               *tuple = tuples_fetched;
+       return pages_fetched;
+ }
index 637926ff3a46500eb84ca145a696471cdab10a71,94beeb858d8cddcd582ab7f61525bed5dcd3da07..af89e9d288366b996bc1f5389b68937f0fa41781
@@@ -5,8 -5,7 +5,8 @@@
   *      Planning is complete, we just need to convert the selected
   *      Path into a Plan.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@@ -103,13 -81,7 +103,14 @@@ static Plan *create_join_plan(PlannerIn
  static Plan *create_append_plan(PlannerInfo *root, AppendPath *best_path);
  static Plan *create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path);
  static Result *create_result_plan(PlannerInfo *root, ResultPath *best_path);
 +#ifdef XCP
 +static void adjust_subplan_distribution(PlannerInfo *root, Distribution *pathd,
 +                                                Distribution *subd);
 +static RemoteSubplan *create_remotescan_plan(PlannerInfo *root,
 +                                         RemoteSubPath *best_path);
 +static char *get_internal_cursor(void);
 +#endif
+ static ProjectSet *create_project_set_plan(PlannerInfo *root, ProjectSetPath *best_path);
  static Material *create_material_plan(PlannerInfo *root, MaterialPath *best_path,
                                         int flags);
  static Plan *create_unique_plan(PlannerInfo *root, UniquePath *best_path,
@@@ -218,13 -195,16 +224,17 @@@ static FunctionScan *make_functionscan(
                                  Index scanrelid, List *functions, bool funcordinality);
  static ValuesScan *make_valuesscan(List *qptlist, List *qpqual,
                                Index scanrelid, List *values_lists);
+ static TableFuncScan *make_tablefuncscan(List *qptlist, List *qpqual,
+                                  Index scanrelid, TableFunc *tablefunc);
  static CteScan *make_ctescan(List *qptlist, List *qpqual,
                         Index scanrelid, int ctePlanId, int cteParam);
+ static NamedTuplestoreScan *make_namedtuplestorescan(List *qptlist, List *qpqual,
+                                                Index scanrelid, char *enrname);
  static WorkTableScan *make_worktablescan(List *qptlist, List *qpqual,
                                   Index scanrelid, int wtParam);
- static Append *make_append(List *appendplans, List *tlist);
+ static Append *make_append(List *appendplans, List *tlist, List *partitioned_rels);
 -static RecursiveUnion *make_recursive_union(List *tlist,
 +static RecursiveUnion *make_recursive_union(PlannerInfo *root,
 +                                       List *tlist,
                                         Plan *lefttree,
                                         Plan *righttree,
                                         int wtParam,
@@@ -300,14 -280,9 +310,16 @@@ static ModifyTable *make_modifytable(Pl
                                 List *resultRelations, List *subplans,
                                 List *withCheckOptionLists, List *returningLists,
                                 List *rowMarks, OnConflictExpr *onconflict, int epqParam);
+ static GatherMerge *create_gather_merge_plan(PlannerInfo *root,
+                                                GatherMergePath *best_path);
  
 +#ifdef XCP
 +static int add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll,
 +                              bool nulls_first,int numCols, AttrNumber *sortColIdx,
 +                              Oid *sortOperators, Oid *collations, bool *nullsFirst);
 +#endif
 +
 +static RemoteSubplan *find_push_down_plan(Plan *plan, bool force);
  
  /*
   * create_plan
@@@ -509,8 -479,12 +527,12 @@@ create_plan_recurse(PlannerInfo *root, 
                case T_Limit:
                        plan = (Plan *) create_limit_plan(root,
                                                                                          (LimitPath *) best_path,
 -                                                                                        flags);
 +                                                                                        flags, 0, 1);
                        break;
+               case T_GatherMerge:
+                       plan = (Plan *) create_gather_merge_plan(root,
+                                                                                         (GatherMergePath *) best_path);
+                       break;
                default:
                        elog(ERROR, "unrecognized node type: %d",
                                 (int) best_path->pathtype);
@@@ -1297,17 -1323,10 +1371,18 @@@ create_unique_plan(PlannerInfo *root, U
                 */
                if (!is_projection_capable_plan(subplan) &&
                        !tlist_same_exprs(newtlist, subplan->targetlist))
-                       subplan = inject_projection_plan(subplan, newtlist);
+                       subplan = inject_projection_plan(subplan, newtlist,
+                                                                                        best_path->path.parallel_safe);
                else
                        subplan->targetlist = newtlist;
 +#ifdef XCP
 +              /*
 +               * RemoteSubplan is conditionally projection capable - it is pushing
 +               * projection to the data nodes
 +               */
 +              if (IsA(subplan, RemoteSubplan))
 +                      subplan->lefttree->targetlist = newtlist;
 +#endif
        }
  
        /*
@@@ -1911,36 -2000,8 +2057,37 @@@ create_minmaxagg_plan(PlannerInfo *root
                plan->plan_rows = 1;
                plan->plan_width = mminfo->path->pathtarget->width;
                plan->parallel_aware = false;
+               plan->parallel_safe = mminfo->path->parallel_safe;
  
 +              /*
 +               * XL: Add a remote subplan, splitting the LIMIT into a remote and
 +               * local part LIMIT parts.
 +               *
 +               * XXX This should probably happen when constructing the path in
 +               * create_minmaxagg_path(), not this late.
 +               *
 +               * XXX The costing in here is mostly bogus. Not that it'd matter
 +               * this late, though.
 +               */
 +              if (mminfo->path->distribution)
 +              {
 +                      plan = (Plan *) make_remotesubplan(root, plan,
 +                                                                                         NULL,
 +                                                                                         mminfo->path->distribution,
 +                                                                                         mminfo->path->pathkeys);
 +
 +                      plan = (Plan *) make_limit(plan,
 +                                                                         subparse->limitOffset,
 +                                                                         subparse->limitCount,
 +                                                                         0, 1);
 +
 +                      plan->startup_cost = mminfo->path->startup_cost;
 +                      plan->total_cost = mminfo->pathcost;
 +                      plan->plan_rows = 1;
 +                      plan->plan_width = mminfo->path->pathtarget->width;
 +                      plan->parallel_aware = false;
 +              }
 +
                /* Convert the plan into an InitPlan in the outer query. */
                SS_make_initplan_from_plan(root, subroot, plan, mminfo->param);
        }
@@@ -5204,361 -5233,6 +5538,361 @@@ make_worktablescan(List *qptlist
        return node;
  }
  
-                                       tle = tlist_member((Node *) em->em_expr, tlist);
 +#ifdef XCP
 +/*
 + * make_remotesubplan
 + *    Create a RemoteSubplan node to execute subplan on remote nodes.
 + *  leftree - the subplan which we want to push down to remote node.
 + *  resultDistribution - the distribution of the remote result. May be NULL -
 + * results are coming to the invoking node
 + *  execDistribution - determines how source data of the subplan are
 + * distributed, where we should send the subplan and how combine results.
 + *    pathkeys - the remote subplan is sorted according to these keys, executor
 + *            should perform merge sort of incoming tuples
 + */
 +RemoteSubplan *
 +make_remotesubplan(PlannerInfo *root,
 +                                 Plan *lefttree,
 +                                 Distribution *resultDistribution,
 +                                 Distribution *execDistribution,
 +                                 List *pathkeys)
 +{
 +      RemoteSubplan *node = makeNode(RemoteSubplan);
 +      Plan       *plan = &node->scan.plan;
 +      Bitmapset  *tmpset;
 +      int                     nodenum;
 +
 +      /* Sanity checks */
 +      Assert(!equal(resultDistribution, execDistribution));
 +      Assert(!IsA(lefttree, RemoteSubplan));
 +
 +      if (resultDistribution)
 +      {
 +              node->distributionType = resultDistribution->distributionType;
 +              node->distributionKey = InvalidAttrNumber;
 +              if (resultDistribution->distributionExpr)
 +              {
 +                      ListCell   *lc;
 +                      Expr       *expr;
 +
 +                      /* XXX Is that correct to reference a column of different type? */
 +                      if (IsA(resultDistribution->distributionExpr, RelabelType))
 +                              expr = ((RelabelType *) resultDistribution->distributionExpr)->arg;
 +                      else
 +                              expr = (Expr *) resultDistribution->distributionExpr;
 +
 +                      /* Find distribution expression in the target list */
 +                      foreach(lc, lefttree->targetlist)
 +                      {
 +                              TargetEntry *tle = (TargetEntry *) lfirst(lc);
 +
 +                              if (equal(tle->expr, expr))
 +                              {
 +                                      node->distributionKey = tle->resno;
 +                                      break;
 +                              }
 +                      }
 +
 +                      if (node->distributionKey == InvalidAttrNumber)
 +                      {
 +                              TargetEntry *newtle;
 +
 +                              /* The expression is not found, need to add junk */
 +                              newtle = makeTargetEntry(expr,
 +                                                                               list_length(lefttree->targetlist) + 1,
 +                                                                           NULL,
 +                                                                               true);
 +
 +                              if (is_projection_capable_plan(lefttree))
 +                              {
 +                                      /* Ok to modify subplan's target list */
 +                                      lefttree->targetlist = lappend(lefttree->targetlist, newtle);
 +                              }
 +                              else
 +                              {
 +                                      /* Use Result node to calculate expression */
 +                                      List *newtlist = list_copy(lefttree->targetlist);
 +                                      newtlist = lappend(newtlist, newtle);
 +                                      lefttree = (Plan *) make_result(newtlist, NULL, lefttree);
 +                              }
 +
 +                              node->distributionKey = newtle->resno;
 +                      }
 +              }
 +              /*
 +               * The distributionNodes describes result distribution
 +               */
 +              tmpset = bms_copy(resultDistribution->nodes);
 +              node->distributionNodes = NIL;
 +              while ((nodenum = bms_first_member(tmpset)) >= 0)
 +                      node->distributionNodes = lappend_int(node->distributionNodes,
 +                                                                                                nodenum);
 +              bms_free(tmpset);
 +              /*
 +               * The distributionRestrict defines the set of nodes where results are
 +               * actually shipped. These are the nodes where upper level step
 +               * is executed.
 +               */
 +              if (resultDistribution->restrictNodes)
 +              {
 +                      tmpset = bms_copy(resultDistribution->restrictNodes);
 +                      node->distributionRestrict = NIL;
 +                      while ((nodenum = bms_first_member(tmpset)) >= 0)
 +                              node->distributionRestrict =
 +                                              lappend_int(node->distributionRestrict, nodenum);
 +                      bms_free(tmpset);
 +              }
 +              else
 +                      node->distributionRestrict = list_copy(node->distributionNodes);
 +      }
 +      else
 +      {
 +              node->distributionType = LOCATOR_TYPE_NONE;
 +              node->distributionKey = InvalidAttrNumber;
 +              node->distributionNodes = NIL;
 +      }
 +
 +      /* determine where subplan will be executed */
 +      if (execDistribution)
 +      {
 +              if (execDistribution->restrictNodes)
 +                      tmpset = bms_copy(execDistribution->restrictNodes);
 +              else
 +                      tmpset = bms_copy(execDistribution->nodes);
 +              node->nodeList = NIL;
 +              while ((nodenum = bms_first_member(tmpset)) >= 0)
 +                      node->nodeList = lappend_int(node->nodeList, nodenum);
 +              bms_free(tmpset);
 +              node->execOnAll = list_length(node->nodeList) == 1 ||
 +                              !IsLocatorReplicated(execDistribution->distributionType);
 +      }
 +      else
 +      {
 +              /*
 +               * Prepare single execution of replicated subplan. Choose one node from
 +               * the execution node list, preferrably the node is also a member of
 +               * the list of result nodes, so later all node executors contact the
 +               * same node to get tuples
 +               */
 +              tmpset = NULL;
 +              if (!bms_is_empty(resultDistribution->restrictNodes))
 +                      tmpset = bms_copy(resultDistribution->restrictNodes);
 +              else
 +                      tmpset = bms_copy(resultDistribution->nodes);
 +              /*
 +               * If result goes on single node execute subplan locally
 +               */
 +              if (bms_num_members(tmpset) > 1)
 +              {
 +                      /* get one execution node TODO: load balancing */
 +                      nodenum = bms_any_member(tmpset);
 +                      node->nodeList = list_make1_int(nodenum);
 +                      node->execOnAll = true;
 +              }
 +              else
 +              {
 +                      node->nodeList = NIL;
 +                      node->execOnAll = false;
 +              }
 +              bms_free(tmpset);
 +      }
 +
 +      /* We do not need to merge sort if only one node is yielding tuples */
 +      if (pathkeys && node->execOnAll && list_length(node->nodeList) > 1)
 +      {
 +              List       *tlist = lefttree->targetlist;
 +              ListCell   *i;
 +              int                     numsortkeys;
 +              AttrNumber *sortColIdx;
 +              Oid                *sortOperators;
 +              Oid                *collations;
 +              bool       *nullsFirst;
 +
 +              /*
 +               * We will need at most list_length(pathkeys) sort columns; possibly less
 +               */
 +              numsortkeys = list_length(pathkeys);
 +              sortColIdx = (AttrNumber *) palloc(numsortkeys * sizeof(AttrNumber));
 +              sortOperators = (Oid *) palloc(numsortkeys * sizeof(Oid));
 +              collations = (Oid *) palloc(numsortkeys * sizeof(Oid));
 +              nullsFirst = (bool *) palloc(numsortkeys * sizeof(bool));
 +
 +              numsortkeys = 0;
 +
 +              foreach(i, pathkeys)
 +              {
 +                      PathKey    *pathkey = (PathKey *) lfirst(i);
 +                      EquivalenceClass *ec = pathkey->pk_eclass;
 +                      TargetEntry *tle = NULL;
 +                      Oid                     pk_datatype = InvalidOid;
 +                      Oid                     sortop;
 +                      ListCell   *j;
 +
 +                      if (ec->ec_has_volatile)
 +                      {
 +                              /*
 +                               * If the pathkey's EquivalenceClass is volatile, then it must
 +                               * have come from an ORDER BY clause, and we have to match it to
 +                               * that same targetlist entry.
 +                               */
 +                              if (ec->ec_sortref == 0)        /* can't happen */
 +                                      elog(ERROR, "volatile EquivalenceClass has no sortref");
 +                              tle = get_sortgroupref_tle(ec->ec_sortref, tlist);
 +                              Assert(tle);
 +                              Assert(list_length(ec->ec_members) == 1);
 +                              pk_datatype = ((EquivalenceMember *) linitial(ec->ec_members))->em_datatype;
 +                      }
 +                      else
 +                      {
 +                              /*
 +                               * Otherwise, we can sort by any non-constant expression listed in
 +                               * the pathkey's EquivalenceClass.  For now, we take the first one
 +                               * that corresponds to an available item in the tlist.  If there
 +                               * isn't any, use the first one that is an expression in the
 +                               * input's vars.  (The non-const restriction only matters if the
 +                               * EC is below_outer_join; but if it isn't, it won't contain
 +                               * consts anyway, else we'd have discarded the pathkey as
 +                               * redundant.)
 +                               *
 +                               * XXX if we have a choice, is there any way of figuring out which
 +                               * might be cheapest to execute?  (For example, int4lt is likely
 +                               * much cheaper to execute than numericlt, but both might appear
 +                               * in the same equivalence class...)  Not clear that we ever will
 +                               * have an interesting choice in practice, so it may not matter.
 +                               */
 +                              foreach(j, ec->ec_members)
 +                              {
 +                                      EquivalenceMember *em = (EquivalenceMember *) lfirst(j);
 +
 +                                      if (em->em_is_const)
 +                                              continue;
 +
-                                       tle = tlist_member_ignore_relabel((Node *) em->em_expr, tlist);
++                                      tle = tlist_member(em->em_expr, tlist);
 +                                      if (tle)
 +                                      {
 +                                              pk_datatype = em->em_datatype;
 +                                              break;          /* found expr already in tlist */
 +                                      }
 +
 +                                      /*
 +                                       * We can also use it if the pathkey expression is a relabel
 +                                       * of the tlist entry, or vice versa.  This is needed for
 +                                       * binary-compatible cases (cf. make_pathkey_from_sortinfo).
 +                                       * We prefer an exact match, though, so we do the basic search
 +                                       * first.
 +                                       */
++                                      tle = tlist_member_ignore_relabel(em->em_expr, tlist);
 +                                      if (tle)
 +                                      {
 +                                              pk_datatype = em->em_datatype;
 +                                              break;          /* found expr already in tlist */
 +                                      }
 +                              }
 +
 +                              if (!tle)
 +                              {
 +                                      /* No matching tlist item; look for a computable expression */
 +                                      Expr       *sortexpr = NULL;
 +
 +                                      foreach(j, ec->ec_members)
 +                                      {
 +                                              EquivalenceMember *em = (EquivalenceMember *) lfirst(j);
 +                                              List       *exprvars;
 +                                              ListCell   *k;
 +
 +                                              if (em->em_is_const)
 +                                                      continue;
 +                                              sortexpr = em->em_expr;
 +                                              exprvars = pull_var_clause((Node *) sortexpr,
 +                                                                                                 PVC_INCLUDE_AGGREGATES |
 +                                                                                                 PVC_INCLUDE_PLACEHOLDERS);
 +                                              foreach(k, exprvars)
 +                                              {
 +                                                      if (!tlist_member_ignore_relabel(lfirst(k), tlist))
 +                                                              break;
 +                                              }
 +                                              list_free(exprvars);
 +                                              if (!k)
 +                                              {
 +                                                      pk_datatype = em->em_datatype;
 +                                                      break;  /* found usable expression */
 +                                              }
 +                                      }
 +                                      if (!j)
 +                                              elog(ERROR, "could not find pathkey item to sort");
 +
 +                                      /*
 +                                       * Do we need to insert a Result node?
 +                                       */
 +                                      if (!is_projection_capable_plan(lefttree))
 +                                      {
 +                                              /* copy needed so we don't modify input's tlist below */
 +                                              tlist = copyObject(tlist);
 +                                              lefttree = (Plan *) make_result(tlist, NULL, lefttree);
 +                                      }
 +
 +                                      /*
 +                                       * Add resjunk entry to input's tlist
 +                                       */
 +                                      tle = makeTargetEntry(sortexpr,
 +                                                                                list_length(tlist) + 1,
 +                                                                                NULL,
 +                                                                                true);
 +                                      tlist = lappend(tlist, tle);
 +                                      lefttree->targetlist = tlist;   /* just in case NIL before */
 +                              }
 +                      }
 +
 +                      /*
 +                       * Look up the correct sort operator from the PathKey's slightly
 +                       * abstracted representation.
 +                       */
 +                      sortop = get_opfamily_member(pathkey->pk_opfamily,
 +                                                                               pk_datatype,
 +                                                                               pk_datatype,
 +                                                                               pathkey->pk_strategy);
 +                      if (!OidIsValid(sortop))        /* should not happen */
 +                              elog(ERROR, "could not find member %d(%u,%u) of opfamily %u",
 +                                       pathkey->pk_strategy, pk_datatype, pk_datatype,
 +                                       pathkey->pk_opfamily);
 +
 +                      /*
 +                       * The column might already be selected as a sort key, if the pathkeys
 +                       * contain duplicate entries.  (This can happen in scenarios where
 +                       * multiple mergejoinable clauses mention the same var, for example.)
 +                       * So enter it only once in the sort arrays.
 +                       */
 +                      numsortkeys = add_sort_column(tle->resno,
 +                                                                                sortop,
 +                                                                                pathkey->pk_eclass->ec_collation,
 +                                                                                pathkey->pk_nulls_first,
 +                                                                                numsortkeys,
 +                                                                                sortColIdx, sortOperators,
 +                                                                                collations, nullsFirst);
 +              }
 +              Assert(numsortkeys > 0);
 +
 +              node->sort = makeNode(SimpleSort);
 +              node->sort->numCols = numsortkeys;
 +              node->sort->sortColIdx = sortColIdx;
 +              node->sort->sortOperators = sortOperators;
 +              node->sort->sortCollations = collations;
 +              node->sort->nullsFirst = nullsFirst;
 +      }
 +
 +      plan->qual = NIL;
 +      plan->targetlist = lefttree->targetlist;
 +      plan->lefttree = lefttree;
 +      plan->righttree = NULL;
 +      copy_plan_costsize(plan, lefttree);
 +
 +      node->cursor = get_internal_cursor();
 +      node->unique = 0;
 +      return node;
 +}
 +#endif /* XCP */
 +
 +
  ForeignScan *
  make_foreignscan(List *qptlist,
                                 List *qpqual,
@@@ -7027,15 -6594,15 +7409,24 @@@ is_projection_capable_plan(Plan *plan
                case T_MergeAppend:
                case T_RecursiveUnion:
                        return false;
 +#ifdef XCP
 +              /*
 +               * Remote subplan may push down projection to the data nodes if do not
 +               * performs merge sort
 +               */
 +              case T_RemoteSubplan:
 +                      return ((RemoteSubplan *) plan)->sort == NULL &&
 +                                      is_projection_capable_plan(plan->lefttree);
 +#endif
+               case T_ProjectSet:
+                       /*
+                        * Although ProjectSet certainly projects, say "no" because we
+                        * don't want the planner to randomly replace its tlist with
+                        * something else; the SRFs have to stay at top level.  This might
+                        * get relaxed later.
+                        */
+                       return false;
                default:
                        break;
        }
index f7d6dace59deacefdb925305b708d1577d5ff8cc,55657360fc380a6a8b76e44912f66504c38c706b..c9331d272ad44b0b76f7fbafa0709d1919ff3f6b
@@@ -17,8 -17,7 +17,8 @@@
   * scan all the rows anyway.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
index 89031d265eaf4f5756cef685c44c5d846f1ea89e,40cb79d4cd23ef90d0aa31dabf4ab52899932efb..b49a91a3b0c2c7206b6015cf69bfc8a9e92ca93c
@@@ -3,8 -3,7 +3,8 @@@
   * planner.c
   *      The query optimizer external interface.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@@ -158,16 -182,9 +188,19 @@@ static List *make_pathkeys_for_window(P
  static PathTarget *make_sort_input_target(PlannerInfo *root,
                                           PathTarget *final_target,
                                           bool *have_postponed_srfs);
 +static bool equal_distributions(PlannerInfo *root, Distribution *dst1,
 +                                      Distribution *dst2);
 +static bool grouping_distribution_match(PlannerInfo *root, Query *parse,
 +                                        Path *path, List *clauses);
 +static bool groupingsets_distribution_match(PlannerInfo *root, Query *parse,
 +                                        Path *path);
 +static Path *adjust_path_distribution(PlannerInfo *root, Query *parse,
 +                                        Path *path);
 +static bool can_push_down_grouping(PlannerInfo *root, Query *parse, Path *path);
 +static bool can_push_down_window(PlannerInfo *root, Path *path);
+ static void adjust_paths_for_srfs(PlannerInfo *root, RelOptInfo *rel,
+                                         List *targets, List *targets_contain_srfs);
  
  /*****************************************************************************
   *
@@@ -216,17 -224,6 +249,11 @@@ standard_planner(Query *parse, int curs
        ListCell   *lp,
                           *lr;
  
-       /* Cursor options may come from caller or from DECLARE CURSOR stmt */
-       if (parse->utilityStmt &&
-               IsA(parse->utilityStmt, DeclareCursorStmt))
-               cursorOptions |= ((DeclareCursorStmt *) parse->utilityStmt)->options;
 +#ifdef XCP
 +      if (IS_PGXC_LOCAL_COORDINATOR && parse->utilityStmt &&
 +                      IsA(parse->utilityStmt, RemoteQuery))
 +              return pgxc_direct_planner(parse, cursorOptions, boundParams);
 +#endif
        /*
         * Set up global state for this planner invocation.  This data is needed
         * across all levels of sub-Query that might exist in the given command,
        result->rowMarks = glob->finalrowmarks;
        result->relationOids = glob->relationOids;
        result->invalItems = glob->invalItems;
 +#ifdef XCP
 +      result->distributionType = LOCATOR_TYPE_NONE;
 +      result->distributionKey = InvalidAttrNumber;
 +      result->distributionNodes = NULL;
 +#endif
        result->nParamExec = glob->nParamExec;
+       /* utilityStmt should be null, but we might as well copy it */
+       result->utilityStmt = parse->utilityStmt;
+       result->stmt_location = parse->stmt_location;
+       result->stmt_len = parse->stmt_len;
  
        return result;
  }
@@@ -538,9 -512,8 +570,10 @@@ subquery_planner(PlannerGlobal *glob, Q
        memset(root->upper_targets, 0, sizeof(root->upper_targets));
        root->processed_tlist = NIL;
        root->grouping_map = NULL;
 +      root->recursiveOk = true;
 +
        root->minmax_aggs = NIL;
+       root->qual_security_level = 0;
        root->hasInheritedTarget = false;
        root->hasRecursion = hasRecursion;
        if (hasRecursion)
@@@ -2368,31 -2332,6 +2537,32 @@@ preprocess_rowmarks(PlannerInfo *root
                 */
                CheckSelectLocking(parse, ((RowMarkClause *)
                                                                   linitial(parse->rowMarks))->strength);
-                       Bitmapset  *baserels = get_base_rel_indexes((Node *) parse->jointree);
 +
 +              if (parse->jointree)
 +              {
++                      Bitmapset  *baserels = get_relids_in_jointree((Node *)
++                                      parse->jointree, false);
 +                      int x, num_rels = 0;
 +                      bool dist_found = false;
 +
 +                      while ((x = bms_first_member(baserels)) >= 0)
 +                      {
 +                              RangeTblEntry *rte = rt_fetch(x, parse->rtable);
 +                              RelationLocInfo *locinfo = NULL;
 +                              if (OidIsValid(rte->relid))
 +                                      locinfo = GetRelationLocInfo(rte->relid);
 +                              if (locinfo && !IsRelationReplicated(locinfo))
 +                                      dist_found = true;
 +                              num_rels++;
 +                      }
 +
 +                      if (dist_found && num_rels > 1)
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 +                                               errmsg("%s is not allowed with joins",
 +                                                       LCS_asString(((RowMarkClause *)
 +                                                                       linitial(parse->rowMarks))->strength))));
 +              }
        }
        else
        {
@@@ -4075,49 -4015,45 +4340,61 @@@ create_grouping_paths(PlannerInfo *root
  
        if (can_hash)
        {
-               hashaggtablesize = estimate_hashagg_tablesize(cheapest_path,
-                                                                                                         agg_costs,
-                                                                                                         dNumGroups);
-               /*
-                * Provided that the estimated size of the hashtable does not exceed
-                * work_mem, we'll generate a HashAgg Path, although if we were unable
-                * to sort above, then we'd better generate a Path, so that we at
-                * least have one.
-                */
-               if (hashaggtablesize < work_mem * 1024L ||
-                       grouped_rel->pathlist == NIL)
+               if (parse->groupingSets)
                {
-                       /* Don't mess with the cheapest path directly. */
-                       Path *path = cheapest_path;
                        /*
-                        * If the grouping can't be fully pushed down, we'll push down the
-                        * first phase of the aggregate, and redistribute only the partial
-                        * results.
-                        *
-                        * If if can be pushed down, disable construction of complex
-                        * distributed paths.
+                        * Try for a hash-only groupingsets path over unsorted input.
                         */
-                       if (! can_push_down_grouping(root, parse, path))
-                               path = create_remotesubplan_path(root, path, NULL);
-                       else
-                               try_distributed_aggregation = false;
+                       consider_groupingsets_paths(root, grouped_rel,
+                                                                               cheapest_path, false, true, target,
+                                                                               gd, agg_costs, dNumGroups);
+               }
+               else
+               {
+                       hashaggtablesize = estimate_hashagg_tablesize(cheapest_path,
+                                                                                                                 agg_costs,
+                                                                                                                 dNumGroups);
  
                        /*
-                        * We just need an Agg over the cheapest-total input path, since
-                        * input order won't matter.
+                        * Provided that the estimated size of the hashtable does not
+                        * exceed work_mem, we'll generate a HashAgg Path, although if we
+                        * were unable to sort above, then we'd better generate a Path, so
+                        * that we at least have one.
                         */
-                       add_path(grouped_rel, (Path *)
-                                        create_agg_path(root, grouped_rel,
-                                                                        path,
-                                                                        target,
-                                                                        AGG_HASHED,
-                                                                        AGGSPLIT_SIMPLE,
-                                                                        parse->groupClause,
-                                                                        (List *) parse->havingQual,
-                                                                        agg_costs,
-                                                                        dNumGroups));
+                       if (hashaggtablesize < work_mem * 1024L ||
+                               grouped_rel->pathlist == NIL)
+                       {
++                              /* Don't mess with the cheapest path directly. */
++                              Path *path = cheapest_path;
++
++                              /*
++                               * If the grouping can't be fully pushed down, we'll push down the
++                               * first phase of the aggregate, and redistribute only the partial
++                               * results.
++                               *
++                               * If if can be pushed down, disable construction of complex
++                               * distributed paths.
++                               */
++                              if (! can_push_down_grouping(root, parse, path))
++                                      path = create_remotesubplan_path(root, path, NULL);
++                              else
++                                      try_distributed_aggregation = false;
++
+                               /*
+                                * We just need an Agg over the cheapest-total input path,
+                                * since input order won't matter.
+                                */
+                               add_path(grouped_rel, (Path *)
+                                                create_agg_path(root, grouped_rel,
 -                                                                               cheapest_path,
++                                                                               path,
+                                                                                target,
+                                                                                AGG_HASHED,
+                                                                                AGGSPLIT_SIMPLE,
+                                                                                parse->groupClause,
+                                                                                (List *) parse->havingQual,
+                                                                                agg_costs,
+                                                                                dNumGroups));
+                       }
                }
  
                /*
                }
        }
  
 -      /* Give a helpful error if we failed to find any implementation */
 -      if (grouped_rel->pathlist == NIL)
 -              ereport(ERROR,
 -                              (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 -                               errmsg("could not implement GROUP BY"),
 -                               errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
 -
 -      /*
 -       * If there is an FDW that's responsible for all baserels of the query,
 -       * let it consider adding ForeignPaths.
 -       */
 -      if (grouped_rel->fdwroutine &&
 -              grouped_rel->fdwroutine->GetForeignUpperPaths)
 -              grouped_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_GROUP_AGG,
 -                                                                                                        input_rel, grouped_rel);
 -
 -      /* Let extensions possibly add some more paths */
 -      if (create_upper_paths_hook)
 -              (*create_upper_paths_hook) (root, UPPERREL_GROUP_AGG,
 -                                                                      input_rel, grouped_rel);
 -
 -      /* Now choose the best path(s) */
 -      set_cheapest(grouped_rel);
 -
 -      /*
 -       * We've been using the partial pathlist for the grouped relation to hold
 -       * partially aggregated paths, but that's actually a little bit bogus
 -       * because it's unsafe for later planning stages -- like ordered_rel ---
 -       * to get the idea that they can use these partial paths as if they didn't
 -       * need a FinalizeAggregate step.  Zap the partial pathlist at this stage
 -       * so we don't get confused.
 -       */
 -      grouped_rel->partial_pathlist = NIL;
 -
 -      return grouped_rel;
 -}
 -
 -
 -/*
 - * For a given input path, consider the possible ways of doing grouping sets on
 - * it, by combinations of hashing and sorting.  This can be called multiple
 - * times, so it's important that it not scribble on input.  No result is
 - * returned, but any generated paths are added to grouped_rel.
 - */
 -static void
 -consider_groupingsets_paths(PlannerInfo *root,
 -                                                      RelOptInfo *grouped_rel,
 -                                                      Path *path,
 -                                                      bool is_sorted,
 -                                                      bool can_hash,
 -                                                      PathTarget *target,
 -                                                      grouping_sets_data *gd,
 -                                                      const AggClauseCosts *agg_costs,
 -                                                      double dNumGroups)
 -{
 -      Query      *parse = root->parse;
 +      /* Generate XL aggregate paths, with distributed 2-phase aggregation. */
  
        /*
 -       * If we're not being offered sorted input, then only consider plans that
 -       * can be done entirely by hashing.
 +       * If there were no partial paths, we did not initialize any of the
 +       * partial paths above. If that's the case, initialize here.
         *
 -       * We can hash everything if it looks like it'll fit in work_mem. But if
 -       * the input is actually sorted despite not being advertised as such, we
 -       * prefer to make use of that in order to use less memory.
 +       * XXX The reason why the initialization block at the beginning is not
 +       * simply performed unconditionally is that we may skip it if we've been
 +       * successful in fully pushing down any of the aggregates, and entirely
 +       * skip generating the XL paths.
         *
 -       * If none of the grouping sets are sortable, then ignore the work_mem
 -       * limit and generate a path anyway, since otherwise we'll just fail.
 +       * XXX Can we simply use the same estimates as regular partial aggregates,
 +       * or do we need to invent something else? It might be a better idea to
 +       * use estimates for the whole result here (e.g. total number of groups)
 +       * instead of the partial ones. Underestimates often have more severe
 +       * consequences (e.g. OOM with HashAggregate) than overestimates, so this
 +       * seems like a more defensive approach.
 +       *
 +       * XXX After thinking a bit more about the estimation, it may depend on
 +       * pushdown - if the aggregate is fully pushed down (as above, we can
 +       * probably use dNumGroups/numberOfNodes as a cardinality estimate, as
 +       * we know the per-node groupings won't overlap. But here we need to be
 +       * more careful.
         */
 -      if (!is_sorted)
 +      if (try_distributed_aggregation)
        {
 -              List       *new_rollups = NIL;
 -              RollupData *unhashed_rollup = NULL;
 -              List       *sets_data;
 -              List       *empty_sets_data = NIL;
 -              List       *empty_sets = NIL;
 -              ListCell   *lc;
 -              ListCell   *l_start = list_head(gd->rollups);
 -              AggStrategy strat = AGG_HASHED;
 -              Size            hashsize;
 -              double          exclude_groups = 0.0;
 -
 -              Assert(can_hash);
 -
 -              if (pathkeys_contained_in(root->group_pathkeys, path->pathkeys))
 -              {
 -                      unhashed_rollup = lfirst(l_start);
 -                      exclude_groups = unhashed_rollup->numGroups;
 -                      l_start = lnext(l_start);
 -              }
 +              partial_grouping_target = make_partial_grouping_target(root, target);
  
 -              hashsize = estimate_hashagg_tablesize(path,
 -                                                                                        agg_costs,
 -                                                                                        dNumGroups - exclude_groups);
 +              /* Estimate number of partial groups. */
 +              dNumPartialGroups = get_number_of_groups(root,
 +                                                                                               cheapest_path->rows,
-                                                                                                NIL,
-                                                                                                NIL);
++                                                                                               gd);
  
                /*
 -               * gd->rollups is empty if we have only unsortable columns to work
 -               * with.  Override work_mem in that case; otherwise, we'll rely on the
 -               * sorted-input case to generate usable mixed paths.
 +               * Collect statistics about aggregates for estimating costs of
 +               * performing aggregation in parallel.
                 */
 -              if (hashsize > work_mem * 1024L && gd->rollups)
 -                      return;                         /* nope, won't fit */
 +              MemSet(&agg_partial_costs, 0, sizeof(AggClauseCosts));
 +              MemSet(&agg_final_costs, 0, sizeof(AggClauseCosts));
 +              if (parse->hasAggs)
 +              {
 +                      /* partial phase */
 +                      get_agg_clause_costs(root, (Node *) partial_grouping_target->exprs,
 +                                                               AGGSPLIT_INITIAL_SERIAL,
 +                                                               &agg_partial_costs);
 +
 +                      /* final phase */
 +                      get_agg_clause_costs(root, (Node *) target->exprs,
 +                                                               AGGSPLIT_FINAL_DESERIAL,
 +                                                               &agg_final_costs);
 +                      get_agg_clause_costs(root, parse->havingQual,
 +                                                               AGGSPLIT_FINAL_DESERIAL,
 +                                                               &agg_final_costs);
 +              }
 +      }
  
 +      /* Build final XL grouping paths */
 +      if (can_sort && try_distributed_aggregation)
 +      {
                /*
 -               * We need to burst the existing rollups list into individual grouping
 -               * sets and recompute a groupClause for each set.
 +               * Use any available suitably-sorted path as input, and also consider
 +               * sorting the cheapest-total path.
                 */
 -              sets_data = list_copy(gd->unsortable_sets);
 -
 -              for_each_cell(lc, l_start)
 +              foreach(lc, input_rel->pathlist)
                {
 -                      RollupData *rollup = lfirst(lc);
 +                      Path       *path = (Path *) lfirst(lc);
 +                      bool            is_sorted;
 +
 +                      is_sorted = pathkeys_contained_in(root->group_pathkeys,
-                                                                                         path->pathkeys);
++                                      path->pathkeys);
 +
 +                      /*
 +                       * XL: Can it happen that the cheapest path can't be pushed down,
 +                       * while some other path could be? Perhaps we should move the check
 +                       * if a path can be pushed down up, and add another OR condition
 +                       * to consider all paths that can be pushed down?
 +                       *
 +                       * if (path == cheapest_path || is_sorted || can_push_down)
 +                       */
 +                      if (path == cheapest_path || is_sorted)
 +                      {
 +                              /*
 +                               * We can't really beat paths that we managed to fully push
 +                               * down above, so we can skip them entirely.
 +                               *
 +                               * XXX Not constructing any paths, so we can do this before
 +                               * adding the Sort path.
 +                               */
 +                              if (can_push_down_grouping(root, parse, path))
 +                                      continue;
 +
 +                              /* Sort the cheapest-total path if it isn't already sorted */
 +                              if (!is_sorted)
 +                                      path = (Path *) create_sort_path(root,
 +                                                                                                       grouped_rel,
 +                                                                                                       path,
 +                                                                                                       root->group_pathkeys,
 +                                                                                                       -1.0);
 +
 +                              /* Now decide what to stick atop it */
 +                              if (parse->groupingSets)
 +                              {
 +                                      /*
 +                                       * TODO 2-phase aggregation for grouping sets paths not
 +                                       * supported yet, but this the place where such paths
 +                                       * should be constructed.
 +                                       */
 +                              }
 +                              else if (parse->hasAggs)
 +                              {
 +                                      /*
 +                                       * We have aggregation, possibly with plain GROUP BY. Make
 +                                       * an AggPath.
 +                                       */
 +
 +                                      path = (Path *) create_agg_path(root,
 +                                                                                                      grouped_rel,
 +                                                                                                      path,
 +                                                                                                      partial_grouping_target,
 +                                                                      parse->groupClause ? AGG_SORTED : AGG_PLAIN,
 +                                                                                                      AGGSPLIT_INITIAL_SERIAL,
 +                                                                                                      parse->groupClause,
 +                                                                                                      NIL,
 +                                                                                                      &agg_partial_costs,
 +                                                                                                      dNumPartialGroups);
 +
 +                                      path = create_remotesubplan_path(root, path, NULL);
 +
 +                                      /*
 +                                       * We generate two paths, differing in the second phase
 +                                       * implementation (sort and hash).
 +                                       */
 +
 +                                      add_path(grouped_rel, (Path *)
 +                                                       create_agg_path(root,
 +                                                                                       grouped_rel,
 +                                                                                       path,
 +                                                                                       target,
 +                                                                       parse->groupClause ? AGG_SORTED : AGG_PLAIN,
 +                                                                                       AGGSPLIT_FINAL_DESERIAL,
 +                                                                                       parse->groupClause,
 +                                                                                       (List *) parse->havingQual,
 +                                                                                       &agg_final_costs,
 +                                                                                       dNumGroups));
 +
 +                                      if (can_hash)
 +                                              add_path(grouped_rel, (Path *)
 +                                                               create_agg_path(root,
 +                                                                                               grouped_rel,
 +                                                                                               path,
 +                                                                                               target,
 +                                                                                               AGG_HASHED,
 +                                                                                               AGGSPLIT_FINAL_DESERIAL,
 +                                                                                               parse->groupClause,
 +                                                                                               (List *) parse->havingQual,
 +                                                                                               &agg_final_costs,
 +                                                                                               dNumGroups));
 +                              }
 +                              else if (parse->groupClause)
 +                              {
 +                                      /*
 +                                       * We have GROUP BY without aggregation or grouping sets.
 +                                       * Make a GroupPath.
 +                                       */
 +                                      path = (Path *) create_group_path(root,
 +                                                                                                        grouped_rel,
 +                                                                                                        path,
 +                                                                                                        partial_grouping_target,
 +                                                                                                        parse->groupClause,
 +                                                                                                        NIL,
 +                                                                                                        dNumPartialGroups);
 +
 +                                      path = create_remotesubplan_path(root, path, NULL);
 +
 +                                      add_path(grouped_rel, (Path *)
 +                                                       create_group_path(root,
 +                                                                                         grouped_rel,
 +                                                                                         path,
 +                                                                                         target,
 +                                                                                         parse->groupClause,
 +                                                                                         (List *) parse->havingQual,
 +                                                                                         dNumGroups));
 +
 +                              }
 +                              else
 +                              {
 +                                      /* Other cases should have been handled above */
 +                                      Assert(false);
 +                              }
 +                      }
 +              }
 +
 +              /*
 +               * So far we've only constructed simple paths combining partial and
 +               * distributed aggregate paths, i.e.
 +               *
 +               *     Finalize -> RemoteSubplan -> Gather -> Partial
 +               *
 +               * It may however be more efficient to reduce the amount of data
 +               * transferred over the network by generating paths like this:
 +               *
 +               *     Finalize -> RemoteSubplan -> Combine -> Gather -> Partial
 +               *
 +               * where Combine deserialized the aggstates, combines them and then
 +               * serializes them again. This AggSplit case is not defined yet, but
 +               * should not be hard to add.
 +               *
 +               * We only want to do this for partial paths with RemoteSubplan on
 +               * top of them, i.e. when the whole aggregate was not pushed down.
 +               *
 +               * XXX Gather output is never sorted, so we can only bother with the
 +               * cheapest partial path here (just like above).
 +               *
 +               * XXX This only generates paths with both the combine and finalize
 +               * steps using the same implementation (sort+sort or hash+hash). Maybe
 +               * we should relax that, and allow hash+sort or sort+hash?
 +               *
 +               * XXX grouped_rel->partial_pathlist may be empty here, if the planner
 +               * did not consider parallel paths (try_parallel_aggregation=false).
 +               * But that's OK - we only want to put the combine on top of a Gather,
 +               * so if there's none we're done.
 +               *
 +               * XXX The "combine" paths seem not to be picked up, most likely
 +               * because of bad costing, not reflecting the reduction in number of
 +               * rows transferred over the network.
 +               */
 +              if (grouped_rel->partial_pathlist)
 +              {
 +                      Path       *path = (Path *) linitial(grouped_rel->partial_pathlist);
 +                      double          total_groups = path->rows * path->parallel_workers;
 +
 +                      /* We don't care about paths that were fully pushed down. */
 +                      if (! can_push_down_grouping(root, parse, path))
 +                      {
 +                              path = (Path *) create_gather_path(root,
 +                                                                                                 grouped_rel,
 +                                                                                                 path,
 +                                                                                                 partial_grouping_target,
 +                                                                                                 NULL,
 +                                                                                                 &total_groups);
 +
 +                              /*
 +                               * Gather is always unsorted, so we'll need to sort, unless
 +                               * there's no GROUP BY clause, in which case there will only be a
 +                               * single group.
 +                               */
 +                              if (parse->groupClause)
 +                                      path = (Path *) create_sort_path(root,
 +                                                                                                       grouped_rel,
 +                                                                                                       path,
 +                                                                                                       root->group_pathkeys,
 +                                                                                                       -1.0);
 +
 +                              /* Intermediate combine phase. */
 +                              if (parse->hasAggs)
 +                              {
 +                                      path = (Path *) create_agg_path(root,
 +                                                                                                      grouped_rel,
 +                                                                                                      path,
 +                                                                                                      target,
 +                                                                      parse->groupClause ? AGG_SORTED : AGG_PLAIN,
 +                                                                                                      AGGSPLIT_COMBINE,
 +                                                                                                      parse->groupClause,
 +                                                                                                      (List *) parse->havingQual,
 +                                                                                                      &agg_final_costs,
 +                                                                                                      dNumGroups);
 +
 +                                      path = create_remotesubplan_path(root, path, NULL);
 +
 +                                      add_path(grouped_rel, (Path *)
 +                                                       create_agg_path(root,
 +                                                                                       grouped_rel,
 +                                                                                       path,
 +                                                                                       target,
 +                                                                       parse->groupClause ? AGG_SORTED : AGG_PLAIN,
 +                                                                                       AGGSPLIT_FINAL_DESERIAL,
 +                                                                                       parse->groupClause,
 +                                                                                       (List *) parse->havingQual,
 +                                                                                       &agg_final_costs,
 +                                                                                       dNumGroups));
 +                              }
 +                              else
 +                              {
 +                                      path = (Path *) create_group_path(root,
 +                                                                                                        grouped_rel,
 +                                                                                                        path,
 +                                                                                                        target,
 +                                                                                                        parse->groupClause,
 +                                                                                                        (List *) parse->havingQual,
 +                                                                                                        dNumGroups);
 +
 +                                      path = create_remotesubplan_path(root, path, NULL);
 +
 +                                      add_path(grouped_rel, (Path *)
 +                                                       create_group_path(root,
 +                                                                                         grouped_rel,
 +                                                                                         path,
 +                                                                                         target,
 +                                                                                         parse->groupClause,
 +                                                                                         (List *) parse->havingQual,
 +                                                                                         dNumGroups));
 +                              }
 +                      }
 +              }
 +      }
 +
 +      if (can_hash && try_distributed_aggregation)
 +      {
 +              hashaggtablesize = estimate_hashagg_tablesize(cheapest_path,
 +                                                                                                        agg_costs,
 +                                                                                                        dNumGroups);
 +
 +              /*
 +               * Provided that the estimated size of the hashtable does not exceed
 +               * work_mem, we'll generate a HashAgg Path, although if we were unable
 +               * to sort above, then we'd better generate a Path, so that we at
 +               * least have one.
 +               */
 +              if (hashaggtablesize < work_mem * 1024L ||
 +                      grouped_rel->pathlist == NIL)
 +              {
 +                      /* If the whole aggregate was pushed down, we're done. */
 +                      if (! can_push_down_grouping(root, parse, cheapest_path))
 +                      {
 +                              Path *path, *agg_path;
 +
 +                              path = (Path *) create_agg_path(root,
 +                                                                         grouped_rel,
 +                                                                         cheapest_path,
 +                                                                         partial_grouping_target,
 +                                                                         AGG_HASHED,
 +                                                                         AGGSPLIT_INITIAL_SERIAL,
 +                                                                         parse->groupClause,
 +                                                                         NIL,
 +                                                                         &agg_partial_costs,
 +                                                                         dNumPartialGroups);
 +
 +                              /* keep partially aggregated path for the can_sort branch */
 +                              agg_path = path;
 +
 +                              path = create_remotesubplan_path(root, path, NULL);
 +
 +                              /* Generate paths with both hash and sort second phase. */
 +
 +                              add_path(grouped_rel, (Path *)
 +                                               create_agg_path(root,
 +                                                                               grouped_rel,
 +                                                                               path,
 +                                                                               target,
 +                                                                               AGG_HASHED,
 +                                                                               AGGSPLIT_FINAL_DESERIAL,
 +                                                                               parse->groupClause,
 +                                                                               (List *) parse->havingQual,
 +                                                                               &agg_final_costs,
 +                                                                               dNumGroups));
 +
 +                              if (can_sort)
 +                              {
 +                                      /*
 +                                       * AGG_HASHED aggregate paths are always unsorted, so add
 +                                       * a Sort node for the final AGG_SORTED step.
 +                                       */
 +                                      path = (Path *) create_sort_path(root,
 +                                                                                                       grouped_rel,
 +                                                                                                       agg_path,
 +                                                                                                       root->group_pathkeys,
 +                                                                                                       -1.0);
 +
 +                                      path = create_remotesubplan_path(root, path, NULL);
 +
 +                                      add_path(grouped_rel, (Path *)
 +                                                       create_agg_path(root,
 +                                                                                       grouped_rel,
 +                                                                                       path,
 +                                                                                       target,
 +                                                                       parse->groupClause ? AGG_SORTED : AGG_PLAIN,
 +                                                                                       AGGSPLIT_FINAL_DESERIAL,
 +                                                                                       parse->groupClause,
 +                                                                                       (List *) parse->havingQual,
 +                                                                                       &agg_final_costs,
 +                                                                                       dNumGroups));
 +                              }
 +                      }
 +              }
 +
 +              /*
 +               * Generate a path with the extra combine phase.
 +               *
 +               * XXX See the comments in the block generating combine paths for
 +               * the sorted case.
 +               */
 +              if (grouped_rel->partial_pathlist)
 +              {
 +                      Path       *path = (Path *) linitial(grouped_rel->partial_pathlist);
 +
 +                      hashaggtablesize = estimate_hashagg_tablesize(path,
 +                                                                                                                &agg_final_costs,
 +                                                                                                                dNumGroups);
 +
 +                      /*
 +                       * Ignore the path if the hash table won't fit into memory, or
 +                       * if we managed to push down the whole aggregation.
 +                       */
 +                      if ((hashaggtablesize < work_mem * 1024L) &&
 +                              (! can_push_down_grouping(root, parse, path)))
 +                      {
 +                              double          total_groups = path->rows * path->parallel_workers;
 +
 +                              path = (Path *) create_gather_path(root,
 +                                                                                                 grouped_rel,
 +                                                                                                 path,
 +                                                                                                 partial_grouping_target,
 +                                                                                                 NULL,
 +                                                                                                 &total_groups);
 +
 +                              path = (Path *) create_agg_path(root,
 +                                                                                              grouped_rel,
 +                                                                                              path,
 +                                                                                              target,
 +                                                                                              AGG_HASHED,
 +                                                                                              AGGSPLIT_COMBINE,
 +                                                                                              parse->groupClause,
 +                                                                                              (List *) parse->havingQual,
 +                                                                                              &agg_final_costs,
 +                                                                                              dNumGroups);
 +
 +                              /* We know the full push down can't happen, so redistribute. */
 +                              path = create_remotesubplan_path(root, path, NULL);
 +
 +                              add_path(grouped_rel, (Path *)
 +                                               create_agg_path(root,
 +                                                                               grouped_rel,
 +                                                                               path,
 +                                                                               target,
 +                                                                               AGG_HASHED,
 +                                                                               AGGSPLIT_FINAL_DESERIAL,
 +                                                                               parse->groupClause,
 +                                                                               (List *) parse->havingQual,
 +                                                                               &agg_final_costs,
 +                                                                               dNumGroups));
 +                      }
 +              }
 +      }
 +
 +      /* Give a helpful error if we failed to find any implementation */
 +      if (grouped_rel->pathlist == NIL)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 +                               errmsg("could not implement GROUP BY"),
 +                               errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
 +
 +      /*
 +       * If there is an FDW that's responsible for all baserels of the query,
 +       * let it consider adding ForeignPaths.
 +       */
 +      if (grouped_rel->fdwroutine &&
 +              grouped_rel->fdwroutine->GetForeignUpperPaths)
 +              grouped_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_GROUP_AGG,
 +                                                                                                        input_rel, grouped_rel);
 +
 +      /* Let extensions possibly add some more paths */
 +      if (create_upper_paths_hook)
 +              (*create_upper_paths_hook) (root, UPPERREL_GROUP_AGG,
 +                                                                      input_rel, grouped_rel);
 +
 +      /* Now choose the best path(s) */
 +      set_cheapest(grouped_rel);
++      /*
++       * We've been using the partial pathlist for the grouped relation to hold
++       * partially aggregated paths, but that's actually a little bit bogus
++       * because it's unsafe for later planning stages -- like ordered_rel ---
++       * to get the idea that they can use these partial paths as if they didn't
++       * need a FinalizeAggregate step.  Zap the partial pathlist at this stage
++       * so we don't get confused.
++       */
++      grouped_rel->partial_pathlist = NIL;
 +
 +      return grouped_rel;
 +}
 +
++
++/*
++ * For a given input path, consider the possible ways of doing grouping sets on
++ * it, by combinations of hashing and sorting.  This can be called multiple
++ * times, so it's important that it not scribble on input.  No result is
++ * returned, but any generated paths are added to grouped_rel.
++ */
++static void
++consider_groupingsets_paths(PlannerInfo *root,
++                                                      RelOptInfo *grouped_rel,
++                                                      Path *path,
++                                                      bool is_sorted,
++                                                      bool can_hash,
++                                                      PathTarget *target,
++                                                      grouping_sets_data *gd,
++                                                      const AggClauseCosts *agg_costs,
++                                                      double dNumGroups)
++{
++      Query      *parse = root->parse;
++
++      /*
++       * If we're not being offered sorted input, then only consider plans that
++       * can be done entirely by hashing.
++       *
++       * We can hash everything if it looks like it'll fit in work_mem. But if
++       * the input is actually sorted despite not being advertised as such, we
++       * prefer to make use of that in order to use less memory.
++       *
++       * If none of the grouping sets are sortable, then ignore the work_mem
++       * limit and generate a path anyway, since otherwise we'll just fail.
++       */
++      if (!is_sorted)
++      {
++              List       *new_rollups = NIL;
++              RollupData *unhashed_rollup = NULL;
++              List       *sets_data;
++              List       *empty_sets_data = NIL;
++              List       *empty_sets = NIL;
++              ListCell   *lc;
++              ListCell   *l_start = list_head(gd->rollups);
++              AggStrategy strat = AGG_HASHED;
++              Size            hashsize;
++              double          exclude_groups = 0.0;
++
++              Assert(can_hash);
++
++              if (pathkeys_contained_in(root->group_pathkeys, path->pathkeys))
++              {
++                      unhashed_rollup = lfirst(l_start);
++                      exclude_groups = unhashed_rollup->numGroups;
++                      l_start = lnext(l_start);
++              }
++
++              hashsize = estimate_hashagg_tablesize(path,
++                                                                                        agg_costs,
++                                                                                        dNumGroups - exclude_groups);
++
++              /*
++               * gd->rollups is empty if we have only unsortable columns to work
++               * with.  Override work_mem in that case; otherwise, we'll rely on the
++               * sorted-input case to generate usable mixed paths.
++               */
++              if (hashsize > work_mem * 1024L && gd->rollups)
++                      return;                         /* nope, won't fit */
++
++              /*
++               * We need to burst the existing rollups list into individual grouping
++               * sets and recompute a groupClause for each set.
++               */
++              sets_data = list_copy(gd->unsortable_sets);
++
++              for_each_cell(lc, l_start)
++              {
++                      RollupData *rollup = lfirst(lc);
+                       /*
+                        * If we find an unhashable rollup that's not been skipped by the
+                        * "actually sorted" check above, we can't cope; we'd need sorted
+                        * input (with a different sort order) but we can't get that here.
+                        * So bail out; we'll get a valid path from the is_sorted case
+                        * instead.
+                        *
+                        * The mere presence of empty grouping sets doesn't make a rollup
+                        * unhashable (see preprocess_grouping_sets), we handle those
+                        * specially below.
+                        */
+                       if (!rollup->hashable)
+                               return;
+                       else
+                               sets_data = list_concat(sets_data, list_copy(rollup->gsets_data));
+               }
+               foreach(lc, sets_data)
+               {
+                       GroupingSetData *gs = lfirst(lc);
+                       List       *gset = gs->set;
+                       RollupData *rollup;
+                       if (gset == NIL)
+                       {
+                               /* Empty grouping sets can't be hashed. */
+                               empty_sets_data = lappend(empty_sets_data, gs);
+                               empty_sets = lappend(empty_sets, NIL);
+                       }
+                       else
+                       {
+                               rollup = makeNode(RollupData);
+                               rollup->groupClause = preprocess_groupclause(root, gset);
+                               rollup->gsets_data = list_make1(gs);
+                               rollup->gsets = remap_to_groupclause_idx(rollup->groupClause,
+                                                                                                                rollup->gsets_data,
+                                                                                                  gd->tleref_to_colnum_map);
+                               rollup->numGroups = gs->numGroups;
+                               rollup->hashable = true;
+                               rollup->is_hashed = true;
+                               new_rollups = lappend(new_rollups, rollup);
+                       }
+               }
+               /*
+                * If we didn't find anything nonempty to hash, then bail.  We'll
+                * generate a path from the is_sorted case.
+                */
+               if (new_rollups == NIL)
+                       return;
+               /*
+                * If there were empty grouping sets they should have been in the
+                * first rollup.
+                */
+               Assert(!unhashed_rollup || !empty_sets);
+               if (unhashed_rollup)
+               {
+                       new_rollups = lappend(new_rollups, unhashed_rollup);
+                       strat = AGG_MIXED;
+               }
+               else if (empty_sets)
+               {
+                       RollupData *rollup = makeNode(RollupData);
+                       rollup->groupClause = NIL;
+                       rollup->gsets_data = empty_sets_data;
+                       rollup->gsets = empty_sets;
+                       rollup->numGroups = list_length(empty_sets);
+                       rollup->hashable = false;
+                       rollup->is_hashed = false;
+                       new_rollups = lappend(new_rollups, rollup);
+                       strat = AGG_MIXED;
+               }
+               add_path(grouped_rel, (Path *)
+                                create_groupingsets_path(root,
+                                                                                 grouped_rel,
+                                                                                 path,
+                                                                                 target,
+                                                                                 (List *) parse->havingQual,
+                                                                                 strat,
+                                                                                 new_rollups,
+                                                                                 agg_costs,
+                                                                                 dNumGroups));
+               return;
+       }
+       /*
+        * If we have sorted input but nothing we can do with it, bail.
+        */
+       if (list_length(gd->rollups) == 0)
+               return;
+       /*
+        * Given sorted input, we try and make two paths: one sorted and one mixed
+        * sort/hash. (We need to try both because hashagg might be disabled, or
+        * some columns might not be sortable.)
+        *
+        * can_hash is passed in as false if some obstacle elsewhere (such as
+        * ordered aggs) means that we shouldn't consider hashing at all.
+        */
+       if (can_hash && gd->any_hashable)
+       {
+               List       *rollups = NIL;
+               List       *hash_sets = list_copy(gd->unsortable_sets);
+               double          availspace = (work_mem * 1024.0);
+               ListCell   *lc;
+               /*
+                * Account first for space needed for groups we can't sort at all.
+                */
+               availspace -= (double) estimate_hashagg_tablesize(path,
+                                                                                                                 agg_costs,
+                                                                                                                 gd->dNumHashGroups);
+               if (availspace > 0 && list_length(gd->rollups) > 1)
+               {
+                       double          scale;
+                       int                     num_rollups = list_length(gd->rollups);
+                       int                     k_capacity;
+                       int                *k_weights = palloc(num_rollups * sizeof(int));
+                       Bitmapset  *hash_items = NULL;
+                       int                     i;
+                       /*
+                        * We treat this as a knapsack problem: the knapsack capacity
+                        * represents work_mem, the item weights are the estimated memory
+                        * usage of the hashtables needed to implement a single rollup,
+                        * and we really ought to use the cost saving as the item value;
+                        * however, currently the costs assigned to sort nodes don't
+                        * reflect the comparison costs well, and so we treat all items as
+                        * of equal value (each rollup we hash instead saves us one sort).
+                        *
+                        * To use the discrete knapsack, we need to scale the values to a
+                        * reasonably small bounded range.  We choose to allow a 5% error
+                        * margin; we have no more than 4096 rollups in the worst possible
+                        * case, which with a 5% error margin will require a bit over 42MB
+                        * of workspace. (Anyone wanting to plan queries that complex had
+                        * better have the memory for it.  In more reasonable cases, with
+                        * no more than a couple of dozen rollups, the memory usage will
+                        * be negligible.)
+                        *
+                        * k_capacity is naturally bounded, but we clamp the values for
+                        * scale and weight (below) to avoid overflows or underflows (or
+                        * uselessly trying to use a scale factor less than 1 byte).
+                        */
+                       scale = Max(availspace / (20.0 * num_rollups), 1.0);
+                       k_capacity = (int) floor(availspace / scale);
+                       /*
+                        * We leave the first rollup out of consideration since it's the
+                        * one that matches the input sort order.  We assign indexes "i"
+                        * to only those entries considered for hashing; the second loop,
+                        * below, must use the same condition.
+                        */
+                       i = 0;
+                       for_each_cell(lc, lnext(list_head(gd->rollups)))
+                       {
+                               RollupData *rollup = lfirst(lc);
+                               if (rollup->hashable)
+                               {
+                                       double          sz = estimate_hashagg_tablesize(path,
+                                                                                                                               agg_costs,
+                                                                                                                 rollup->numGroups);
+                                       /*
+                                        * If sz is enormous, but work_mem (and hence scale) is
+                                        * small, avoid integer overflow here.
+                                        */
+                                       k_weights[i] = (int) Min(floor(sz / scale),
+                                                                                        k_capacity + 1.0);
+                                       ++i;
+                               }
+                       }
+                       /*
+                        * Apply knapsack algorithm; compute the set of items which
+                        * maximizes the value stored (in this case the number of sorts
+                        * saved) while keeping the total size (approximately) within
+                        * capacity.
+                        */
+                       if (i > 0)
+                               hash_items = DiscreteKnapsack(k_capacity, i, k_weights, NULL);
+                       if (!bms_is_empty(hash_items))
+                       {
+                               rollups = list_make1(linitial(gd->rollups));
+                               i = 0;
+                               for_each_cell(lc, lnext(list_head(gd->rollups)))
+                               {
+                                       RollupData *rollup = lfirst(lc);
+                                       if (rollup->hashable)
+                                       {
+                                               if (bms_is_member(i, hash_items))
+                                                       hash_sets = list_concat(hash_sets,
+                                                                                         list_copy(rollup->gsets_data));
+                                               else
+                                                       rollups = lappend(rollups, rollup);
+                                               ++i;
+                                       }
+                                       else
+                                               rollups = lappend(rollups, rollup);
+                               }
+                       }
+               }
+               if (!rollups && hash_sets)
+                       rollups = list_copy(gd->rollups);
+               foreach(lc, hash_sets)
+               {
+                       GroupingSetData *gs = lfirst(lc);
+                       RollupData *rollup = makeNode(RollupData);
+                       Assert(gs->set != NIL);
+                       rollup->groupClause = preprocess_groupclause(root, gs->set);
+                       rollup->gsets_data = list_make1(gs);
+                       rollup->gsets = remap_to_groupclause_idx(rollup->groupClause,
+                                                                                                        rollup->gsets_data,
+                                                                                                  gd->tleref_to_colnum_map);
+                       rollup->numGroups = gs->numGroups;
+                       rollup->hashable = true;
+                       rollup->is_hashed = true;
+                       rollups = lcons(rollup, rollups);
+               }
+               if (rollups)
+               {
+                       add_path(grouped_rel, (Path *)
+                                        create_groupingsets_path(root,
+                                                                                         grouped_rel,
+                                                                                         path,
+                                                                                         target,
+                                                                                         (List *) parse->havingQual,
+                                                                                         AGG_MIXED,
+                                                                                         rollups,
+                                                                                         agg_costs,
+                                                                                         dNumGroups));
+               }
+       }
+       /*
+        * Now try the simple sorted case.
+        */
+       if (!gd->unsortable_sets)
+               add_path(grouped_rel, (Path *)
+                                create_groupingsets_path(root,
+                                                                                 grouped_rel,
+                                                                                 path,
+                                                                                 target,
+                                                                                 (List *) parse->havingQual,
+                                                                                 AGG_SORTED,
+                                                                                 gd->rollups,
+                                                                                 agg_costs,
+                                                                                 dNumGroups));
+ }
  /*
   * create_window_paths
   *
@@@ -6104,255 -6057,32 +6881,286 @@@ plan_cluster_use_sort(Oid tableOid, Oi
        return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost);
  }
  
 +
 +/*
 + * grouping_distribution_match
 + *    Check if the path distribution matches grouping distribution.
 + *
 + * Grouping preserves distribution if the distribution key is on of the
 + * grouping keys (arbitrary one). In that case it's guaranteed that groups
 + * on different nodes do not overlap, and we can push the aggregation to
 + * remote nodes as a whole.
 + *
 + * Otherwise we need to either fetch all the data to the coordinator and
 + * perform the aggregation there, or use two-phase aggregation, with the
 + * first phase (partial aggregation) pushed down, and the second phase
 + * (combining and finalizing the results) executed on the coordinator.
 + *
 + * XXX This is used not only for plain aggregation, but also for various
 + * other paths, relying on grouping infrastructure (DISTINCT ON, UNIQUE).
 + */
 +static bool
 +grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path,
 +                                                      List *clauses)
 +{
 +      int             i;
 +      bool    matches_key = false;
 +      Distribution *distribution = path->distribution;
 +
 +      int numGroupCols = list_length(clauses);
 +      AttrNumber *groupColIdx = extract_grouping_cols(clauses,
 +                                                                                                      parse->targetList);
 +
 +      /*
 +       * With no explicit data distribution or replicated tables, we can simply
 +       * push down the whole aggregation to the remote node, without any sort
 +       * of redistribution. So consider this to be a match.
 +       */
 +      if ((distribution == NULL) ||
 +              IsLocatorReplicated(distribution->distributionType))
 +              return true;
 +
 +      /* But no distribution expression means 'no match'. */
 +      if (distribution->distributionExpr == NULL)
 +              return false;
 +
 +      /*
 +       * With distributed data and table distributed using an expression, we
 +       * need to check if the distribution expression matches one of the
 +       * grouping keys (arbitrary one).
 +       */
 +      for (i = 0; i < numGroupCols; i++)
 +      {
 +              TargetEntry *te = (TargetEntry *)list_nth(parse->targetList,
 +                                                                                                groupColIdx[i]-1);
 +
 +              if (equal(te->expr, distribution->distributionExpr))
 +              {
 +                      matches_key = true;
 +                      break;
 +              }
 +      }
 +
 +      return matches_key;
 +}
 +
+ /*
+  * get_partitioned_child_rels
+  *            Returns a list of the RT indexes of the partitioned child relations
+  *            with rti as the root parent RT index.
+  *
+  * Note: Only call this function on RTEs known to be partitioned tables.
+  */
+ List *
+ get_partitioned_child_rels(PlannerInfo *root, Index rti)
+ {
+       List       *result = NIL;
+       ListCell   *l;
+       foreach(l, root->pcinfo_list)
+       {
+               PartitionedChildRelInfo *pc = lfirst(l);
+               if (pc->parent_relid == rti)
+               {
+                       result = pc->child_rels;
+                       break;
+               }
+       }
+       /* The root partitioned table is included as a child rel */
+       Assert(list_length(result) >= 1);
+       return result;
+ }
++
++
 +static bool
 +groupingsets_distribution_match(PlannerInfo *root, Query *parse, Path *path)
 +{
 +      Distribution *distribution = path->distribution;
 +
 +      /*
 +       * With no explicit data distribution or replicated tables, we can simply
 +       * push down the whole grouping sets to the remote node, without any sort
 +       * of redistribution. So consider this to be a match.
 +       */
 +      if ((distribution == NULL) ||
 +              IsLocatorReplicated(distribution->distributionType))
 +              return true;
 +
 +      return false;
 +}
 +
 +/*
 + * equal_distributions
 + *    Check that two distributions are equal.
 + *
 + * Distributions are considered equal if they are of the same type, on the
 + * same set of nodes, and if the distribution expressions are known to be equal
 + * (either the same expressions or members of the same equivalence class).
 + */
 +static bool
 +equal_distributions(PlannerInfo *root, Distribution *dst1,
 +                                      Distribution *dst2)
 +{
 +      /* fast path */
 +      if (dst1 == dst2)
 +              return true;
 +
 +      if (dst1 == NULL || dst2 == NULL)
 +              return false;
 +
 +      /* conditions easier to check go first */
 +      if (dst1->distributionType != dst2->distributionType)
 +              return false;
 +
 +      if (!bms_equal(dst1->nodes, dst2->nodes))
 +              return false;
 +
 +      if (equal(dst1->distributionExpr, dst2->distributionExpr))
 +              return true;
 +
 +      /*
 +       * For more thorough expression check we need to ensure they both are
 +       * defined
 +       */
 +      if (dst1->distributionExpr == NULL || dst2->distributionExpr == NULL)
 +              return false;
 +
 +      /*
 +       * More thorough check, but allows some important cases, like if
 +       * distribution column is not updated (implicit set distcol=distcol) or
 +       * set distcol = CONST, ... WHERE distcol = CONST - pattern used by many
 +       * applications.
 +       */
 +      if (exprs_known_equal(root, dst1->distributionExpr, dst2->distributionExpr))
 +              return true;
 +
 +      /* The restrictNodes field does not matter for distribution equality */
 +      return false;
 +}
 +
 +static Path *
 +adjust_path_distribution(PlannerInfo *root, Query *parse, Path *path)
 +{
 +      /* if the root distribution is NULL, set it to path distribution */
 +      if (!root->distribution)
 +      {
 +              root->distribution = path->distribution;
 +              return path;
 +      }
 +
 +      /* don't touch paths without distribution attached (catalogs etc.) */
 +      if ((path->distribution == NULL) && (root->distribution == NULL))
 +              return path;
 +
 +      if (equal_distributions(root, root->distribution, path->distribution))
 +      {
 +              if (IsLocatorReplicated(path->distribution->distributionType) &&
 +                      contain_volatile_functions((Node *) parse->targetList))
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
 +                                      errmsg("can not update replicated table with result of volatile function")));
 +
 +              /*
 +               * Source tuple will be consumed on the same node where it is
 +               * produced, so if it is known that some node does not yield tuples
 +               * we do not want to send subquery for execution on these nodes
 +               * at all. So copy the restriction to the external distribution.
 +               *
 +               * XXX Is that ever possible if external restriction is already
 +               * defined? If yes we probably should use intersection of the sets,
 +               * and if resulting set is empty create dummy plan and set it as
 +               * the result_plan. Need to think this over
 +               */
 +              root->distribution->restrictNodes =
 +                              bms_copy(path->distribution->restrictNodes);
 +      }
 +      else
 +      {
 +              /*
 +               * If the planned statement is either UPDATE or DELETE, different
 +               * distributions here mean the ModifyTable node will be placed on
 +               * top of RemoteSubquery.
 +               *
 +               * UPDATE and DELETE versions of ModifyTable use TID of incoming
 +               * tuple to apply the changes, but the RemoteSubquery plan supplies
 +               * RemoteTuples, without such field. Therefore we can't execute
 +               * such plan and error-out.
 +               *
 +               * Most common example is when the UPDATE statement modifies the
 +               * distribution column, or when a complex UPDATE or DELETE statement
 +               * involves a join. It's difficult to determine the exact reason,
 +               * but we assume the first one (correlated UPDATE) is more likely.
 +               *
 +               * There are two ways of fixing the UPDATE ambiguity:
 +               *
 +               * 1. Modify the planner to never consider redistribution of the
 +               * target table. In this case the planner would find there's no way
 +               * to plan the query, and it would throw error somewhere else, and
 +               * we'd only be dealing with updates of distribution columns.
 +               *
 +               * 2. Modify executor to allow distribution column updates. However
 +               * there are a lot of issues behind the scene when implementing that
 +               * approach, and so it's unlikely to happen soon.
 +               *
 +               * DELETE statements may only fail because of complex joins.
 +               */
 +
 +              if (parse->commandType == CMD_UPDATE)
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
 +                                       errmsg("could not plan this distributed update"),
 +                                       errdetail("correlated UPDATE or updating distribution column currently not supported in Postgres-XL.")));
 +
 +              if (parse->commandType == CMD_DELETE)
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
 +                                       errmsg("could not plan this distributed delete"),
 +                                       errdetail("correlated or complex DELETE is currently not supported in Postgres-XL.")));
 +
 +              /*
 +               * We already know the distributions are not equal, but let's see if
 +               * the redistribution is actually necessary. We can skip it if we
 +               * already have Result path, and if the distribution is one of
 +               *
 +               * a) 'hash' restricted to a single node
 +               * b) 'replicate' without volatile functions in the target list
 +               *
 +               * In those cases we don't need the RemoteSubplan.
 +               *
 +               * XXX Not sure what the (result_plan->lefttree == NULL) does.
 +               * See planner.c:2730 in 9.5.
 +               */
 +              if (!(IsA(path, ResultPath) && /* FIXME missing (result_plan->lefttree == NULL) condition */
 +                      ((root->distribution->distributionType == 'H' && bms_num_members(root->distribution->restrictNodes) == 1) ||
 +                       (root->distribution->distributionType == 'R' && !contain_mutable_functions((Node *)parse->targetList)))))
 +
 +                      path = create_remotesubplan_path(root, path, root->distribution);
 +      }
 +
 +      return path;
 +}
 +
 +static bool
 +can_push_down_grouping(PlannerInfo *root, Query *parse, Path *path)
 +{
 +      /* only called when constructing grouping paths */
 +      Assert(parse->hasAggs || parse->groupClause);
 +
 +      if (parse->groupingSets)
 +              return groupingsets_distribution_match(root, parse, path);
 +
 +      return grouping_distribution_match(root, parse, path, parse->groupClause);
 +}
 +
 +static bool
 +can_push_down_window(PlannerInfo *root, Path *path)
 +{
 +      /*  */
 +      if (! path->distribution)
 +              return true;
 +
 +      return false;
 +}
index d5bc9e07602bb99de6fdf64ef57bf496be86c5e0,5cac171cb6e411cb5ab4deac14bcfefcb4ad29f7..398586e98acdca850ec390ec7e8b80a06734acd1
@@@ -4,8 -4,7 +4,8 @@@
   *      Post-processing of a completed plan tree: fix references to subplan
   *      vars, compute regproc values for operators, etc
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
index bc2cbcee6b335b5f9c5b7a86e6f82348de043443,c1be34dd12c8cdcbbe96b910c7c652885d936dc2..d8545f2bdd6e51a4f759ee0ca744f5a9bfdf7553
@@@ -3,8 -3,7 +3,8 @@@
   * subselect.c
   *      Planning routines for subselects and parameters.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
index 5fa672d02cc9388d3fb1b48fb0a301e499494571,749ea805f824a717943267cb58b2ce97852af682..41a930428faeb336fe4ff37d2f5b2accd7218e22
@@@ -1125,9 -1123,7 +1127,10 @@@ pull_up_simple_subquery(PlannerInfo *ro
                                        break;
                                case RTE_JOIN:
                                case RTE_CTE:
 +#ifdef XCP    
 +                              case RTE_REMOTE_DUMMY:
 +#endif
+                               case RTE_NAMEDTUPLESTORE:
                                        /* these can't contain any lateral references */
                                        break;
                        }
@@@ -1985,9 -1978,7 +1985,10 @@@ replace_vars_in_jointree(Node *jtnode
                                                break;
                                        case RTE_JOIN:
                                        case RTE_CTE:
 +#ifdef XCP
 +                                      case RTE_REMOTE_DUMMY:
 +#endif                                        
+                                       case RTE_NAMEDTUPLESTORE:
                                                /* these shouldn't be marked LATERAL */
                                                Assert(false);
                                                break;
index 64cd7262d07a2ef8ca9e12cd438e5e78a8f459c5,de47153bacef43eec0736043e6beb6fa6159823c..4d4727278147cc5e06cfd057c647eb1313eb9d40
@@@ -26,8 -26,8 +26,8 @@@
   * the tlists for child tables to keep expand_targetlist happy.  We do it like
   * that because it's faster in typical non-inherited cases.
   *
 - *
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
index 25226363920ed9772175534d69c4336f38ec321b,8b44fb96b08c6ca0c4d6ffaf002b7af6e8544465..66c684c065bda61fbe24a958261672d6b9ddc59c
@@@ -17,8 -17,7 +17,8 @@@
   * append relations, and thenceforth share code with the UNION ALL case.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
index 971ffa882268ef1899c5a899336c10329abb1b39,ec4a093d9fb375c5ff6d5b944b5e4f783d851dfd..0ccf4bd47da594e205f4233c5bf88d5a052ed35e
@@@ -3,8 -3,7 +3,8 @@@
   * pathnode.c
   *      Routines to manipulate pathlists and create path nodes
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@@ -2220,19 -1046,7 +2217,19 @@@ create_index_path(PlannerInfo *root
        pathnode->indexorderbycols = indexorderbycols;
        pathnode->indexscandir = indexscandir;
  
-       cost_index(pathnode, root, loop_count);
 +#ifdef XCP
 +      set_scanpath_distribution(root, rel, (Path *) pathnode);
 +      if (indexclauses)
 +      {
 +              ListCell *lc;
 +              foreach (lc, indexclauses)
 +              {
 +                      RestrictInfo *ri = (RestrictInfo *) lfirst(lc);
 +                      restrict_distribution(root, ri, (Path *) pathnode);
 +              }
 +      }
 +#endif
+       cost_index(pathnode, root, loop_count, partial_path);
  
        return pathnode;
  }
@@@ -2433,75 -1216,7 +2431,77 @@@ create_append_path(RelOptInfo *rel, Lis
        pathnode->path.parallel_workers = parallel_workers;
        pathnode->path.pathkeys = NIL;          /* result is always considered
                                                                                 * unsorted */
 +#ifdef XCP
 +      /*
 +       * Append path is used to implement scans of inherited tables and some
 +       * "set" operations, like UNION ALL. While all inherited tables should
 +       * have the same distribution, UNION'ed queries may have different.
 +       * When paths being appended have the same distribution it is OK to push
 +       * Append down to the data nodes. If not, perform "coordinator" Append.
 +       */
 +
 +      /* Special case of the dummy relation, if the subpaths list is empty */
 +      if (subpaths)
 +      {
 +              /* Take distribution of the first node */
 +              l = list_head(subpaths);
 +              subpath = (Path *) lfirst(l);
 +              distribution = copyObject(subpath->distribution);
 +              /*
 +               * Check remaining subpaths, if all distributions equal to the first set
 +               * it as a distribution of the Append path; otherwise make up coordinator
 +               * Append
 +               */
 +              while ((l = lnext(l)))
 +              {
 +                      subpath = (Path *) lfirst(l);
 +
 +                      /*
 +                       * For Append and MergeAppend paths, we are most often dealing with
 +                       * different relations, appended together. So its very likely that
 +                       * the distribution for each relation will have a different varno.
 +                       * But we should be able to push down Append and MergeAppend as
 +                       * long as rest of the distribution information matches.
 +                       *
 +                       * equalDistribution() compares everything except the varnos
 +                       */
 +                      if (equalDistribution(distribution, subpath->distribution))
 +                      {
 +                              /*
 +                               * Both distribution and subpath->distribution may be NULL at
 +                               * this point, or they both are not null.
 +                               */
 +                              if (distribution && subpath->distribution->restrictNodes)
 +                                      distribution->restrictNodes = bms_union(
 +                                                      distribution->restrictNodes,
 +                                                      subpath->distribution->restrictNodes);
 +                      }
 +                      else
 +                      {
 +                              break;
 +                      }
 +              }
 +              if (l)
 +              {
 +                      List *newsubpaths = NIL;
 +                      foreach(l, subpaths)
 +                      {
 +                              subpath = (Path *) lfirst(l);
 +                              if (subpath->distribution)
 +                                      subpath = redistribute_path(NULL, subpath, NIL,
 +                                                                                              LOCATOR_TYPE_NONE, NULL,
 +                                                                                              NULL, NULL);
 +                              newsubpaths = lappend(newsubpaths, subpath);
 +                      }
 +                      subpaths = newsubpaths;
 +                      pathnode->path.distribution = NULL;
 +              }
 +              else
 +                      pathnode->path.distribution = distribution;
 +      }
 +#endif
++
+       pathnode->partitioned_rels = list_copy(partitioned_rels);
        pathnode->subpaths = subpaths;
  
        /*
@@@ -3046,19 -1742,17 +3108,20 @@@ create_gather_path(PlannerInfo *root, R
                                                                                                                  required_outer);
        pathnode->path.parallel_aware = false;
        pathnode->path.parallel_safe = false;
-       pathnode->path.parallel_workers = subpath->parallel_workers;
+       pathnode->path.parallel_workers = 0;
        pathnode->path.pathkeys = NIL;          /* Gather has unordered result */
  
 +      /* distribution is the same as in the subpath */
 +      pathnode->path.distribution = (Distribution *) copyObject(subpath->distribution);
 +
        pathnode->subpath = subpath;
+       pathnode->num_workers = subpath->parallel_workers;
        pathnode->single_copy = false;
  
-       if (pathnode->path.parallel_workers == 0)
+       if (pathnode->num_workers == 0)
        {
-               pathnode->path.parallel_workers = 1;
                pathnode->path.pathkeys = subpath->pathkeys;
+               pathnode->num_workers = 1;
                pathnode->single_copy = true;
        }
  
@@@ -3395,26 -2122,8 +3509,26 @@@ create_nestloop_path(PlannerInfo *root
        pathnode->innerjoinpath = inner_path;
        pathnode->joinrestrictinfo = restrict_clauses;
  
-       final_cost_nestloop(root, pathnode, workspace, sjinfo, semifactors);
 +#ifdef XCP
 +      pathnode->movedrestrictinfo = mclauses;
 +
 +      alternate = set_joinpath_distribution(root, pathnode);
 +#endif
+       final_cost_nestloop(root, pathnode, workspace, extra);
  
-               final_cost_nestloop(root, altpath, workspace, sjinfo, semifactors);
 +#ifdef XCP
 +      /*
 +       * Also calculate costs of all alternates and return cheapest path
 +       */
 +      foreach(lc, alternate)
 +      {
 +              NestPath *altpath = (NestPath *) lfirst(lc);
++              final_cost_nestloop(root, altpath, workspace, extra);
 +              if (altpath->path.total_cost < pathnode->path.total_cost)
 +                      pathnode = altpath;
 +      }
 +#endif
 +
        return pathnode;
  }
  
@@@ -3482,25 -2188,11 +3597,27 @@@ create_mergejoin_path(PlannerInfo *root
        pathnode->path_mergeclauses = mergeclauses;
        pathnode->outersortkeys = outersortkeys;
        pathnode->innersortkeys = innersortkeys;
 +#ifdef XCP
 +      alternate = set_joinpath_distribution(root, (JoinPath *) pathnode);
 +#endif
+       /* pathnode->skip_mark_restore will be set by final_cost_mergejoin */
        /* pathnode->materialize_inner will be set by final_cost_mergejoin */
-       final_cost_mergejoin(root, pathnode, workspace, sjinfo);
+       final_cost_mergejoin(root, pathnode, workspace, extra);
  
-               final_cost_mergejoin(root, altpath, workspace, sjinfo);
 +#ifdef XCP
 +      /*
 +       * Also calculate costs of all alternates and return cheapest path
 +       */
 +      foreach(lc, alternate)
 +      {
 +              MergePath *altpath = (MergePath *) lfirst(lc);
++              final_cost_mergejoin(root, altpath, workspace, extra);
 +              if (altpath->jpath.path.total_cost < pathnode->jpath.path.total_cost)
 +                      pathnode = altpath;
 +      }
 +#endif
 +
        return pathnode;
  }
  
@@@ -3573,25 -2260,10 +3689,26 @@@ create_hashjoin_path(PlannerInfo *root
        pathnode->jpath.innerjoinpath = inner_path;
        pathnode->jpath.joinrestrictinfo = restrict_clauses;
        pathnode->path_hashclauses = hashclauses;
 +#ifdef XCP
 +      alternate = set_joinpath_distribution(root, (JoinPath *) pathnode);
 +#endif
        /* final_cost_hashjoin will fill in pathnode->num_batches */
-       final_cost_hashjoin(root, pathnode, workspace, sjinfo, semifactors);
+       final_cost_hashjoin(root, pathnode, workspace, extra);
  
-               final_cost_hashjoin(root, altpath, workspace, sjinfo, semifactors);
 +#ifdef XCP
 +      /*
 +       * Calculate costs of all alternates and return cheapest path
 +       */
 +      foreach(lc, alternate)
 +      {
 +              HashPath *altpath = (HashPath *) lfirst(lc);
++              final_cost_hashjoin(root, altpath, workspace, extra);
 +              if (altpath->jpath.path.total_cost < pathnode->jpath.path.total_cost)
 +                      pathnode = altpath;
 +      }
 +#endif
 +
        return pathnode;
  }
  
@@@ -4029,9 -2755,19 +4213,22 @@@ create_groupingsets_path(PlannerInfo *r
        pathnode->path.parallel_workers = subpath->parallel_workers;
        pathnode->subpath = subpath;
  
 +      /* distribution is the same as in the subpath */
 +      pathnode->path.distribution = (Distribution *) copyObject(subpath->distribution);
 +
+       /*
+        * Simplify callers by downgrading AGG_SORTED to AGG_PLAIN, and AGG_MIXED
+        * to AGG_HASHED, here if possible.
+        */
+       if (aggstrategy == AGG_SORTED &&
+               list_length(rollups) == 1 &&
+               ((RollupData *) linitial(rollups))->groupClause == NIL)
+               aggstrategy = AGG_PLAIN;
+       if (aggstrategy == AGG_MIXED &&
+               list_length(rollups) == 1)
+               aggstrategy = AGG_HASHED;
        /*
         * Output will be in sorted order by group_pathkeys if, and only if, there
         * is a single rollup operation on a non-empty list of grouping
@@@ -4669,21 -3409,9 +4889,21 @@@ reparameterize_path(PlannerInfo *root, 
                                                                                                                rel,
                                                                                                                bpath->bitmapqual,
                                                                                                                required_outer,
-                                                                                                               loop_count);
+                                                                                                               loop_count, 0);
                        }
                case T_SubqueryScan:
 +#ifdef XCP
 +                      {
 +                              SubqueryScanPath *spath = (SubqueryScanPath *) path;
 +
 +                              return (Path *) create_subqueryscan_path(root,
 +                                                                                                               rel,
 +                                                                                                               spath->subpath,
 +                                                                                                               spath->path.pathkeys,
 +                                                                                                               required_outer,
 +                                                                                                               path->distribution);
 +                      }
 +#else
                        {
                                SubqueryScanPath *spath = (SubqueryScanPath *) path;
  
index 2b50919b10887bce1f535f5b943962ae701cbbd7,8f9dd9099b0c4ea16a46a7c4ba44eb1cf7cebb5c..aa8f6cf02024fefdfb7cc35371e13a121b834013
@@@ -4,8 -4,7 +4,8 @@@
   *       routines for accessing the system catalogs
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
  #include "parser/parse_relation.h"
  #include "parser/parsetree.h"
  #include "rewrite/rewriteManip.h"
+ #include "statistics/statistics.h"
  #include "storage/bufmgr.h"
+ #include "utils/builtins.h"
  #include "utils/lsyscache.h"
+ #include "utils/syscache.h"
  #include "utils/rel.h"
  #include "utils/snapmgr.h"
 -
 +#ifdef PGXC
 +#include "pgxc/pgxc.h"
 +#endif
  
  /* GUC parameter */
  int                   constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION;
Simple merge
index 90603dd5e556cedda9f986b6012ec39f4ca96580,86482eba26ee894cda3edb665190470b240456e8..020d6f74c4b25884cce849bbccf90420bd7cc753
@@@ -14,8 -14,7 +14,8 @@@
   * contain optimizable statements, which we should transform.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *    src/backend/parser/analyze.c
@@@ -2540,173 -2572,6 +2624,174 @@@ transformCreateTableAsStmt(ParseState *
        return result;
  }
  
-               Node   *parsetree = (Node *) lfirst(raw_parsetree_item);
-               result = parse_analyze(parsetree, query, NULL, 0);
 +#ifdef PGXC
 +/*
 + * transformExecDirectStmt -
 + *    transform an EXECUTE DIRECT Statement
 + *
 + * Handling is depends if we should execute on nodes or on Coordinator.
 + * To execute on nodes we return CMD_UTILITY query having one T_RemoteQuery node
 + * with the inner statement as a sql_command.
 + * If statement is to run on Coordinator we should parse inner statement and
 + * analyze resulting query tree.
 + */
 +static Query *
 +transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt)
 +{
 +      Query           *result = makeNode(Query);
 +      char            *query = stmt->query;
 +      List            *nodelist = stmt->node_names;
 +      RemoteQuery     *step = makeNode(RemoteQuery);
 +      bool            is_local = false;
 +      List            *raw_parsetree_list;
 +      ListCell        *raw_parsetree_item;
 +      char            *nodename;
 +      int                     nodeIndex;
 +      char            nodetype;
 +
 +      /* Support not available on Datanodes */
 +      if (IS_PGXC_DATANODE)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 +                               errmsg("EXECUTE DIRECT cannot be executed on a Datanode")));
 +
 +      if (list_length(nodelist) > 1)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 +                               errmsg("Support for EXECUTE DIRECT on multiple nodes is not available yet")));
 +
 +      Assert(list_length(nodelist) == 1);
 +      Assert(IS_PGXC_COORDINATOR);
 +
 +      /* There is a single element here */
 +      nodename = strVal(linitial(nodelist));
 +#ifdef XCP
 +      nodetype = PGXC_NODE_NONE;
 +      nodeIndex = PGXCNodeGetNodeIdFromName(nodename, &nodetype);
 +      if (nodetype == PGXC_NODE_NONE)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_UNDEFINED_OBJECT),
 +                               errmsg("PGXC Node %s: object not defined",
 +                                              nodename)));
 +#else
 +      nodeoid = get_pgxc_nodeoid(nodename);
 +
 +      if (!OidIsValid(nodeoid))
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_UNDEFINED_OBJECT),
 +                               errmsg("PGXC Node %s: object not defined",
 +                                              nodename)));
 +
 +      /* Get node type and index */
 +      nodetype = get_pgxc_nodetype(nodeoid);
 +      nodeIndex = PGXCNodeGetNodeId(nodeoid, get_pgxc_nodetype(nodeoid));
 +#endif
 +
 +      /* Check if node is requested is the self-node or not */
 +      if (nodetype == PGXC_NODE_COORDINATOR && nodeIndex == PGXCNodeId - 1)
 +              is_local = true;
 +
 +      /* Transform the query into a raw parse list */
 +      raw_parsetree_list = pg_parse_query(query);
 +
 +      /* EXECUTE DIRECT can just be executed with a single query */
 +      if (list_length(raw_parsetree_list) > 1)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 +                               errmsg("EXECUTE DIRECT cannot execute multiple queries")));
 +
 +      /*
 +       * Analyze the Raw parse tree
 +       * EXECUTE DIRECT is restricted to one-step usage
 +       */
 +      foreach(raw_parsetree_item, raw_parsetree_list)
 +      {
++              RawStmt   *parsetree = lfirst_node(RawStmt, raw_parsetree_item);
++              List *result_list = pg_analyze_and_rewrite(parsetree, query, NULL, 0, NULL);
++              result = linitial_node(Query, result_list);
 +      }
 +
 +      /* Default list of parameters to set */
 +      step->sql_statement = NULL;
 +      step->exec_nodes = makeNode(ExecNodes);
 +      step->combine_type = COMBINE_TYPE_NONE;
 +      step->sort = NULL;
 +      step->read_only = true;
 +      step->force_autocommit = false;
 +      step->cursor = NULL;
 +
 +      /* This is needed by executor */
 +      step->sql_statement = pstrdup(query);
 +      if (nodetype == PGXC_NODE_COORDINATOR)
 +              step->exec_type = EXEC_ON_COORDS;
 +      else
 +              step->exec_type = EXEC_ON_DATANODES;
 +
 +      step->reduce_level = 0;
 +      step->base_tlist = NIL;
 +      step->outer_alias = NULL;
 +      step->inner_alias = NULL;
 +      step->outer_reduce_level = 0;
 +      step->inner_reduce_level = 0;
 +      step->outer_relids = NULL;
 +      step->inner_relids = NULL;
 +      step->inner_statement = NULL;
 +      step->outer_statement = NULL;
 +      step->join_condition = NULL;
 +
 +      /* Change the list of nodes that will be executed for the query and others */
 +      step->force_autocommit = false;
 +      step->combine_type = COMBINE_TYPE_SAME;
 +      step->read_only = true;
 +      step->exec_direct_type = EXEC_DIRECT_NONE;
 +
 +      /* Set up EXECUTE DIRECT flag */
 +      if (is_local)
 +      {
 +              if (result->commandType == CMD_UTILITY)
 +                      step->exec_direct_type = EXEC_DIRECT_LOCAL_UTILITY;
 +              else
 +                      step->exec_direct_type = EXEC_DIRECT_LOCAL;
 +      }
 +      else
 +      {
 +              switch(result->commandType)
 +              {
 +                      case CMD_UTILITY:
 +                              step->exec_direct_type = EXEC_DIRECT_UTILITY;
 +                              break;
 +                      case CMD_SELECT:
 +                              step->exec_direct_type = EXEC_DIRECT_SELECT;
 +                              break;
 +                      case CMD_INSERT:
 +                              step->exec_direct_type = EXEC_DIRECT_INSERT;
 +                              break;
 +                      case CMD_UPDATE:
 +                              step->exec_direct_type = EXEC_DIRECT_UPDATE;
 +                              break;
 +                      case CMD_DELETE:
 +                              step->exec_direct_type = EXEC_DIRECT_DELETE;
 +                              break;
 +                      default:
 +                              Assert(0);
 +              }
 +      }
 +
 +      /* Build Execute Node list, there is a unique node for the time being */
 +      step->exec_nodes->nodeList = lappend_int(step->exec_nodes->nodeList, nodeIndex);
 +
 +      if (!is_local)
 +              result->utilityStmt = (Node *) step;
 +
 +      /*
 +       * Reset the queryId since the caller would do that anyways.
 +       */
 +      result->queryId = 0;
 +
 +      return result;
 +}
 +
 +#endif
  
  /*
   * Produce a string representation of a LockClauseStrength value.
index 54af97691772594072be56228bf106506084bc15,7e03624eb45a1617b69e7e5767f39b34b612cd13..7fa2f21e3f4f71f4362754ca21abf24cb99c90d3
@@@ -6,9 -6,8 +6,9 @@@
   * gram.y
   *      POSTGRESQL BISON rules/actions
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *
   * IDENTIFICATION
@@@ -239,11 -237,10 +249,15 @@@ static Node *makeRecursiveViewSelect(ch
        struct ImportQual       *importqual;
        InsertStmt                      *istmt;
        VariableSetStmt         *vsetstmt;
 +/* PGXC_BEGIN */
 +      struct StmtMulti                        *stmtmulti;
 +      DistributeBy            *distby;
 +      PGXCSubCluster          *subclus;
 +/* PGXC_END */
+       PartitionElem           *partelem;
+       PartitionSpec           *partspec;
+       PartitionBoundSpec      *partboundspec;
+       RoleSpec                        *rolespec;
  }
  
  %type <node>  stmt schema_stmt
                CreateUserStmt CreateUserMappingStmt CreateRoleStmt CreatePolicyStmt
                CreatedbStmt DeclareCursorStmt DefineStmt DeleteStmt DiscardStmt DoStmt
                DropGroupStmt DropOpClassStmt DropOpFamilyStmt DropPLangStmt DropStmt
-               DropAssertStmt DropTrigStmt DropRuleStmt DropCastStmt DropRoleStmt
-               DropPolicyStmt DropUserStmt DropdbStmt DropTableSpaceStmt DropFdwStmt
+               DropAssertStmt DropCastStmt DropRoleStmt
+               DropUserStmt DropdbStmt DropTableSpaceStmt
                DropTransformStmt
-               DropForeignServerStmt DropUserMappingStmt ExplainStmt ExecDirectStmt FetchStmt
 -              DropUserMappingStmt ExplainStmt FetchStmt
++              DropUserMappingStmt ExplainStmt ExecDirectStmt FetchStmt
                GrantStmt GrantRoleStmt ImportForeignSchemaStmt IndexStmt InsertStmt
                ListenStmt LoadStmt LockStmt NotifyStmt ExplainableStmt PreparableStmt
                CreateFunctionStmt AlterFunctionStmt ReindexStmt RemoveAggrStmt
                DeallocateStmt PrepareStmt ExecuteStmt
                DropOwnedStmt ReassignOwnedStmt
                AlterTSConfigurationStmt AlterTSDictionaryStmt
 +              BarrierStmt PauseStmt AlterNodeStmt CreateNodeStmt DropNodeStmt
 +              CreateNodeGroupStmt DropNodeGroupStmt
                CreateMatViewStmt RefreshMatViewStmt CreateAmStmt
+               CreatePublicationStmt AlterPublicationStmt
+               CreateSubscriptionStmt AlterSubscriptionStmt DropSubscriptionStmt
  
  %type <node>  select_no_parens select_with_parens select_clause
                                simple_select values_clause
  %type <windef>        window_definition over_clause window_specification
                                opt_frame_clause frame_extent frame_bound
  %type <str>           opt_existing_window_name
 +/* PGXC_BEGIN */
 +%type <str>           opt_barrier_id OptDistributeType DistributeStyle OptDistKey
 +%type <distby>        OptDistributeBy OptDistributeByInternal
 +%type <subclus> OptSubCluster OptSubClusterInternal
 +/* PGXC_END */
  %type <boolean> opt_if_not_exists
+ %type <ival>  generated_when override_kind
+ %type <partspec>      PartitionSpec OptPartitionSpec
+ %type <str>                   part_strategy
+ %type <partelem>      part_elem
+ %type <list>          part_params
+ %type <partboundspec> ForValues
+ %type <node>          partbound_datum PartitionRangeDatum
+ %type <list>          partbound_datum_list range_datum_list
  
  /*
   * Non-keyword token types.  These are hard-wired into the "flex" lexer.
   */
  
  /* ordinary key words in alphabetical order */
 +/* PGXC - added DISTRIBUTE, DISTRIBUTED, DISTSYLE, DISTKEY, RANDOMLY, DIRECT, COORDINATOR, CLEAN,  NODE, BARRIER */
  %token <keyword> ABORT_P ABSOLUTE_P ACCESS ACTION ADD_P ADMIN AFTER
        AGGREGATE ALL ALSO ALTER ALWAYS ANALYSE ANALYZE AND ANY ARRAY AS ASC
-       ASSERTION ASSIGNMENT ASYMMETRIC AT ATTRIBUTE AUTHORIZATION
+       ASSERTION ASSIGNMENT ASYMMETRIC AT ATTACH ATTRIBUTE AUTHORIZATION
  
 -      BACKWARD BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT
 +      BACKWARD BARRIER BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT
        BOOLEAN_P BOTH BY
  
        CACHE CALLED CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P
 -      CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE
 +      CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLEAN CLOSE
-       CLUSTER COALESCE COLLATE COLLATION COLUMN COMMENT COMMENTS COMMIT
-       COMMITTED CONCURRENTLY CONFIGURATION CONFLICT CONNECTION CONSTRAINT CONSTRAINTS
-       CONTENT_P CONTINUE_P CONVERSION_P COORDINATOR COPY COST CREATE
+       CLUSTER COALESCE COLLATE COLLATION COLUMN COLUMNS COMMENT COMMENTS COMMIT
+       COMMITTED CONCURRENTLY CONFIGURATION CONFLICT CONNECTION CONSTRAINT
 -      CONSTRAINTS CONTENT_P CONTINUE_P CONVERSION_P COPY COST CREATE
++      CONSTRAINTS CONTENT_P CONTINUE_P CONVERSION_P COORDINATOR COPY COST CREATE
        CROSS CSV CUBE CURRENT_P
        CURRENT_CATALOG CURRENT_DATE CURRENT_ROLE CURRENT_SCHEMA
        CURRENT_TIME CURRENT_TIMESTAMP CURRENT_USER CURSOR CYCLE
  
        DATA_P DATABASE DAY_P DEALLOCATE DEC DECIMAL_P DECLARE DEFAULT DEFAULTS
        DEFERRABLE DEFERRED DEFINER DELETE_P DELIMITER DELIMITERS DEPENDS DESC
- /* PGXC_BEGIN */
-       DICTIONARY DIRECT DISABLE_P DISCARD DISTINCT DISTKEY DISTRIBUTE DISTRIBUTED
-       DISTSTYLE DO DOCUMENT_P DOMAIN_P DOUBLE_P
- /* PGXC_END */
-       DROP
 -      DETACH DICTIONARY DISABLE_P DISCARD DISTINCT DO DOCUMENT_P DOMAIN_P
++      DETACH DICTIONARY DIRECT DISABLE_P DISCARD DISTINCT DISTKEY DISTRIBUTE DISTRIBUTED DISTSTYLE DO DOCUMENT_P DOMAIN_P
+       DOUBLE_P DROP
  
        EACH ELSE ENABLE_P ENCODING ENCRYPTED END_P ENUM_P ESCAPE EVENT EXCEPT
        EXCLUDE EXCLUDING EXCLUSIVE EXECUTE EXISTS EXPLAIN
  
        MAPPING MATCH MATERIALIZED MAXVALUE METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE
  
-       NAME_P NAMES NATIONAL NATURAL NCHAR NEXT NO NODE NONE
 -      NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NONE
++      NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NODE NONE
        NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF
        NULLS_P NUMERIC
  
-       OBJECT_P OF OFF OFFSET OIDS ON ONLY OPERATOR OPTION OPTIONS OR
-       ORDER ORDINALITY OUT_P OUTER_P OVER OVERLAPS OVERLAY OWNED OWNER
+       OBJECT_P OF OFF OFFSET OIDS OLD ON ONLY OPERATOR OPTION OPTIONS OR
+       ORDER ORDINALITY OUT_P OUTER_P OVER OVERLAPS OVERLAY OVERRIDING OWNED OWNER
  
 -      PARALLEL PARSER PARTIAL PARTITION PASSING PASSWORD PLACING PLANS POLICY
 -      POSITION PRECEDING PRECISION PRESERVE PREPARE PREPARED PRIMARY
 +      PARALLEL PARSER PARTIAL PARTITION PASSING PASSWORD PAUSE PLACING PLANS POLICY
 +      POSITION PRECEDING PRECISION PREFERRED PRESERVE PREPARE PREPARED PRIMARY
-       PRIOR PRIVILEGES PROCEDURAL PROCEDURE PROGRAM
+       PRIOR PRIVILEGES PROCEDURAL PROCEDURE PROGRAM PUBLICATION
  
        QUOTE
  
-       RANDOMLY RANGE READ REAL REASSIGN RECHECK RECURSIVE REF REFERENCES REFRESH REINDEX
-       RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA
 -      RANGE READ REAL REASSIGN RECHECK RECURSIVE REF REFERENCES REFERENCING
++      RANDOMLY RANGE READ REAL REASSIGN RECHECK RECURSIVE REF REFERENCES REFERENCING
+       REFRESH REINDEX RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA
        RESET RESTART RESTRICT RETURNING RETURNS REVOKE RIGHT ROLE ROLLBACK ROLLUP
        ROW ROWS RULE
  
@@@ -774,85 -778,34 +811,112 @@@ stmtblock:     stmtmult
                        }
                ;
  
- /* the thrashing around here is to discard "empty" statements... */
+ /*
+  * At top level, we wrap each stmt with a RawStmt node carrying start location
+  * and length of the stmt's text.  Notice that the start loc/len are driven
+  * entirely from semicolon locations (@2).  It would seem natural to use
+  * @1 or @3 to get the true start location of a stmt, but that doesn't work
+  * for statements that can start with empty nonterminals (opt_with_clause is
+  * the main offender here); as noted in the comments for YYLLOC_DEFAULT,
+  * we'd get -1 for the location in such cases.
+  * We also take care to discard empty statements entirely.
+  */
  stmtmulti:    stmtmulti ';' stmt
                                {
++                                      /* 
++                                       * XXX PG10MERGE: Looks like support for obtaining raw
++                                       * query string for individual commands is added in PG10.
++                                       * If so, we can make use of the same infrastructure.
++                                       *
++                                       * XXX The following gives a compilation WARNING because
++                                       * stmtmulti is defined as a List in PG10, but we have our
++                                       * own definition.
++                                       */
+                                       if ($1 != NIL)
+                                       {
+                                               /* update length of previous stmt */
+                                               updateRawStmtEnd(llast_node(RawStmt, $1), @2);
+                                       }
 +                                      if ($3 != NULL)
 +                                      {
 +                                              char *query;
 +                                              ListCell *last;
 +                                              /*
 +                                               * Because of the way multi-commands are parsed by the
 +                                               * parser, when the earlier command was parsed and
 +                                               * reduced to a 'stmtmulti', we did not have the
 +                                               * end-of-the-query marker. But now that we have seen
 +                                               * the ';' token, add '\0' at the corresponding offset
 +                                               * to get a separated command.
 +                                               */
 +                                              if ($1 != NULL)
 +                                              {
 +                                                      last = list_tail($1->queries);
 +                                                      query = palloc(@2 - $1->offset + 1);
 +                                                      memcpy(query, lfirst(last), @2 - $1->offset);
 +                                                      query[@2 - $1->offset] = '\0';
 +                                                      lfirst(last) = query;
 +
 +                                                      query = scanner_get_query(@3, -1, yyscanner);
 +                                                      $1->offset = @2;
 +                                                      $1->parsetrees = lappend($1->parsetrees, $3);
 +                                                      $1->queries = lappend($1->queries, query);
 +                                                      $$ = $1;
 +                                              }
 +                                              /*
 +                                               *
 +                                               * If the earlier statements were all null, then we
 +                                               * must initialise the StmtMulti structure and make
 +                                               * singleton lists
 +                                               */
 +                                              else
 +                                              {
 +                                                      StmtMulti *n = (StmtMulti *) palloc0(sizeof (StmtMulti));
 +                                                      query = scanner_get_query(@3, -1, yyscanner);
 +                                                      n->offset = @2;
 +                                                      n->parsetrees = list_make1($3);
 +                                                      n->queries = list_make1(query);
 +                                                      $$ = n;
 +                                              }
 +                                      }
+                                       if ($3 != NULL)
+                                               $$ = lappend($1, makeRawStmt($3, @2 + 1));
                                        else
                                                $$ = $1;
                                }
                        | stmt
                                {
 +                                      if ($1 != NULL)
 +                                      {
 +                                              StmtMulti *n = (StmtMulti *) palloc0(sizeof (StmtMulti));
 +                                              char *query = scanner_get_query(@1, -1, yyscanner);
 +
 +                                              /*
 +                                               * Keep track of the offset where $1 started. We don't
 +                                               * have the offset where it ends so we copy the entire
 +                                               * query to the end. If later, we find a ';' followed
 +                                               * by another command, we'll add the '\0' at the
 +                                               * appropriate offset
 +                                               *
 +                                               * XXX May be there is a better way to get the matching  
 +                                               * portion of the query string, but this does the trick
 +                                               * for regression as well as the problem we are trying
 +                                               * to solve with multi-command queries
 +                                               */
 +                                              n->offset = @1;
 +
 +                                              /*
 +                                               * Collect both parsetree as well as the original query
 +                                               * that resulted in the parsetree
 +                                               */
 +                                              n->parsetrees = list_make1($1);
 +                                              n->queries = list_make1(query);
 +                                              $$ = n;
 +                                      }
+                                       if ($1 != NULL)
+                                               $$ = list_make1(makeRawStmt($1, 0));
                                        else
 -                                              $$ = NIL;
 +                                              $$ = NULL;
                                }
                ;
  
@@@ -910,10 -863,9 +977,11 @@@ stmt 
                        | CreateFunctionStmt
                        | CreateGroupStmt
                        | CreateMatViewStmt
 +                      | CreateNodeGroupStmt
 +                      | CreateNodeStmt
                        | CreateOpClassStmt
                        | CreateOpFamilyStmt
+                       | CreatePublicationStmt
                        | AlterOpFamilyStmt
                        | CreatePolicyStmt
                        | CreatePLangStmt
                        | DoStmt
                        | DropAssertStmt
                        | DropCastStmt
-                       | DropFdwStmt
-                       | DropForeignServerStmt
                        | DropGroupStmt
 +                      | DropNodeGroupStmt
 +                      | DropNodeStmt
                        | DropOpClassStmt
                        | DropOpFamilyStmt
                        | DropOwnedStmt
@@@ -2958,74 -3062,43 +3223,77 @@@ copy_generic_opt_arg_list_item
   *****************************************************************************/
  
  CreateStmt:   CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')'
-                       OptInherit OptWith OnCommitOption OptTableSpace
+                       OptInherit OptPartitionSpec OptWith OnCommitOption OptTableSpace
 +/* PGXC_BEGIN */
 +                      OptDistributeBy OptSubCluster
 +/* PGXC_END */
                                {
                                        CreateStmt *n = makeNode(CreateStmt);
                                        $4->relpersistence = $2;
                                        n->relation = $4;
                                        n->tableElts = $6;
                                        n->inhRelations = $8;
+                                       n->partspec = $9;
                                        n->ofTypename = NULL;
                                        n->constraints = NIL;
-                                       n->options = $9;
-                                       n->oncommit = $10;
-                                       n->tablespacename = $11;
+                                       n->options = $10;
+                                       n->oncommit = $11;
+                                       n->tablespacename = $12;
                                        n->if_not_exists = false;
-                                       n->distributeby = $12;
-                                       n->subcluster = $13;
 +/* PGXC_BEGIN */
 +                                      if ($2 == RELPERSISTENCE_LOCAL_TEMP)
 +                                      {
 +                                              $4->relpersistence = RELPERSISTENCE_TEMP;
 +                                              n->islocal = true;
 +                                      }
 +                                      n->relkind = RELKIND_RELATION;
++                                      n->distributeby = $13;
++                                      n->subcluster = $14;
 +/* PGXC_END */
                                        $$ = (Node *)n;
                                }
                | CREATE OptTemp TABLE IF_P NOT EXISTS qualified_name '('
-                       OptTableElementList ')' OptInherit OptWith OnCommitOption
-                       OptTableSpace
+                       OptTableElementList ')' OptInherit OptPartitionSpec OptWith
+                       OnCommitOption OptTableSpace
 +/* PGXC_BEGIN */
 +                      OptDistributeBy OptSubCluster
 +/* PGXC_END */
                                {
                                        CreateStmt *n = makeNode(CreateStmt);
                                        $7->relpersistence = $2;
                                        n->relation = $7;
                                        n->tableElts = $9;
                                        n->inhRelations = $11;
+                                       n->partspec = $12;
                                        n->ofTypename = NULL;
                                        n->constraints = NIL;
-                                       n->options = $12;
-                                       n->oncommit = $13;
-                                       n->tablespacename = $14;
+                                       n->options = $13;
+                                       n->oncommit = $14;
+                                       n->tablespacename = $15;
                                        n->if_not_exists = true;
-                                       n->distributeby = $15;
-                                       n->subcluster = $16;
 +/* PGXC_BEGIN */
 +                                      if ($2 == RELPERSISTENCE_LOCAL_TEMP)
 +                                      {
 +                                              $7->relpersistence = RELPERSISTENCE_TEMP;
 +                                              n->islocal = true;
 +                                      }
 +                                      n->relkind = RELKIND_RELATION;
++                                      n->distributeby = $16;
++                                      n->subcluster = $17;
 +                                      if (n->inhRelations != NULL && n->distributeby != NULL)
 +                                              ereport(ERROR,
 +                                                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                               errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"),
 +                                                               parser_errposition(exprLocation((Node *) n->distributeby))));
 +/* PGXC_END */
                                        $$ = (Node *)n;
                                }
                | CREATE OptTemp TABLE qualified_name OF any_name
-                       OptTypedTableElementList OptWith OnCommitOption OptTableSpace
+                       OptTypedTableElementList OptPartitionSpec OptWith OnCommitOption
+                       OptTableSpace
 +/* PGXC_BEGIN */
 +                      OptDistributeBy OptSubCluster
 +/* PGXC_END */
                                {
                                        CreateStmt *n = makeNode(CreateStmt);
                                        $4->relpersistence = $2;
                                        n->ofTypename = makeTypeNameFromNameList($6);
                                        n->ofTypename->location = @6;
                                        n->constraints = NIL;
-                                       n->options = $8;
-                                       n->oncommit = $9;
-                                       n->tablespacename = $10;
+                                       n->options = $9;
+                                       n->oncommit = $10;
+                                       n->tablespacename = $11;
                                        n->if_not_exists = false;
-                                       n->distributeby = $11;
-                                       n->subcluster = $12;
 +/* PGXC_BEGIN */
 +                                      if ($2 == RELPERSISTENCE_LOCAL_TEMP)
 +                                      {
 +                                              $4->relpersistence = RELPERSISTENCE_TEMP;
 +                                              n->islocal = true;
 +                                      }
 +                                      n->relkind = RELKIND_RELATION;
++                                      n->distributeby = $12;
++                                      n->subcluster = $13;
 +                                      if (n->inhRelations != NULL && n->distributeby != NULL)
 +                                              ereport(ERROR,
 +                                                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                               errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"),
 +                                                               parser_errposition(exprLocation((Node *) n->distributeby))));
 +/* PGXC_END */
                                        $$ = (Node *)n;
                                }
                | CREATE OptTemp TABLE IF_P NOT EXISTS qualified_name OF any_name
-                       OptTypedTableElementList OptWith OnCommitOption OptTableSpace
+                       OptTypedTableElementList OptPartitionSpec OptWith OnCommitOption
+                       OptTableSpace
 +/* PGXC_BEGIN */
 +                      OptDistributeBy OptSubCluster
 +/* PGXC_END */
                                {
                                        CreateStmt *n = makeNode(CreateStmt);
                                        $7->relpersistence = $2;
                                        n->ofTypename = makeTypeNameFromNameList($9);
                                        n->ofTypename->location = @9;
                                        n->constraints = NIL;
-                                       n->options = $11;
-                                       n->oncommit = $12;
-                                       n->tablespacename = $13;
+                                       n->options = $12;
+                                       n->oncommit = $13;
+                                       n->tablespacename = $14;
                                        n->if_not_exists = true;
-                                       n->distributeby = $14;
-                                       n->subcluster = $15;
 +/* PGXC_BEGIN */
 +                                      if ($2 == RELPERSISTENCE_LOCAL_TEMP)
 +                                      {
 +                                              $7->relpersistence = RELPERSISTENCE_TEMP;
 +                                              n->islocal = true;
 +                                      }
 +                                      n->relkind = RELKIND_RELATION;
++                                      n->distributeby = $15;
++                                      n->subcluster = $16;
 +                                      if (n->inhRelations != NULL && n->distributeby != NULL)
 +                                              ereport(ERROR,
 +                                                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                               errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"),
 +                                                               parser_errposition(exprLocation((Node *) n->distributeby))));
 +/* PGXC_END */
                                        $$ = (Node *)n;
                                }
+               | CREATE OptTemp TABLE qualified_name PARTITION OF qualified_name
+                       OptTypedTableElementList ForValues OptPartitionSpec OptWith
+                       OnCommitOption OptTableSpace
+                               {
+                                       CreateStmt *n = makeNode(CreateStmt);
+                                       $4->relpersistence = $2;
+                                       n->relation = $4;
+                                       n->tableElts = $8;
+                                       n->inhRelations = list_make1($7);
+                                       n->partbound = $9;
+                                       n->partspec = $10;
+                                       n->ofTypename = NULL;
+                                       n->constraints = NIL;
+                                       n->options = $11;
+                                       n->oncommit = $12;
+                                       n->tablespacename = $13;
+                                       n->if_not_exists = false;
+                                       $$ = (Node *)n;
+                               }
+               | CREATE OptTemp TABLE IF_P NOT EXISTS qualified_name PARTITION OF
+                       qualified_name OptTypedTableElementList ForValues OptPartitionSpec
+                       OptWith OnCommitOption OptTableSpace
+                               {
+                                       CreateStmt *n = makeNode(CreateStmt);
+                                       $7->relpersistence = $2;
+                                       n->relation = $7;
+                                       n->tableElts = $11;
+                                       n->inhRelations = list_make1($10);
+                                       n->partbound = $12;
+                                       n->partspec = $13;
+                                       n->ofTypename = NULL;
+                                       n->constraints = NIL;
+                                       n->options = $14;
+                                       n->oncommit = $15;
+                                       n->tablespacename = $16;
+                                       n->if_not_exists = true;
+                                       $$ = (Node *)n;
+                               }
                ;
  
  /*
@@@ -14371,11 -14618,9 +15259,12 @@@ unreserved_keyword
                        | ASSERTION
                        | ASSIGNMENT
                        | AT
+                       | ATTACH
                        | ATTRIBUTE
                        | BACKWARD
 +/* PGXC_BEGIN */
 +                      | BARRIER
 +/* PGXC_END */
                        | BEFORE
                        | BEGIN_P
                        | BY
                        | CHARACTERISTICS
                        | CHECKPOINT
                        | CLASS
 +                      | CLEAN
                        | CLOSE
                        | CLUSTER
+                       | COLUMNS
                        | COMMENT
                        | COMMENTS
                        | COMMIT
                        | DELIMITER
                        | DELIMITERS
                        | DEPENDS
+                       | DETACH
                        | DICTIONARY
 +                      | DIRECT
                        | DISABLE_P
                        | DISCARD
 +/* PGXC_BEGIN */
 +                      | DISTKEY
 +                      | DISTRIBUTE
 +                      | DISTRIBUTED
 +                      | DISTSTYLE
 +/* PGXC_END */
                        | DOCUMENT_P
                        | DOMAIN_P
                        | DOUBLE_P
                        | MOVE
                        | NAME_P
                        | NAMES
+                       | NEW
                        | NEXT
                        | NO
 +                      | NODE
                        | NOTHING
                        | NOTIFY
                        | NOWAIT
                        | PROCEDURAL
                        | PROCEDURE
                        | PROGRAM
+                       | PUBLICATION
                        | QUOTE
 +/* PGXC_BEGIN */
 +                      | RANDOMLY
 +/* PGXC_END */
                        | RANGE
                        | READ
                        | REASSIGN
index 6876f2a3d4423712f9aa08cd44666a56bff4c6b2,efe1c371efc205f7b87c63fb1da07c2313cfdb96..9fc0371cb35822af016fbcdde49bf60942039ed2
@@@ -3,8 -3,7 +3,8 @@@
   * parse_agg.c
   *      handle aggregates and window functions in parser
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
Simple merge
Simple merge
index c10e272d72866b8906e1d0254dfd0b97c198729c,e412d0f9d30b8779594b9543bf194bee3472148d..8ae8b00236c35bce0c83b28cbab6e34d1a9b55bc
@@@ -3,8 -3,7 +3,8 @@@
   * parse_relation.c
   *      parser support routines dealing with relations
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
  #include "utils/lsyscache.h"
  #include "utils/rel.h"
  #include "utils/syscache.h"
 +#ifdef XCP
 +#include "utils/guc.h"
 +#include "catalog/pg_statistic.h"
 +#include "catalog/pg_namespace.h"
 +#include "pgxc/pgxc.h"
 +#include "miscadmin.h"
 +#endif
+ #include "utils/varlena.h"
  
  
  #define MAX_FUZZY_DISTANCE                            3
Simple merge
Simple merge
index bd0a6202859a7ce689512958602cc24282e3b8d1,9134fb9d63c1c5f06f2ff097e9c573889c44e690..c04e77775ed9f4ff408aa756071be20306cb8108
   * a quick copyObject() call before manipulating the query tree.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *    src/backend/parser/parse_utilcmd.c
   *
  #include "catalog/pg_opclass.h"
  #include "catalog/pg_operator.h"
  #include "catalog/pg_type.h"
 +#ifdef XCP
 +#include "catalog/pgxc_node.h"
 +#endif
  #include "commands/comment.h"
  #include "commands/defrem.h"
+ #include "commands/sequence.h"
  #include "commands/tablecmds.h"
  #include "commands/tablespace.h"
  #include "miscadmin.h"
@@@ -111,12 -90,8 +114,14 @@@ typedef struc
        List       *alist;                      /* "after list" of things to do after creating
                                                                 * the table */
        IndexStmt  *pkey;                       /* PRIMARY KEY index, if any */
 +#ifdef PGXC
 +      FallbackSrc fallback_source;
 +      List       *fallback_dist_cols;
 +      DistributeBy    *distributeby;          /* original distribute by column of CREATE TABLE */
 +      PGXCSubCluster  *subcluster;            /* original subcluster option of CREATE TABLE */
 +#endif
+       bool            ispartitioned;  /* true if table is partitioned */
+       PartitionBoundSpec *partbound;          /* transformed FOR VALUES */
  } CreateStmtContext;
  
  /* State shared by transformCreateSchemaStmt and its subroutines */
@@@ -162,13 -134,10 +167,17 @@@ static void transformConstraintAttrs(Cr
                                                 List *constraintList);
  static void transformColumnType(CreateStmtContext *cxt, ColumnDef *column);
  static void setSchemaName(char *context_schema, char **stmt_schema_name);
 +#ifdef PGXC
 +static void checkLocalFKConstraints(CreateStmtContext *cxt);
 +#endif
 +#ifdef XCP
 +static List *transformSubclusterNodes(PGXCSubCluster *subcluster);
 +static PGXCSubCluster *makeSubCluster(List *nodelist);
 +#endif
+ static void transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd);
+ static Const *transformPartitionBoundValue(ParseState *pstate, A_Const *con,
+                                                 const char *colName, Oid colType, int32 colTypmod);
  
  /*
   * transformCreateStmt -
@@@ -274,12 -237,7 +283,13 @@@ transformCreateStmt(CreateStmt *stmt, c
        cxt.blist = NIL;
        cxt.alist = NIL;
        cxt.pkey = NULL;
 +#ifdef PGXC
 +      cxt.fallback_source = FBS_NONE;
 +      cxt.fallback_dist_cols = NIL;
 +      cxt.distributeby = stmt->distributeby;
 +      cxt.subcluster = stmt->subcluster;
 +#endif
+       cxt.ispartitioned = stmt->partspec != NULL;
  
        /*
         * Notice that we allow OIDs here only for plain tables, even though
@@@ -551,86 -541,14 +682,15 @@@ transformColumnDefinition(CreateStmtCon
                char       *snamespace;
                char       *sname;
                char       *qstring;
 -              A_Const    *snamenode;
 +              A_Const    *snamenode;
                TypeCast   *castnode;
                FuncCall   *funccallnode;
-               CreateSeqStmt *seqstmt;
-               AlterSeqStmt *altseqstmt;
-               List       *attnamelist;
-               /*
-                * Determine namespace and name to use for the sequence.
-                *
-                * Although we use ChooseRelationName, it's not guaranteed that the
-                * selected sequence name won't conflict; given sufficiently long
-                * field names, two different serial columns in the same table could
-                * be assigned the same sequence name, and we'd not notice since we
-                * aren't creating the sequence quite yet.  In practice this seems
-                * quite unlikely to be a problem, especially since few people would
-                * need two serial columns in one table.
-                */
-               if (cxt->rel)
-                       snamespaceid = RelationGetNamespace(cxt->rel);
-               else
-               {
-                       snamespaceid = RangeVarGetCreationNamespace(cxt->relation);
-                       RangeVarAdjustRelationPersistence(cxt->relation, snamespaceid);
-               }
-               snamespace = get_namespace_name(snamespaceid);
-               sname = ChooseRelationName(cxt->relation->relname,
-                                                                  column->colname,
-                                                                  "seq",
-                                                                  snamespaceid);
-               ereport(DEBUG1,
-                               (errmsg("%s will create implicit sequence \"%s\" for serial column \"%s.%s\"",
-                                               cxt->stmtType, sname,
-                                               cxt->relation->relname, column->colname)));
-               /*
-                * Build a CREATE SEQUENCE command to create the sequence object, and
-                * add it to the list of things to be done before this CREATE/ALTER
-                * TABLE.
-                */
-               seqstmt = makeNode(CreateSeqStmt);
-               seqstmt->sequence = makeRangeVar(snamespace, sname, -1);
-               seqstmt->options = NIL;
- #ifdef PGXC
-               seqstmt->is_serial = true;
- #endif
-               /*
-                * If this is ALTER ADD COLUMN, make sure the sequence will be owned
-                * by the table's owner.  The current user might be someone else
-                * (perhaps a superuser, or someone who's only a member of the owning
-                * role), but the SEQUENCE OWNED BY mechanisms will bleat unless table
-                * and sequence have exactly the same owning role.
-                */
-               if (cxt->rel)
-                       seqstmt->ownerId = cxt->rel->rd_rel->relowner;
-               else
-                       seqstmt->ownerId = InvalidOid;
-               cxt->blist = lappend(cxt->blist, seqstmt);
-               /*
-                * Build an ALTER SEQUENCE ... OWNED BY command to mark the sequence
-                * as owned by this column, and add it to the list of things to be
-                * done after this CREATE/ALTER TABLE.
-                */
-               altseqstmt = makeNode(AlterSeqStmt);
-               altseqstmt->sequence = makeRangeVar(snamespace, sname, -1);
- #ifdef PGXC
-               altseqstmt->is_serial = true;
- #endif
-               attnamelist = list_make3(makeString(snamespace),
-                                                                makeString(cxt->relation->relname),
-                                                                makeString(column->colname));
-               altseqstmt->options = list_make1(makeDefElem("owned_by",
-                                                                                                        (Node *) attnamelist));
+               Constraint *constraint;
  
-               cxt->alist = lappend(cxt->alist, altseqstmt);
++              /* XXX XL 9.6 was setting stmt->is_serial. CHECK */
+               generateSerialExtraStmts(cxt, column,
+                                                                column->typeName->typeOid, NIL, false,
+                                                                &snamespace, &sname);
  
                /*
                 * Create appropriate constraints for SERIAL.  We do this in full,
@@@ -2824,12 -2670,8 +2973,14 @@@ transformAlterTableStmt(Oid relid, Alte
        cxt.blist = NIL;
        cxt.alist = NIL;
        cxt.pkey = NULL;
 +#ifdef PGXC
 +      cxt.fallback_source = FBS_NONE;
 +      cxt.fallback_dist_cols = NIL;
 +      cxt.distributeby = NULL;
 +      cxt.subcluster = NULL;
 +#endif
+       cxt.ispartitioned = (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
+       cxt.partbound = NULL;
  
        /*
         * The only subtypes that currently require parse transformation handling
@@@ -3299,495 -3239,274 +3564,767 @@@ setSchemaName(char *context_schema, cha
                                                *stmt_schema_name, context_schema)));
  }
  
 +#ifdef PGXC
 +/*
 + * CheckLocalIndexColumn
 + *
 + * Checks whether or not the index can be safely enforced locally
 + */
 +bool
 +CheckLocalIndexColumn (char loctype, char *partcolname, char *indexcolname)
 +{
 +      if (IsLocatorReplicated(loctype))
 +              /* always safe */
 +              return true;
 +      if (loctype == LOCATOR_TYPE_RROBIN)
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
 +                                      errmsg("Cannot locally enforce a unique index on round robin distributed table.")));
 +      else if (loctype == LOCATOR_TYPE_HASH || loctype == LOCATOR_TYPE_MODULO)
 +      {
 +              if (partcolname && indexcolname && strcmp(partcolname, indexcolname) == 0)
 +                      return true;
 +      }
 +      return false;
 +}
 +
 +/*
 + * Given relation, find the index of the attribute in the primary key,
 + * which is the distribution key. Returns -1 if table is not a Hash/Modulo
 + * distributed, does not have a primary key or distribution key is not in the
 + * primary key (last should not happen).
 + */
 +static int
 +find_relation_pk_dist_index(Relation rel)
 +{
 +      int             result = -1;
 +      List       *indexoidlist;
 +      ListCell   *indexoidscan;
 +      int                     partAttNum = InvalidAttrNumber;
 +      bool            pk_found = false;
 +
 +      if (rel->rd_locator_info)
 +              partAttNum = rel->rd_locator_info->partAttrNum;
 +
 +      if (partAttNum == InvalidAttrNumber)
 +              return -1;
 +
 +      /*
 +       * Look up the primary key
 +       */
 +      indexoidlist = RelationGetIndexList(rel);
 +
 +      foreach(indexoidscan, indexoidlist)
 +      {
 +              Oid                     indexoid = lfirst_oid(indexoidscan);
 +              HeapTuple       indexTuple;
 +              Form_pg_index indexForm;
 +
 +              indexTuple = SearchSysCache1(INDEXRELID,
 +                                                               ObjectIdGetDatum(indexoid));
 +              if (!HeapTupleIsValid(indexTuple)) /* should not happen */
 +                      elog(ERROR, "cache lookup failed for index %u", indexoid);
 +              indexForm = ((Form_pg_index) GETSTRUCT(indexTuple));
 +              if (indexForm->indisprimary)
 +              {
 +                      int i;
 +
 +                      pk_found = true;
 +
 +                      /*
 +                       * Loop over index attributes to find
 +                       * the distribution key
 +                       */
 +                      for (i = 0; i < indexForm->indnatts; i++)
 +                      {
 +                              if (indexForm->indkey.values[i] == partAttNum)
 +                              {
 +                                      result = i;
 +                                      break;
 +                              }
 +                      }
 +              }
 +              ReleaseSysCache(indexTuple);
 +              if (pk_found)
 +                      break;
 +      }
 +
 +      list_free(indexoidlist);
 +
 +      return result;
 +}
 +
 +/*
 + * check to see if the constraint can be enforced locally
 + * if not, an error will be thrown
 + */
 +static void
 +checkLocalFKConstraints(CreateStmtContext *cxt)
 +{
 +      ListCell   *fkclist;
 +      List       *nodelist = NIL;
 +
 +      if (cxt->subcluster)
 +              nodelist = transformSubclusterNodes(cxt->subcluster);
 +
 +      foreach(fkclist, cxt->fkconstraints)
 +      {
 +              Constraint *constraint;
 +              Oid pk_rel_id;
 +              RelationLocInfo *rel_loc_info;
 +              constraint = (Constraint *) lfirst(fkclist);
 +
 +              /*
 +               * If constraint references to the table itself, it is safe
 +               * Check if relation name is the same
 +               * XCTODO: NO! It is only safe if table is replicated
 +               * or distributed on primary key
 +               */
 +              if (constraint->pktable &&
 +                      strcmp(constraint->pktable->relname,cxt->relation->relname) == 0)
 +              {
 +                      /* Is namespace also the same ? */
 +                      char *fkcon_schemaname = NULL;
 +
 +                      if (!cxt->relation->schemaname &&
 +                              !constraint->pktable->schemaname)
 +                              continue;
 +
 +                      if (!constraint->pktable->schemaname)
 +                      {
 +                              /* Schema name is not defined, look for current one */
 +                              List   *search_path = fetch_search_path(false);
 +                              fkcon_schemaname = get_namespace_name(linitial_oid(search_path));
 +                              list_free(search_path);
 +                      }
 +                      else
 +                              fkcon_schemaname = constraint->pktable->schemaname;
 +
 +                      /*
 +                       * If schema name and relation name are the same, table
 +                       * references to itself, so constraint is safe
 +                       */
 +                      if (fkcon_schemaname &&
 +                              strcmp(fkcon_schemaname,
 +                                         cxt->relation->schemaname) == 0)
 +                      {
 +                              /* check if bad distribution is already defined */
 +                              if ((cxt->distributeby && cxt->distributeby->disttype != DISTTYPE_REPLICATION) ||
 +                                              (cxt->isalter && cxt->rel->rd_locator_info != NULL && !IsLocatorReplicated(cxt->rel->rd_locator_info->locatorType)))
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                       errmsg("only replicated table can reference itself")));
 +                              /* Record that replication is required */
 +                              cxt->fallback_source = FBS_REPLICATE;
 +                              if (cxt->fallback_dist_cols)
 +                              {
 +                                      list_free_deep(cxt->fallback_dist_cols);
 +                                      cxt->fallback_dist_cols = NULL;
 +                              }
 +                              continue;
 +                      }
 +              }
 +
 +              pk_rel_id = RangeVarGetRelid(constraint->pktable, NoLock, false);
 +              rel_loc_info = GetRelationLocInfo(pk_rel_id);
 +              /* If referenced table is replicated, the constraint is safe */
 +              if (rel_loc_info == NULL || IsLocatorReplicated(rel_loc_info->locatorType))
 +              {
 +                      List *common;
 +
 +                      if (cxt->subcluster)
 +                      {
 +                              /*
 +                               * Distribution nodes are defined, they must be a subset of
 +                               * the referenced relation's nodes
 +                               */
 +                              common = list_intersection_int(nodelist, rel_loc_info->rl_nodeList);
 +                              if (list_length(common) < list_length(nodelist))
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                       errmsg("referenced table is not defined on all target nodes")));
 +                              list_free(common);
 +                      }
 +                      else
 +                      {
 +                              /* suggest distribution */
 +                              if (nodelist)
 +                              {
 +                                      common = list_intersection_int(nodelist, rel_loc_info->rl_nodeList);
 +                                      if (list_length(common) == 0)
 +                                              ereport(ERROR,
 +                                                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                               errmsg("referenced tables is defined on different nodes")));
 +                                      list_free(nodelist);
 +                                      nodelist = common;
 +                              }
 +                              else
 +                                      nodelist = rel_loc_info? list_copy(rel_loc_info->rl_nodeList):NIL;
 +                      }
 +              }
 +              else if (rel_loc_info->locatorType == LOCATOR_TYPE_RROBIN)
 +              {
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_SYNTAX_ERROR),
 +                                       errmsg("Cannot reference a round robin table in a foreign key constraint")));
 +              }
 +              else if (IsLocatorDistributedByValue(rel_loc_info->locatorType))
 +              {
 +                      ListCell   *fklc;
 +                      ListCell   *pklc;
 +                      char            ltype;
 +                      char       *lattr;
 +                      bool            found = false;
 +                      List       *common;
 +
 +                      /*
 +                       * First check nodes, they must be the same as in
 +                       * the referenced relation
 +                       */
 +                      if (cxt->subcluster)
 +                      {
 +                              common = list_intersection_int(nodelist, rel_loc_info->rl_nodeList);
 +                              if (list_length(common) != list_length(rel_loc_info->rl_nodeList) ||
 +                                              list_length(common) != list_length(nodelist))
 +                              {
 +                                      if (list_length(common) == 0)
 +                                              ereport(ERROR,
 +                                                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                               errmsg("referenced HASH/MODULO table must be defined on same nodes")));
 +                              }
 +                              list_free(common);
 +                      }
 +                      else
 +                      {
 +                              if (nodelist)
 +                              {
 +                                      common = list_intersection_int(nodelist, rel_loc_info->rl_nodeList);
 +                                      if (list_length(common) != list_length(rel_loc_info->rl_nodeList))
 +                                              ereport(ERROR,
 +                                                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                               errmsg("referenced HASH/MODULO table must be defined on same nodes")));
 +                                      list_free(nodelist);
 +                                      nodelist = common;
 +                              }
 +                              else
 +                                      nodelist = list_copy(rel_loc_info->rl_nodeList);
 +                              /* Now define the subcluster */
 +                              cxt->subcluster = makeSubCluster(nodelist);
 +                      }
 +
 +                      if (cxt->distributeby)
 +                      {
 +                              ltype = ConvertToLocatorType(cxt->distributeby->disttype);
 +                              lattr = cxt->distributeby->colname;
 +                      }
 +                      else if (cxt->isalter)
 +                      {
 +                              if (cxt->rel->rd_locator_info == NULL)
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                       errmsg("Hash/Modulo distribution column does not refer"
 +                                                                      " to hash/modulo distribution column in referenced table.")));
 +                              ltype = cxt->rel->rd_locator_info->locatorType;
 +                              lattr = cxt->rel->rd_locator_info->partAttrName;
 +                      }
 +                      else
 +                      {
 +                              /*
 +                               * Distribution is not defined yet, but we can define it now.
 +                               * The distribution must be the same as in referenced table,
 +                               * distribution keys must be matching fk/pk
 +                               */
 +                              /*
 +                               * Can not define distribution by value already
 +                               */
 +                              if (cxt->fallback_source == FBS_REPLICATE)
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                       errmsg("Hash/Modulo distribution column does not refer"
 +                                                                      " to hash/modulo distribution column in referenced table.")));
 +                              /* find the fk attribute matching the distribution column */
 +                              lattr = NULL;
 +                              if (list_length(constraint->pk_attrs) == 0)
 +                              {
 +                                      /*
 +                                       * PK attribute list may be missing, so FK must reference
 +                                       * the primary table's primary key. The primary key may
 +                                       * consist of multiple attributes, one of them is a
 +                                       * distribution key. We should find the foreign attribute
 +                                       * referencing that primary attribute and set it as the
 +                                       * distribution key of the table.
 +                                       */
 +                                      int             pk_attr_idx;
 +                                      Relation        rel;
 +
 +                                      rel = relation_open(pk_rel_id, AccessShareLock);
 +                                      pk_attr_idx = find_relation_pk_dist_index(rel);
 +                                      relation_close(rel, AccessShareLock);
 +
 +                                      if (pk_attr_idx >= 0 &&
 +                                                      pk_attr_idx < list_length(constraint->fk_attrs))
 +                                      {
 +                                              lattr = strVal(list_nth(constraint->fk_attrs, pk_attr_idx));
 +                                      }
 +                              }
 +                              else
 +                              {
 +                                      /*
 +                                       * One of the primary attributes must be the primary
 +                                       * table's distribution key. We should find the foreign
 +                                       * attribute referencing that primary attribute and set it
 +                                       * as the distribution key of the table.
 +                                       */
 +                                      forboth(fklc, constraint->fk_attrs,
 +                                                      pklc, constraint->pk_attrs)
 +                                      {
 +                                              if (strcmp(rel_loc_info->partAttrName,
 +                                                                 strVal(lfirst(pklc))) == 0)
 +                                              {
 +                                                      lattr = strVal(lfirst(fklc));
 +                                                      break;
 +                                              }
 +                                      }
 +                              }
 +                              /* distribution column is not referenced? */
 +                              if (lattr == NULL)
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                       errmsg("Hash/Modulo distribution column does not refer"
 +                                                                      " to hash/modulo distribution column in referenced table.")));
 +                              foreach(fklc, cxt->fallback_dist_cols)
 +                              {
 +                                      if (strcmp(lattr, (char *) lfirst(fklc)) == 0)
 +                                      {
 +                                              found = true;
 +                                              break;
 +                                      }
 +                              }
 +                              if (found)
 +                              {
 +                                      list_free_deep(cxt->fallback_dist_cols);
 +                                      cxt->fallback_dist_cols = NIL;
 +                                      cxt->fallback_source = FBS_NONE;
 +                                      cxt->distributeby = makeNode(DistributeBy);
 +                                      switch (rel_loc_info->locatorType)
 +                                      {
 +                                              case LOCATOR_TYPE_HASH:
 +                                                      cxt->distributeby->disttype = DISTTYPE_HASH;
 +                                                      cxt->distributeby->colname = pstrdup(lattr);
 +                                                      break;
 +                                              case LOCATOR_TYPE_MODULO:
 +                                                      cxt->distributeby->disttype = DISTTYPE_MODULO;
 +                                                      cxt->distributeby->colname = pstrdup(lattr);
 +                                                      break;
 +                                              default:
 +                                                      /* cannot happen? */
 +                                                      ereport(ERROR,
 +                                                                      (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                                       errmsg("Hash/Modulo distribution column does not refer"
 +                                                                                      " to hash/modulo distribution column in referenced table.")));
 +                                      }
 +                              }
 +                              else /* dist attr is not found */
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                       errmsg("Hash/Modulo distribution column does not refer"
 +                                                                      " to hash/modulo distribution column in referenced table.")));
 +                              continue;
 +                      }
 +                      /*
 +                       * Here determine if already defined distribution is matching
 +                       * to distribution of primary table.
 +                       */
 +                      if (ltype != rel_loc_info->locatorType || lattr == NULL)
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                                               errmsg("Hash/Modulo distribution column does not refer"
 +                                                              " to hash/modulo distribution column in referenced table.")));
 +                      if (list_length(constraint->pk_attrs) == 0)
 +                      {
 +                              /*
 +                               * PK attribute list may be missing, so FK must reference
 +                               * the primary table's primary key. The primary key may
 +                               * consist of multiple attributes, one of them is a
 +                               * distribution key. We should find the foreign attribute
 +                               * referencing that primary attribute and make sure it is a
 +                               * distribution key of the table.
 +                               */
 +                              int             pk_attr_idx;
 +                              Relation        rel;
 +
 +                              rel = relation_open(pk_rel_id, AccessShareLock);
 +                              pk_attr_idx = find_relation_pk_dist_index(rel);
 +                              relation_close(rel, AccessShareLock);
 +
 +                              /*
 +                               * The first two conditions just avoid an assertion failure in
 +                               * list_nth. First should never happen, because the primary key
 +                               * of hash/modulo distributed table must contain distribution
 +                               * key. Second may only happen if list of foreign columns is
 +                               * shorter than the primary key. In that case the statement would
 +                               * probably fail later, but no harm if it fails here.
 +                               */
 +                              if (pk_attr_idx >= 0 &&
 +                                              pk_attr_idx < list_length(constraint->fk_attrs) &&
 +                                              strcmp(lattr, strVal(list_nth(constraint->fk_attrs,
 +                                                                                                        pk_attr_idx))) == 0)
 +                              {
 +                                      found = true;
 +                              }
 +                      }
 +                      else
 +                      {
 +                              forboth(fklc, constraint->fk_attrs, pklc, constraint->pk_attrs)
 +                              {
 +                                      if (strcmp(lattr, strVal(lfirst(fklc))) == 0)
 +                                      {
 +                                              found = true;
 +                                              if (strcmp(rel_loc_info->partAttrName,
 +                                                                 strVal(lfirst(pklc))) == 0)
 +                                                      break;
 +                                              else
 +                                                      ereport(ERROR,
 +                                                                      (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                                       errmsg("Hash/Modulo distribution column does not refer"
 +                                                                                      " to hash/modulo distribution column in referenced table.")));
 +                                      }
 +                              }
 +                      }
 +                      if (!found)
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                                               errmsg("Hash/Modulo distribution column does not refer"
 +                                                              " to hash/modulo distribution column in referenced table.")));
 +              }
 +              else /* Unsupported distribution */
 +              {
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_SYNTAX_ERROR),
 +                                       errmsg("Cannot reference a table with distribution type \"%c\"",
 +                                       rel_loc_info->locatorType)));
 +              }
 +      }
 +      /*
 +       * If presence of a foreign constraint suggested a set of nodes, fix it here
 +       */
 +      if (nodelist && cxt->subcluster == NULL)
 +              cxt->subcluster = makeSubCluster(nodelist);
 +}
 +#endif
 +
 +
 +#ifdef XCP
 +/*
 + * Convert SubCluster definition to a list of Datanode indexes, to compare to
 + * relation nodes
 + */
 +static List *
 +transformSubclusterNodes(PGXCSubCluster *subcluster)
 +{
 +      List   *result = NIL;
 +      Oid        *nodeoids;
 +      int             numnodes;
 +      int     i;
 +      char    nodetype = PGXC_NODE_DATANODE;
 +
 +      nodeoids = GetRelationDistributionNodes(subcluster, &numnodes);
 +      for (i = 0; i < numnodes; i++)
 +              result = lappend_int(result, PGXCNodeGetNodeId(nodeoids[i], &nodetype));
 +
 +      return result;
 +}
 +
 +
 +/*
 + * Create a SubCluster definition from a list of node indexes.
 + */
 +static PGXCSubCluster *
 +makeSubCluster(List *nodelist)
 +{
 +      PGXCSubCluster *result;
 +      ListCell           *lc;
 +      result = makeNode(PGXCSubCluster);
 +      result->clustertype = SUBCLUSTER_NODE;
 +      foreach (lc, nodelist)
 +      {
 +              int     nodeidx = lfirst_int(lc);
 +              char   *nodename = get_pgxc_nodename(
 +                                                      PGXCNodeGetNodeOid(nodeidx, PGXC_NODE_DATANODE));
 +              result->members = lappend(result->members, makeString(nodename));
 +      }
 +      return result;
 +}
 +#endif
++
+ /*
+  * transformPartitionCmd
+  *            Analyze the ATTACH/DETACH PARTITION command
+  *
+  * In case of the ATTACH PARTITION command, cxt->partbound is set to the
+  * transformed value of cmd->bound.
+  */
+ static void
+ transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd)
+ {
+       Relation        parentRel = cxt->rel;
+       /* the table must be partitioned */
+       if (parentRel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                                errmsg("\"%s\" is not partitioned",
+                                               RelationGetRelationName(parentRel))));
+       /* transform the partition bound, if any */
+       Assert(RelationGetPartitionKey(parentRel) != NULL);
+       if (cmd->bound != NULL)
+               cxt->partbound = transformPartitionBound(cxt->pstate, parentRel,
+                                                                                                cmd->bound);
+ }
+ /*
+  * transformPartitionBound
+  *
+  * Transform a partition bound specification
+  */
+ PartitionBoundSpec *
+ transformPartitionBound(ParseState *pstate, Relation parent,
+                                               PartitionBoundSpec *spec)
+ {
+       PartitionBoundSpec *result_spec;
+       PartitionKey key = RelationGetPartitionKey(parent);
+       char            strategy = get_partition_strategy(key);
+       int                     partnatts = get_partition_natts(key);
+       List       *partexprs = get_partition_exprs(key);
+       /* Avoid scribbling on input */
+       result_spec = copyObject(spec);
+       if (strategy == PARTITION_STRATEGY_LIST)
+       {
+               ListCell   *cell;
+               char       *colname;
+               Oid                     coltype;
+               int32           coltypmod;
+               if (spec->strategy != PARTITION_STRATEGY_LIST)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+                                 errmsg("invalid bound specification for a list partition"),
+                                  parser_errposition(pstate, exprLocation((Node *) spec))));
+               /* Get the only column's name in case we need to output an error */
+               if (key->partattrs[0] != 0)
+                       colname = get_relid_attribute_name(RelationGetRelid(parent),
+                                                                                          key->partattrs[0]);
+               else
+                       colname = deparse_expression((Node *) linitial(partexprs),
+                                                deparse_context_for(RelationGetRelationName(parent),
+                                                                                        RelationGetRelid(parent)),
+                                                                                false, false);
+               /* Need its type data too */
+               coltype = get_partition_col_typid(key, 0);
+               coltypmod = get_partition_col_typmod(key, 0);
+               result_spec->listdatums = NIL;
+               foreach(cell, spec->listdatums)
+               {
+                       A_Const    *con = castNode(A_Const, lfirst(cell));
+                       Const      *value;
+                       ListCell   *cell2;
+                       bool            duplicate;
+                       value = transformPartitionBoundValue(pstate, con,
+                                                                                                colname, coltype, coltypmod);
+                       /* Don't add to the result if the value is a duplicate */
+                       duplicate = false;
+                       foreach(cell2, result_spec->listdatums)
+                       {
+                               Const      *value2 = castNode(Const, lfirst(cell2));
+                               if (equal(value, value2))
+                               {
+                                       duplicate = true;
+                                       break;
+                               }
+                       }
+                       if (duplicate)
+                               continue;
+                       result_spec->listdatums = lappend(result_spec->listdatums,
+                                                                                         value);
+               }
+       }
+       else if (strategy == PARTITION_STRATEGY_RANGE)
+       {
+               ListCell   *cell1,
+                                  *cell2;
+               int                     i,
+                                       j;
+               bool            seen_unbounded;
+               if (spec->strategy != PARTITION_STRATEGY_RANGE)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+                                errmsg("invalid bound specification for a range partition"),
+                                  parser_errposition(pstate, exprLocation((Node *) spec))));
+               if (list_length(spec->lowerdatums) != partnatts)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+                                        errmsg("FROM must specify exactly one value per partitioning column")));
+               if (list_length(spec->upperdatums) != partnatts)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+                                        errmsg("TO must specify exactly one value per partitioning column")));
+               /*
+                * Check that no finite value follows an UNBOUNDED item in either of
+                * lower and upper bound lists.
+                */
+               seen_unbounded = false;
+               foreach(cell1, spec->lowerdatums)
+               {
+                       PartitionRangeDatum *ldatum = castNode(PartitionRangeDatum,
+                                                                                                  lfirst(cell1));
+                       if (ldatum->infinite)
+                               seen_unbounded = true;
+                       else if (seen_unbounded)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_DATATYPE_MISMATCH),
+                                          errmsg("cannot specify finite value after UNBOUNDED"),
+                                parser_errposition(pstate, exprLocation((Node *) ldatum))));
+               }
+               seen_unbounded = false;
+               foreach(cell1, spec->upperdatums)
+               {
+                       PartitionRangeDatum *rdatum = castNode(PartitionRangeDatum,
+                                                                                                  lfirst(cell1));
+                       if (rdatum->infinite)
+                               seen_unbounded = true;
+                       else if (seen_unbounded)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_DATATYPE_MISMATCH),
+                                          errmsg("cannot specify finite value after UNBOUNDED"),
+                                parser_errposition(pstate, exprLocation((Node *) rdatum))));
+               }
+               /* Transform all the constants */
+               i = j = 0;
+               result_spec->lowerdatums = result_spec->upperdatums = NIL;
+               forboth(cell1, spec->lowerdatums, cell2, spec->upperdatums)
+               {
+                       PartitionRangeDatum *ldatum = (PartitionRangeDatum *) lfirst(cell1);
+                       PartitionRangeDatum *rdatum = (PartitionRangeDatum *) lfirst(cell2);
+                       char       *colname;
+                       Oid                     coltype;
+                       int32           coltypmod;
+                       A_Const    *con;
+                       Const      *value;
+                       /* Get the column's name in case we need to output an error */
+                       if (key->partattrs[i] != 0)
+                               colname = get_relid_attribute_name(RelationGetRelid(parent),
+                                                                                                  key->partattrs[i]);
+                       else
+                       {
+                               colname = deparse_expression((Node *) list_nth(partexprs, j),
+                                                deparse_context_for(RelationGetRelationName(parent),
+                                                                                        RelationGetRelid(parent)),
+                                                                                        false, false);
+                               ++j;
+                       }
+                       /* Need its type data too */
+                       coltype = get_partition_col_typid(key, i);
+                       coltypmod = get_partition_col_typmod(key, i);
+                       if (ldatum->value)
+                       {
+                               con = castNode(A_Const, ldatum->value);
+                               value = transformPartitionBoundValue(pstate, con,
+                                                                                                        colname,
+                                                                                                        coltype, coltypmod);
+                               if (value->constisnull)
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                                                        errmsg("cannot specify NULL in range bound")));
+                               ldatum = copyObject(ldatum);    /* don't scribble on input */
+                               ldatum->value = (Node *) value;
+                       }
+                       if (rdatum->value)
+                       {
+                               con = castNode(A_Const, rdatum->value);
+                               value = transformPartitionBoundValue(pstate, con,
+                                                                                                        colname,
+                                                                                                        coltype, coltypmod);
+                               if (value->constisnull)
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                                                        errmsg("cannot specify NULL in range bound")));
+                               rdatum = copyObject(rdatum);    /* don't scribble on input */
+                               rdatum->value = (Node *) value;
+                       }
+                       result_spec->lowerdatums = lappend(result_spec->lowerdatums,
+                                                                                          ldatum);
+                       result_spec->upperdatums = lappend(result_spec->upperdatums,
+                                                                                          rdatum);
+                       ++i;
+               }
+       }
+       else
+               elog(ERROR, "unexpected partition strategy: %d", (int) strategy);
+       return result_spec;
+ }
+ /*
+  * Transform one constant in a partition bound spec
+  */
+ static Const *
+ transformPartitionBoundValue(ParseState *pstate, A_Const *con,
+                                                  const char *colName, Oid colType, int32 colTypmod)
+ {
+       Node       *value;
+       /* Make it into a Const */
+       value = (Node *) make_const(pstate, &con->val, con->location);
+       /* Coerce to correct type */
+       value = coerce_to_target_type(pstate,
+                                                                 value, exprType(value),
+                                                                 colType,
+                                                                 colTypmod,
+                                                                 COERCION_ASSIGNMENT,
+                                                                 COERCE_IMPLICIT_CAST,
+                                                                 -1);
+       if (value == NULL)
+               ereport(ERROR,
+                               (errcode(ERRCODE_DATATYPE_MISMATCH),
+               errmsg("specified value cannot be cast to type %s for column \"%s\"",
+                          format_type_be(colType), colName),
+                                parser_errposition(pstate, con->location)));
+       /* Simplify the expression, in case we had a coercion */
+       if (!IsA(value, Const))
+               value = (Node *) expression_planner((Expr *) value);
+       /* Fail if we don't have a constant (i.e., non-immutable coercion) */
+       if (!IsA(value, Const))
+               ereport(ERROR,
+                               (errcode(ERRCODE_DATATYPE_MISMATCH),
+               errmsg("specified value cannot be cast to type %s for column \"%s\"",
+                          format_type_be(colType), colName),
+                                errdetail("The cast requires a non-immutable conversion."),
+                                errhint("Try putting the literal value in single quotes."),
+                                parser_errposition(pstate, con->location)));
+       return (Const *) value;
+ }
index 2cc9b54dd5612f237be6bb379fe5a4695d4b756a,245b4cda3b9b3a4ebe585cad639f5c1fe3b62d02..522d7ec2035bdebd86a4768ff19530780b679c2c
   * raw_parser
   *            Given a query in string form, do lexical and grammatical analysis.
   *
-  * Returns a list of raw (un-analyzed) parse trees.
+  * Returns a list of raw (un-analyzed) parse trees.  The immediate elements
+  * of the list are always RawStmt nodes.
   */
  List *
 -raw_parser(const char *str)
 +raw_parser(const char *str, List **queries)
  {
        core_yyscan_t yyscanner;
        base_yy_extra_type yyextra;
Simple merge
index 164dafa0e8d00677a4663fdf6933a2675b565d06,0000000000000000000000000000000000000000..65769e94c3af5245d0502953e882c07efb5e85e5
mode 100644,000000..100644
--- /dev/null
@@@ -1,480 -1,0 +1,481 @@@
 +/*-------------------------------------------------------------------------
 + *
 + * pause.c
 + *
 + *     Cluster Pause/Unpause handling
 + *
 + * IDENTIFICATION
 + *      $$
 + *
 + *-------------------------------------------------------------------------
 + */
 +
 +#ifdef XCP
 +#include "postgres.h"
 +#include "pgxc/execRemote.h"
 +#include "pgxc/pause.h"
 +#include "pgxc/pgxc.h"
++#include "storage/shmem.h"
 +#include "storage/spin.h"
 +#include "miscadmin.h"
 +
 +/* globals */
 +bool cluster_lock_held;
 +bool cluster_ex_lock_held;
 +
 +static void HandleClusterPause(bool pause, bool initiator);
 +static void ProcessClusterPauseRequest(bool pause);
 +
 +ClusterLockInfo *ClustLinfo = NULL;
 +
 +/*
 + * ProcessClusterPauseRequest:
 + *
 + * Carry out PAUSE/UNPAUSE request on a coordinator node
 + */
 +static void
 +ProcessClusterPauseRequest(bool pause)
 +{
 +      char *action = pause? "PAUSE":"UNPAUSE";
 +
 +      if (!IS_PGXC_COORDINATOR || !IsConnFromCoord())
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                               errmsg("The %s CLUSTER message is expected to "
 +                                              "arrive at a coordinator from another coordinator",
 +                                              action)));
 +
 +      elog(DEBUG2, "Received %s CLUSTER from a coordinator", action);
 +
 +      /*
 +       * If calling UNPAUSE, ensure that the cluster lock has already been held
 +       * in exclusive mode
 +       */
 +      if (!pause && !cluster_ex_lock_held)
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Received an UNPAUSE request when cluster not PAUSED!")));
 +
 +      /*
 +       * Enable/Disable local queries. We need to release the lock first
 +       *
 +       * TODO: Think of some timeout mechanism here, if the locking takes too
 +       * much time...
 +       */
 +      ReleaseClusterLock(pause? false:true);
 +      AcquireClusterLock(pause? true:false);
 +
 +      if (pause)
 +              cluster_ex_lock_held = true;
 +      else
 +              cluster_ex_lock_held = false;
 +
 +      elog(DEBUG2, "%s queries at the coordinator", pause? "Paused":"Resumed");
 +
 +      return;
 +}
 +
 +/*
 + * HandleClusterPause:
 + *
 + * Perform the PAUSE/UNPAUSE action locally and, when acting as the
 + * initiator, drive the same action on all other coordinators and wait for
 + * their responses.  If a remote PAUSE fails, a best-effort UNPAUSE of the
 + * reachable coordinators is attempted in the error path.
 + *
 + * Any errors will be reported via ereport.
 + */
 +static void
 +HandleClusterPause(bool pause, bool initiator)
 +{
 +	PGXCNodeAllHandles *coord_handles;
 +	int conn;
 +	int response;
 +	char *action = pause? "PAUSE":"UNPAUSE";
 +
 +	elog(DEBUG2, "Preparing coordinators for %s CLUSTER", action);
 +
 +	if (pause && cluster_ex_lock_held)
 +	{
 +		ereport(NOTICE, (errmsg("CLUSTER already PAUSED")));
 +
 +		/* Nothing to do */
 +		return;
 +	}
 +
 +	if (!pause && !cluster_ex_lock_held)
 +	{
 +		ereport(NOTICE, (errmsg("Issue PAUSE CLUSTER before calling UNPAUSE")));
 +
 +		/* Nothing to do */
 +		return;
 +	}
 +
 +	/*
 +	 * If we are one of the participating coordinators, just do the action
 +	 * locally and return
 +	 */
 +	if (!initiator)
 +	{
 +		ProcessClusterPauseRequest(pause);
 +		return;
 +	}
 +
 +	/*
 +	 * Send a PAUSE/UNPAUSE CLUSTER message to all the coordinators. We should send an
 +	 * asyncronous request, update the local ClusterLock and then wait for the remote
 +	 * coordinators to respond back
 +	 */
 +
 +	coord_handles = get_handles(NIL, GetAllCoordNodes(), true, true);
 +
 +	for (conn = 0; conn < coord_handles->co_conn_count; conn++)
 +	{
 +		PGXCNodeHandle *handle = coord_handles->coord_handles[conn];
 +
 +		if (pgxc_node_send_query(handle, pause? "PAUSE CLUSTER" : "UNPAUSE CLUSTER") != 0)
 +			ereport(ERROR,
 +					(errcode(ERRCODE_INTERNAL_ERROR),
 +					 errmsg("Failed to send %s CLUSTER request to some coordinator nodes",action)));
 +	}
 +
 +	/*
 +	 * Disable/Enable local queries. We need to release the SHARED mode first
 +	 *
 +	 * TODO: Start a timer to cancel the request in case of a timeout
 +	 */
 +	ReleaseClusterLock(pause? false:true);
 +	AcquireClusterLock(pause? true:false);
 +
 +	/* Record whether we now hold the exclusive (PAUSE) cluster lock */
 +	if (pause)
 +		cluster_ex_lock_held = true;
 +	else
 +		cluster_ex_lock_held = false;
 +
 +
 +	elog(DEBUG2, "%s queries at the driving coordinator", pause? "Paused":"Resumed");
 +
 +	/*
 +	 * Local queries are paused/enabled. Check status of the remote coordinators
 +	 * now. We need a TRY/CATCH block here, so that if one of the coordinator
 +	 * fails for some reason, we can try best-effort to salvage the situation
 +	 * at others
 +	 *
 +	 * We hope that errors in the earlier loop generally do not occur (out of
 +	 * memory and improper handles..) or we can have a similar TRY/CATCH block
 +	 * there too
 +	 *
 +	 * To repeat: All the salvaging is best effort really...
 +	 */
 +	PG_TRY();
 +	{
 +		ResponseCombiner combiner;
 +
 +		InitResponseCombiner(&combiner, coord_handles->co_conn_count, COMBINE_TYPE_NONE);
 +		for (conn = 0; conn < coord_handles->co_conn_count; conn++)
 +		{
 +			PGXCNodeHandle *handle;
 +
 +			handle = coord_handles->coord_handles[conn];
 +
 +			/* Consume responses until this connection reports completion */
 +			while (true)
 +			{
 +				if (pgxc_node_receive(1, &handle, NULL))
 +					ereport(ERROR,
 +							(errcode(ERRCODE_INTERNAL_ERROR),
 +							 errmsg("Failed to receive a response from the remote coordinator node")));
 +
 +				response = handle_response(handle, &combiner);
 +				if (response == RESPONSE_EOF)
 +					continue;
 +				else if (response == RESPONSE_COMPLETE)
 +					break;
 +				else
 +					ereport(ERROR,
 +							(errcode(ERRCODE_INTERNAL_ERROR),
 +							 errmsg("%s CLUSTER command failed "
 +									"with error %s", action, handle->error)));
 +			}
 +		}
 +
 +		/* Re-raise any error collected by the combiner, keeping its SQLSTATE */
 +		if (combiner.errorMessage)
 +		{
 +			char *code = combiner.errorCode;
 +			if (combiner.errorDetail != NULL)
 +				ereport(ERROR,
 +						(errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
 +						 errmsg("%s", combiner.errorMessage), errdetail("%s", combiner.errorDetail) ));
 +			else
 +				ereport(ERROR,
 +						(errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
 +						 errmsg("%s", combiner.errorMessage)));
 +		}
 +
 +		CloseCombiner(&combiner);
 +	}
 +	PG_CATCH();
 +	{
 +		/*
 +		 * If PAUSE CLUSTER, issue UNPAUSE on the reachable nodes. For failure
 +		 * in cases of UNPAUSE, might need manual intervention at the offending
 +		 * coordinator node (maybe do a pg_cancel_backend() on the backend
 +		 * that's holding the exclusive lock or something..)
 +		 */
 +		if (!pause)
 +			ereport(WARNING,
 +				 (errmsg("UNPAUSE CLUSTER command failed on one or more coordinator nodes."
 +						" Manual intervention may be required!")));
 +		else
 +			ereport(WARNING,
 +				 (errmsg("PAUSE CLUSTER command failed on one or more coordinator nodes."
 +						" Trying to UNPAUSE reachable nodes now")));
 +
 +		/* note: the loop condition skips this entirely for UNPAUSE failures */
 +		for (conn = 0; conn < coord_handles->co_conn_count && pause; conn++)
 +		{
 +			PGXCNodeHandle *handle = coord_handles->coord_handles[conn];
 +
 +			(void) pgxc_node_send_query(handle, "UNPAUSE CLUSTER");
 +
 +			/*
 +			 * The incoming data should hopefully be discarded as part of
 +			 * cleanup..
 +			 */
 +		}
 +
 +		/* cleanup locally.. */
 +		ReleaseClusterLock(pause? true:false);
 +		AcquireClusterLock(pause? false:true);
 +		cluster_ex_lock_held = false;
 +		PG_RE_THROW();
 +	}
 +	PG_END_TRY();
 +
 +	elog(DEBUG2, "Successfully completed %s CLUSTER command on "
 +				 "all coordinator nodes", action);
 +
 +	return;
 +}
 +
 +void
 +RequestClusterPause(bool pause, char *completionTag)
 +{
 +      char    *action = pause? "PAUSE":"UNPAUSE";
 +      bool     initiator = true;
 +
 +      elog(DEBUG2, "%s CLUSTER request received", action);
 +
 +      /* Only a superuser can perform this activity on a cluster */
 +      if (!superuser())
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 +                               errmsg("%s CLUSTER command: must be a superuser", action)));
 +
 +      /* Ensure that we are a coordinator */
 +      if (!IS_PGXC_COORDINATOR)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                               errmsg("%s CLUSTER command must be sent to a coordinator", action)));
 +
 +      /*
 +       * Did the command come directly to this coordinator or via another
 +       * coordinator?
 +       */
 +      if (IsConnFromCoord())
 +              initiator = false;
 +
 +      HandleClusterPause(pause, initiator);
 +
 +      if (completionTag)
 +              snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "%s CLUSTER", action);
 +}
 +
 +/*
 + * If the backend is shutting down, cleanup the PAUSE cluster lock
 + * appropriately. We do this before shutting down shmem, because this needs
 + * LWLock and stuff
 + */
 +void
 +PGXCCleanClusterLock(int code, Datum arg)
 +{
 +      PGXCNodeAllHandles *coord_handles;
 +      int conn;
 +
 +      if (cluster_lock_held && !cluster_ex_lock_held)
 +      {
 +              ReleaseClusterLock (false);
 +              cluster_lock_held = false;
 +      }
 +
 +      /* Do nothing if cluster lock not held */
 +      if (!cluster_ex_lock_held)
 +              return;
 +
 +      /* Do nothing if we are not the initiator */
 +      if (IsConnFromCoord())
 +              return;
 +
 +      coord_handles = get_handles(NIL, GetAllCoordNodes(), true, true);
 +      /* Try best-effort to UNPAUSE other coordinators now */
 +      for (conn = 0; conn < coord_handles->co_conn_count; conn++)
 +      {
 +              PGXCNodeHandle *handle = coord_handles->coord_handles[conn];
 +
 +              /* No error checking here... */
 +              (void)pgxc_node_send_query(handle, "UNPAUSE CLUSTER");
 +      }
 +
 +      /* Release locally too. We do not want a dangling value in cl_holder_pid! */
 +      ReleaseClusterLock(true);
 +      cluster_ex_lock_held = false;
 +}
 +
 +/* Report shared memory space needed by ClusterLockShmemInit */
 +Size
 +ClusterLockShmemSize(void)
 +{
 +      Size            size = 0;
 +
 +      size = add_size(size, sizeof(ClusterLockInfo));
 +
 +      return size;
 +}
 +
 +/* Allocate and initialize cluster locking related shared memory */
 +void
 +ClusterLockShmemInit(void)
 +{
 +      bool            found;
 +
 +      ClustLinfo = (ClusterLockInfo *)
 +              ShmemInitStruct("Cluster Lock Info", ClusterLockShmemSize(), &found);
 +
 +      if (!found)
 +      {
 +              /* First time through, so initialize */
 +              MemSet(ClustLinfo, 0, ClusterLockShmemSize());
 +              SpinLockInit(&ClustLinfo->cl_mutex);
 +      }
 +}
 +
 +/*
 + * AcquireClusterLock
 + *
 + *  Based on the argument passed in, try to update the shared memory
 + *  appropriately. In case the conditions cannot be satisfied immediately this
 + *  function resorts to a simple sleep. We don't envision PAUSE CLUSTER to
 + *  occur that frequently so most of the calls will come out immediately here
 + *  without any sleeps at all
 + *
 + *  We could have used a semaphore to allow the processes to sleep while the
 + *  cluster lock is held. But again we are really not worried about performance
 + *  and immediate wakeups around PAUSE CLUSTER functionality. Using the sleep
 + *  in an infinite loop keeps things simple yet correct
 + *
 + *  exclusive: true for the PAUSE (writer) lock, false for the shared
 + *  (per-backend) reference counted in cl_process_count.
 + */
 +void
 +AcquireClusterLock(bool exclusive)
 +{
 +	volatile ClusterLockInfo *clinfo = ClustLinfo;
 +
 +	/* Re-entrant request for an exclusive lock we already hold: no-op */
 +	if (exclusive && cluster_ex_lock_held)
 +	{
 +		return;
 +	}
 +
 +	/*
 +	 * In the normal case, none of the backends will ask for exclusive lock, so
 +	 * they will just update the cl_process_count value and exit immediately
 +	 * from the below loop
 +	 */
 +	for (;;)
 +	{
 +		bool wait = false;
 +
 +		SpinLockAcquire(&clinfo->cl_mutex);
 +
 +		if (!exclusive)
 +		{
 +			/* Shared request: allowed unless someone holds the PAUSE lock */
 +			if (clinfo->cl_holder_pid == 0)
 +				clinfo->cl_process_count++;
 +			else
 +				wait = true;
 +		}
 +		else /* PAUSE CLUSTER handling */
 +		{
 +			if (clinfo->cl_holder_pid != 0)
 +			{
 +				/* release the spinlock before ereport longjmps out */
 +				SpinLockRelease(&clinfo->cl_mutex);
 +				ereport(ERROR,
 +						(errcode(ERRCODE_INTERNAL_ERROR),
 +						 errmsg("PAUSE CLUSTER already in progress")));
 +			}
 +
 +			/*
 +			 * There should be no other process
 +			 * holding the lock including ourself
 +			 */
 +			if (clinfo->cl_process_count  > 0)
 +				wait = true;
 +			else
 +				clinfo->cl_holder_pid = MyProcPid;
 +		}
 +		SpinLockRelease(&clinfo->cl_mutex);
 +
 +		/*
 +		 * We use a simple sleep mechanism. If PAUSE CLUSTER has been invoked,
 +		 * we are not worried about immediate performance characteristics..
 +		 */
 +		if (wait)
 +		{
 +			CHECK_FOR_INTERRUPTS();
 +			pg_usleep(100000L);	/* 100ms between retries */
 +		}
 +		else /* Got the proper semantic read/write lock.. */
 +			break;
 +	}
 +}
 +
 +/*
 + * ReleaseClusterLock
 + *
 + *		Update the shared memory appropriately across the release call. We
 + *		really do not need the bool argument, but it's there for some
 + *		additional sanity checking
 + *
 + *		exclusive must match the mode passed to the AcquireClusterLock call
 + *		being undone; a mismatch with the shared state raises an ERROR.
 + */
 +void
 +ReleaseClusterLock(bool exclusive)
 +{
 +	volatile ClusterLockInfo *clinfo = ClustLinfo;
 +
 +	SpinLockAcquire(&clinfo->cl_mutex);
 +	if (exclusive)
 +	{
 +		/* Sanity check: we must be the sole holder of the PAUSE lock */
 +		if (clinfo->cl_process_count > 1 ||
 +				clinfo->cl_holder_pid == 0)
 +		{
 +			/* release the spinlock before ereport longjmps out */
 +			SpinLockRelease(&clinfo->cl_mutex);
 +			ereport(ERROR,
 +				(errcode(ERRCODE_INTERNAL_ERROR),
 +				 errmsg("Inconsistent state while doing UNPAUSE CLUSTER")));
 +		}
 +
 +		/*
 +		 * Reset the holder pid. Any waiters in AcquireClusterLock will
 +		 * eventually come out of their sleep and notice this new value and
 +		 * move ahead
 +		 */
 +		clinfo->cl_holder_pid = 0;
 +	}
 +	else
 +	{
 +		/* Sanity check: shared references cannot coexist with a PAUSE holder */
 +		if (clinfo->cl_holder_pid != 0)
 +		{
 +			SpinLockRelease(&clinfo->cl_mutex);
 +			ereport(ERROR,
 +				(errcode(ERRCODE_INTERNAL_ERROR),
 +				 errmsg("Inconsistent state while releasing CLUSTER lock")));
 +		}
 +		/*
 +		 * Decrement our count. If a PAUSE is waiting inside AcquireClusterLock
 +		 * elsewhere, it will wake out of sleep and do the needful
 +		 */
 +		if (clinfo->cl_process_count > 0)
 +			clinfo->cl_process_count--;
 +	}
 +	SpinLockRelease(&clinfo->cl_mutex);
 +}
 +#endif
index c45d7e7d146500cc76f1a71d000e973afdca479a,0000000000000000000000000000000000000000..1c6d98c8a24ac644cb2f41fa6cf8724eda62b877
mode 100644,000000..100644
--- /dev/null
@@@ -1,1834 -1,0 +1,1832 @@@
-               case INT2VECTOROID:
-                       return hashint2vector;
 +/*-------------------------------------------------------------------------
 + *
 + * locator.c
 + *            Functions that help manage table location information such as
 + * partitioning and replication information.
 + *
 + *
 + *
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
 + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
 + *
 + *
 + * IDENTIFICATION
 + *            $$
 + *
 + *-------------------------------------------------------------------------
 + */
 +
 +#include <stdlib.h>
 +#include <string.h>
 +#include <stdio.h>
 +#include <time.h>
 +
 +#include "postgres.h"
 +#include "access/skey.h"
 +#include "access/gtm.h"
 +#include "access/relscan.h"
 +#include "catalog/indexing.h"
 +#include "catalog/pg_type.h"
 +#include "nodes/pg_list.h"
 +#include "nodes/nodeFuncs.h"
 +#include "utils/builtins.h"
 +#include "utils/catcache.h"
 +#include "utils/fmgroids.h"
 +#include "utils/lsyscache.h"
 +#include "utils/rel.h"
 +#include "utils/relcache.h"
 +#include "utils/tqual.h"
 +#include "utils/syscache.h"
 +#include "nodes/nodes.h"
 +#include "optimizer/clauses.h"
 +#include "parser/parse_coerce.h"
 +#include "pgxc/nodemgr.h"
 +#include "pgxc/locator.h"
 +#include "pgxc/pgxc.h"
 +#include "pgxc/pgxcnode.h"
 +
 +#include "catalog/pgxc_class.h"
 +#include "catalog/pgxc_node.h"
 +#include "catalog/namespace.h"
 +#include "access/hash.h"
 +#ifdef XCP
 +#include "utils/date.h"
 +#include "utils/memutils.h"
 +
 +/*
 + * Locator details are private to this file; callers work through the
 + * locator API functions.
 + */
 +struct _Locator
 +{
 +	/*
 +	 * Determine target nodes for value.
 +	 * Resulting nodes are stored to the results array.
 +	 * Function returns number of node references written to the array.
 +	 */
 +	int			(*locatefunc) (Locator *self, Datum value, bool isnull,
 +								bool *hasprimary);
 +	Oid			dataType;		/* values of that type are passed to locateNodes function */
 +	LocatorListType listType;	/* representation of entries in nodeMap/results */
 +	bool		primary;		/* NOTE(review): presumably whether a primary node is involved -- confirm */
 +	/* locator-specific data */
 +	/* XXX: move them into union ? */
 +	int			roundRobinNode; /* for LOCATOR_TYPE_RROBIN */
 +	LocatorHashFunc hashfunc; /* for LOCATOR_TYPE_HASH */
 +	int		valuelen; /* 1, 2 or 4 for LOCATOR_TYPE_MODULO */
 +
 +	int			nodeCount; /* How many nodes are in the map */
 +	void	   *nodeMap; /* map index to node reference according to listType */
 +	void	   *results; /* array to output results */
 +};
 +#endif
 +
 +Oid           primary_data_node = InvalidOid;
 +int           num_preferred_data_nodes = 0;
 +Oid           preferred_data_node[MAX_PREFERRED_NODES];
 +
 +#ifdef XCP
 +static int modulo_value_len(Oid dataType);
 +static LocatorHashFunc hash_func_ptr(Oid dataType);
 +static int locate_static(Locator *self, Datum value, bool isnull,
 +                        bool *hasprimary);
 +static int locate_roundrobin(Locator *self, Datum value, bool isnull,
 +                        bool *hasprimary);
 +static int locate_modulo_random(Locator *self, Datum value, bool isnull,
 +                        bool *hasprimary);
 +static int locate_hash_insert(Locator *self, Datum value, bool isnull,
 +                        bool *hasprimary);
 +static int locate_hash_select(Locator *self, Datum value, bool isnull,
 +                        bool *hasprimary);
 +static int locate_modulo_insert(Locator *self, Datum value, bool isnull,
 +                        bool *hasprimary);
 +static int locate_modulo_select(Locator *self, Datum value, bool isnull,
 +                        bool *hasprimary);
 +static Expr * pgxc_find_distcol_expr(Index varno,
 +                                         AttrNumber attrNum,
 +                                         Node *quals);
 +#endif
 +
 +static const unsigned int xc_mod_m[] =
 +{
 +  0x00000000, 0x55555555, 0x33333333, 0xc71c71c7,
 +  0x0f0f0f0f, 0xc1f07c1f, 0x3f03f03f, 0xf01fc07f,
 +  0x00ff00ff, 0x07fc01ff, 0x3ff003ff, 0xffc007ff,
 +  0xff000fff, 0xfc001fff, 0xf0003fff, 0xc0007fff,
 +  0x0000ffff, 0x0001ffff, 0x0003ffff, 0x0007ffff,
 +  0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff,
 +  0x00ffffff, 0x01ffffff, 0x03ffffff, 0x07ffffff,
 +  0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff
 +};
 +
 +static const unsigned int xc_mod_q[][6] =
 +{
 +  { 0,  0,  0,  0,  0,  0}, {16,  8,  4,  2,  1,  1}, {16,  8,  4,  2,  2,  2},
 +  {15,  6,  3,  3,  3,  3}, {16,  8,  4,  4,  4,  4}, {15,  5,  5,  5,  5,  5},
 +  {12,  6,  6,  6 , 6,  6}, {14,  7,  7,  7,  7,  7}, {16,  8,  8,  8,  8,  8},
 +  { 9,  9,  9,  9,  9,  9}, {10, 10, 10, 10, 10, 10}, {11, 11, 11, 11, 11, 11},
 +  {12, 12, 12, 12, 12, 12}, {13, 13, 13, 13, 13, 13}, {14, 14, 14, 14, 14, 14},
 +  {15, 15, 15, 15, 15, 15}, {16, 16, 16, 16, 16, 16}, {17, 17, 17, 17, 17, 17},
 +  {18, 18, 18, 18, 18, 18}, {19, 19, 19, 19, 19, 19}, {20, 20, 20, 20, 20, 20},
 +  {21, 21, 21, 21, 21, 21}, {22, 22, 22, 22, 22, 22}, {23, 23, 23, 23, 23, 23},
 +  {24, 24, 24, 24, 24, 24}, {25, 25, 25, 25, 25, 25}, {26, 26, 26, 26, 26, 26},
 +  {27, 27, 27, 27, 27, 27}, {28, 28, 28, 28, 28, 28}, {29, 29, 29, 29, 29, 29},
 +  {30, 30, 30, 30, 30, 30}, {31, 31, 31, 31, 31, 31}
 +};
 +
 +static const unsigned int xc_mod_r[][6] =
 +{
 +  {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
 +  {0x0000ffff, 0x000000ff, 0x0000000f, 0x00000003, 0x00000001, 0x00000001},
 +  {0x0000ffff, 0x000000ff, 0x0000000f, 0x00000003, 0x00000003, 0x00000003},
 +  {0x00007fff, 0x0000003f, 0x00000007, 0x00000007, 0x00000007, 0x00000007},
 +  {0x0000ffff, 0x000000ff, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f},
 +  {0x00007fff, 0x0000001f, 0x0000001f, 0x0000001f, 0x0000001f, 0x0000001f},
 +  {0x00000fff, 0x0000003f, 0x0000003f, 0x0000003f, 0x0000003f, 0x0000003f},
 +  {0x00003fff, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f},
 +  {0x0000ffff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff},
 +  {0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff},
 +  {0x000003ff, 0x000003ff, 0x000003ff, 0x000003ff, 0x000003ff, 0x000003ff},
 +  {0x000007ff, 0x000007ff, 0x000007ff, 0x000007ff, 0x000007ff, 0x000007ff},
 +  {0x00000fff, 0x00000fff, 0x00000fff, 0x00000fff, 0x00000fff, 0x00000fff},
 +  {0x00001fff, 0x00001fff, 0x00001fff, 0x00001fff, 0x00001fff, 0x00001fff},
 +  {0x00003fff, 0x00003fff, 0x00003fff, 0x00003fff, 0x00003fff, 0x00003fff},
 +  {0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff},
 +  {0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff},
 +  {0x0001ffff, 0x0001ffff, 0x0001ffff, 0x0001ffff, 0x0001ffff, 0x0001ffff},
 +  {0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff},
 +  {0x0007ffff, 0x0007ffff, 0x0007ffff, 0x0007ffff, 0x0007ffff, 0x0007ffff},
 +  {0x000fffff, 0x000fffff, 0x000fffff, 0x000fffff, 0x000fffff, 0x000fffff},
 +  {0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff},
 +  {0x003fffff, 0x003fffff, 0x003fffff, 0x003fffff, 0x003fffff, 0x003fffff},
 +  {0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff},
 +  {0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff},
 +  {0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff},
 +  {0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff},
 +  {0x07ffffff, 0x07ffffff, 0x07ffffff, 0x07ffffff, 0x07ffffff, 0x07ffffff},
 +  {0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff},
 +  {0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff},
 +  {0x3fffffff, 0x3fffffff, 0x3fffffff, 0x3fffffff, 0x3fffffff, 0x3fffffff},
 +  {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}
 +};
 +
 +/*
 + * GetPreferredReplicationNode
 + * Pick any Datanode from given list, however fetch a preferred node first.
 + * Returns a single-element integer list; errors out on an empty input list.
 + */
 +List *
 +GetPreferredReplicationNode(List *relNodes)
 +{
 +	ListCell	*item;
 +	int			nodeid = -1;
 +
 +	if (list_length(relNodes) <= 0)
 +		elog(ERROR, "a list of nodes should have at least one node");
 +
 +	/* Take the first list entry that matches one of the preferred nodes */
 +	foreach(item, relNodes)
 +	{
 +		int cnt_nodes;
 +		char nodetype = PGXC_NODE_DATANODE;
 +		for (cnt_nodes = 0;
 +				cnt_nodes < num_preferred_data_nodes && nodeid < 0;
 +				cnt_nodes++)
 +		{
 +			if (PGXCNodeGetNodeId(preferred_data_node[cnt_nodes],
 +								  &nodetype) == lfirst_int(item))
 +				nodeid = lfirst_int(item);
 +		}
 +		if (nodeid >= 0)
 +			break;
 +	}
 +	/* No preferred node in the list: pick one at random for load balancing */
 +	if (nodeid < 0)
 +		return list_make1_int(list_nth_int(relNodes,
 +					((unsigned int) random()) % list_length(relNodes)));
 +
 +	return list_make1_int(nodeid);
 +}
 +
 +/*
 + * GetAnyDataNode
 + * Pick any data node from given set, but try a preferred node.
 + * Returns the chosen node id; the input set is not modified (a copy is
 + * consumed when scanning).
 + */
 +int
 +GetAnyDataNode(Bitmapset *nodes)
 +{
 +	Bitmapset  *preferred = NULL;
 +	int			i, nodeid;
 +	int			nmembers = 0;
 +	int			members[NumDataNodes];
 +
 +	/* Collect the preferred nodes that also appear in the given set */
 +	for (i = 0; i < num_preferred_data_nodes; i++)
 +	{
 +		char ntype = PGXC_NODE_DATANODE;
 +		nodeid = PGXCNodeGetNodeId(preferred_data_node[i], &ntype);
 +
 +		/* OK, found one */
 +		if (bms_is_member(nodeid, nodes))
 +			preferred = bms_add_member(preferred, nodeid);
 +	}
 +
 +	/*
 +	 * If no preferred data nodes or they are not in the desired set, pick up
 +	 * from the original set.
 +	 */
 +	if (bms_is_empty(preferred))
 +		preferred = bms_copy(nodes);
 +
 +	/*
 +	 * Load balance.
 +	 * We can not get item from the set, convert it to array
 +	 * (bms_first_member consumes the set as it scans)
 +	 */
 +	while ((nodeid = bms_first_member(preferred)) >= 0)
 +		members[nmembers++] = nodeid;
 +	bms_free(preferred);
 +
 +	/* If there is a single member nothing to balance */
 +	if (nmembers == 1)
 +		return members[0];
 +
 +	/*
 +	 * In general, the set may contain any number of nodes, and if we save
 +	 * previous returned index for load balancing the distribution won't be
 +	 * flat, because small set will probably reset saved value, and lower
 +	 * indexes will be picked up more often.
 +	 * So we just get a random value from 0..nmembers-1.
 +	 */
 +	return members[((unsigned int) random()) % nmembers];
 +}
 +
 +/*
 + * compute_modulo
 + * This function performs modulo in an optimized way
 + * It optimizes modulo of any positive number by
 + * 1,2,3,4,7,8,15,16,31,32,63,64 and so on
 + * for the rest of the denominators it uses % operator
 + * The optimized algos have been taken from
 + * http://www-graphics.stanford.edu/~seander/bithacks.html
 + */
 +static int
 +compute_modulo(unsigned int numerator, unsigned int denominator)
 +{
 +      unsigned int d;
 +      unsigned int m;
 +      unsigned int s;
 +      unsigned int mask;
 +      int k;
 +      unsigned int q, r;
 +
 +      if (numerator == 0)
 +              return 0;
 +
 +      /* Check if denominator is a power of 2 */
 +      if ((denominator & (denominator - 1)) == 0)
 +              return numerator & (denominator - 1);
 +
 +      /* Check if (denominator+1) is a power of 2 */
 +      d = denominator + 1;
 +      if ((d & (d - 1)) == 0)
 +      {
 +              /* Which power of 2 is this number */
 +              s = 0;
 +              mask = 0x01;
 +              for (k = 0; k < 32; k++)
 +              {
 +                      if ((d & mask) == mask)
 +                              break;
 +                      s++;
 +                      mask = mask << 1;
 +              }
 +
 +              m = (numerator & xc_mod_m[s]) + ((numerator >> s) & xc_mod_m[s]);
 +
 +              for (q = 0, r = 0; m > denominator; q++, r++)
 +                      m = (m >> xc_mod_q[s][q]) + (m & xc_mod_r[s][r]);
 +
 +              m = m == denominator ? 0 : m;
 +
 +              return m;
 +      }
 +      return numerator % denominator;
 +}
 +
 +/*
 + * GetRelationDistColumn - Returns the name of the hash or modulo distribution column
 + * First hash distribution is checked
 + * Retuens NULL if the table is neither hash nor modulo distributed
 + */
 +char *
 +GetRelationDistColumn(RelationLocInfo * rel_loc_info)
 +{
 +char *pColName;
 +
 +      pColName = NULL;
 +
 +      pColName = GetRelationHashColumn(rel_loc_info);
 +      if (pColName == NULL)
 +              pColName = GetRelationModuloColumn(rel_loc_info);
 +
 +      return pColName;
 +}
 +
 +/*
 + * Returns whether or not the data type is hash distributable with PG-XC
 + * PGXCTODO - expand support for other data types!
 + */
 +bool
 +IsTypeHashDistributable(Oid col_type)
 +{
 +      return (hash_func_ptr(col_type) != NULL);
 +}
 +
 +/*
 + * GetRelationHashColumn - return hash column for relation.
 + *
 + * Returns NULL if the relation is not hash partitioned.
 + */
 +char *
 +GetRelationHashColumn(RelationLocInfo * rel_loc_info)
 +{
 +      char       *column_str = NULL;
 +
 +      if (rel_loc_info == NULL)
 +              column_str = NULL;
 +      else if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH)
 +              column_str = NULL;
 +      else
 +      {
 +              int                     len = strlen(rel_loc_info->partAttrName);
 +
 +              column_str = (char *) palloc(len + 1);
 +              strncpy(column_str, rel_loc_info->partAttrName, len + 1);
 +      }
 +
 +      return column_str;
 +}
 +
 +/*
 + * IsHashColumn - return whether or not column for relation is hashed.
 + *
 + */
 +bool
 +IsHashColumn(RelationLocInfo *rel_loc_info, char *part_col_name)
 +{
 +      bool            ret_value = false;
 +
 +      if (!rel_loc_info || !part_col_name)
 +              ret_value = false;
 +      else if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH)
 +              ret_value = false;
 +      else
 +              ret_value = !strcmp(part_col_name, rel_loc_info->partAttrName);
 +
 +      return ret_value;
 +}
 +
 +
 +/*
 + * IsHashColumnForRelId - return whether or not column for relation is hashed.
 + *
 + */
 +bool
 +IsHashColumnForRelId(Oid relid, char *part_col_name)
 +{
 +      RelationLocInfo *rel_loc_info = GetRelationLocInfo(relid);
 +
 +      return IsHashColumn(rel_loc_info, part_col_name);
 +}
 +
 +/*
 + * IsDistColumnForRelId - return whether or not column for relation is used for hash or modulo distribution
 + *
 + */
 +bool
 +IsDistColumnForRelId(Oid relid, char *part_col_name)
 +{
 +      bool bRet;
 +      RelationLocInfo *rel_loc_info;
 +
 +      rel_loc_info = GetRelationLocInfo(relid);
 +      bRet = false;
 +
 +      bRet = IsHashColumn(rel_loc_info, part_col_name);
 +      if (bRet == false)
 +              IsModuloColumn(rel_loc_info, part_col_name);
 +      return bRet;
 +}
 +
 +
 +/*
 + * Returns whether or not the data type is modulo distributable with PG-XC
 + * PGXCTODO - expand support for other data types!
 + */
 +bool
 +IsTypeModuloDistributable(Oid col_type)
 +{
 +      return (modulo_value_len(col_type) != -1);
 +}
 +
 +/*
 + * GetRelationModuloColumn - return modulo column for relation.
 + *
 + * Returns NULL if the relation is not modulo partitioned.
 + */
 +char *
 +GetRelationModuloColumn(RelationLocInfo * rel_loc_info)
 +{
 +      char       *column_str = NULL;
 +
 +      if (rel_loc_info == NULL)
 +              column_str = NULL;
 +      else if (rel_loc_info->locatorType != LOCATOR_TYPE_MODULO)
 +              column_str = NULL;
 +      else
 +      {
 +              int     len = strlen(rel_loc_info->partAttrName);
 +
 +              column_str = (char *) palloc(len + 1);
 +              strncpy(column_str, rel_loc_info->partAttrName, len + 1);
 +      }
 +
 +      return column_str;
 +}
 +
 +/*
 + * IsModuloColumn - return whether or not column for relation is used for modulo distribution.
 + *
 + */
 +bool
 +IsModuloColumn(RelationLocInfo *rel_loc_info, char *part_col_name)
 +{
 +      bool            ret_value = false;
 +
 +      if (!rel_loc_info || !part_col_name)
 +              ret_value = false;
 +      else if (rel_loc_info->locatorType != LOCATOR_TYPE_MODULO)
 +              ret_value = false;
 +      else
 +              ret_value = !strcmp(part_col_name, rel_loc_info->partAttrName);
 +
 +      return ret_value;
 +}
 +
 +
 +/*
 + * IsModuloColumnForRelId - return whether or not column for relation is used for modulo distribution.
 + */
 +bool
 +IsModuloColumnForRelId(Oid relid, char *part_col_name)
 +{
 +      RelationLocInfo *rel_loc_info = GetRelationLocInfo(relid);
 +
 +      return IsModuloColumn(rel_loc_info, part_col_name);
 +}
 +
/*
 * Update the round robin node for the relation
 *
 * Returns the node index currently pointed to by the relcache entry's
 * round robin cursor, then advances the cursor to the next node in
 * rl_nodeList (wrapping back to the head at the end of the list).
 * Note this mutates the shared relcache entry.
 *
 * PGXCTODO - may not want to bother with locking here, we could track
 * these in the session memory context instead...
 */
int
GetRoundRobinNode(Oid relid)
{
	int			ret_node;
	Relation	rel = relation_open(relid, AccessShareLock);

	/* Only meaningful for replicated or round robin tables */
	Assert (IsLocatorReplicated(rel->rd_locator_info->locatorType) ||
			rel->rd_locator_info->locatorType == LOCATOR_TYPE_RROBIN);

	ret_node = lfirst_int(rel->rd_locator_info->roundRobinNode);

	/* Move round robin indicator to next node */
	if (rel->rd_locator_info->roundRobinNode->next != NULL)
		rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->roundRobinNode->next;
	else
		/* reset to first one */
		rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->rl_nodeList->head;

	relation_close(rel, AccessShareLock);

	return ret_node;
}
 +
 +/*
 + * IsTableDistOnPrimary
 + *
 + * Does the table distribution list include the primary node?
 + */
 +bool
 +IsTableDistOnPrimary(RelationLocInfo *rel_loc_info)
 +{
 +      ListCell *item;
 +
 +      if (!OidIsValid(primary_data_node) ||
 +              rel_loc_info == NULL ||
 +              list_length(rel_loc_info->rl_nodeList = 0))
 +              return false;
 +
 +      foreach(item, rel_loc_info->rl_nodeList)
 +      {
 +              char ntype = PGXC_NODE_DATANODE;
 +              if (PGXCNodeGetNodeId(primary_data_node, &ntype) == lfirst_int(item))
 +                      return true;
 +      }
 +      return false;
 +}
 +
 +
 +/*
 + * IsLocatorInfoEqual
 + * Check equality of given locator information
 + */
 +bool
 +IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2)
 +{
 +      List *nodeList1, *nodeList2;
 +      Assert(rel_loc_info1 && rel_loc_info2);
 +
 +      nodeList1 = rel_loc_info1->rl_nodeList;
 +      nodeList2 = rel_loc_info2->rl_nodeList;
 +
 +      /* Same relation? */
 +      if (rel_loc_info1->relid != rel_loc_info2->relid)
 +              return false;
 +
 +      /* Same locator type? */
 +      if (rel_loc_info1->locatorType != rel_loc_info2->locatorType)
 +              return false;
 +
 +      /* Same attribute number? */
 +      if (rel_loc_info1->partAttrNum != rel_loc_info2->partAttrNum)
 +              return false;
 +
 +      /* Same node list? */
 +      if (list_difference_int(nodeList1, nodeList2) != NIL ||
 +              list_difference_int(nodeList2, nodeList1) != NIL)
 +              return false;
 +
 +      /* Everything is equal */
 +      return true;
 +}
 +
 +/*
 + * ConvertToLocatorType
 + *            get locator distribution type
 + * We really should just have pgxc_class use disttype instead...
 + */
 +char
 +ConvertToLocatorType(int disttype)
 +{
 +      char            loctype = LOCATOR_TYPE_NONE;
 +
 +      switch (disttype)
 +      {
 +              case DISTTYPE_HASH:
 +                      loctype = LOCATOR_TYPE_HASH;
 +                      break;
 +              case DISTTYPE_ROUNDROBIN:
 +                      loctype = LOCATOR_TYPE_RROBIN;
 +                      break;
 +              case DISTTYPE_REPLICATION:
 +                      loctype = LOCATOR_TYPE_REPLICATED;
 +                      break;
 +              case DISTTYPE_MODULO:
 +                      loctype = LOCATOR_TYPE_MODULO;
 +                      break;
 +              default:
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_WRONG_OBJECT_TYPE),
 +                                       errmsg("Invalid distribution type")));
 +                      break;
 +      }
 +
 +      return loctype;
 +}
 +
 +
 +/*
 + * GetLocatorType - Returns the locator type of the table
 + *
 + */
 +char
 +GetLocatorType(Oid relid)
 +{
 +      char            ret = '\0';
 +
 +      RelationLocInfo *ret_loc_info = GetRelationLocInfo(relid);
 +
 +      if (ret_loc_info != NULL)
 +              ret = ret_loc_info->locatorType;
 +
 +      return ret;
 +}
 +
 +
 +/*
 + * Return a list of all Datanodes.
 + * We assume all tables use all nodes in the prototype, so just return a list
 + * from first one.
 + */
 +List *
 +GetAllDataNodes(void)
 +{
 +      int                     i;
 +      List       *nodeList = NIL;
 +
 +      for (i = 0; i < NumDataNodes; i++)
 +              nodeList = lappend_int(nodeList, i);
 +
 +      return nodeList;
 +}
 +
 +/*
 + * Return a list of all Coordinators
 + * This is used to send DDL to all nodes and to clean up pooler connections.
 + * Do not put in the list the local Coordinator where this function is launched.
 + */
 +List *
 +GetAllCoordNodes(void)
 +{
 +      int                     i;
 +      List       *nodeList = NIL;
 +
 +      for (i = 0; i < NumCoords; i++)
 +      {
 +              /*
 +               * Do not put in list the Coordinator we are on,
 +               * it doesn't make sense to connect to the local Coordinator.
 +               */
 +
 +              if (i != PGXCNodeId - 1)
 +                      nodeList = lappend_int(nodeList, i);
 +      }
 +
 +      return nodeList;
 +}
 +
 +
 +/*
 + * Build locator information associated with the specified relation.
 + */
 +void
 +RelationBuildLocator(Relation rel)
 +{
 +      Relation        pcrel;
 +      ScanKeyData     skey;
 +      SysScanDesc     pcscan;
 +      HeapTuple       htup;
 +      MemoryContext   oldContext;
 +      RelationLocInfo *relationLocInfo;
 +      int             j;
 +      Form_pgxc_class pgxc_class;
 +
 +      ScanKeyInit(&skey,
 +                              Anum_pgxc_class_pcrelid,
 +                              BTEqualStrategyNumber, F_OIDEQ,
 +                              ObjectIdGetDatum(RelationGetRelid(rel)));
 +
 +      pcrel = heap_open(PgxcClassRelationId, AccessShareLock);
 +      pcscan = systable_beginscan(pcrel, PgxcClassPgxcRelIdIndexId, true,
 +                                                              SnapshotSelf, 1, &skey);
 +      htup = systable_getnext(pcscan);
 +
 +      if (!HeapTupleIsValid(htup))
 +      {
 +              /* Assume local relation only */
 +              rel->rd_locator_info = NULL;
 +              systable_endscan(pcscan);
 +              heap_close(pcrel, AccessShareLock);
 +              return;
 +      }
 +
 +      pgxc_class = (Form_pgxc_class) GETSTRUCT(htup);
 +
 +      oldContext = MemoryContextSwitchTo(CacheMemoryContext);
 +
 +      relationLocInfo = (RelationLocInfo *) palloc(sizeof(RelationLocInfo));
 +      rel->rd_locator_info = relationLocInfo;
 +
 +      relationLocInfo->relid = RelationGetRelid(rel);
 +      relationLocInfo->locatorType = pgxc_class->pclocatortype;
 +
 +      relationLocInfo->partAttrNum = pgxc_class->pcattnum;
 +
 +      relationLocInfo->partAttrName = get_attname(relationLocInfo->relid, pgxc_class->pcattnum);
 +
 +      relationLocInfo->rl_nodeList = NIL;
 +
 +      for (j = 0; j < pgxc_class->nodeoids.dim1; j++)
 +      {
 +              char ntype = PGXC_NODE_DATANODE;
 +              int nid = PGXCNodeGetNodeId(pgxc_class->nodeoids.values[j], &ntype);
 +              relationLocInfo->rl_nodeList = lappend_int(relationLocInfo->rl_nodeList, nid);
 +      }
 +
 +      /*
 +       * If the locator type is round robin, we set a node to
 +       * use next time. In addition, if it is replicated,
 +       * we choose a node to use for balancing reads.
 +       */
 +      if (relationLocInfo->locatorType == LOCATOR_TYPE_RROBIN
 +              || IsLocatorReplicated(relationLocInfo->locatorType))
 +      {
 +              int offset;
 +              /*
 +               * pick a random one to start with,
 +               * since each process will do this independently
 +               */
 +              offset = compute_modulo(abs(rand()), list_length(relationLocInfo->rl_nodeList));
 +
 +              srand(time(NULL));
 +              relationLocInfo->roundRobinNode = relationLocInfo->rl_nodeList->head; /* initialize */
 +              for (j = 0; j < offset && relationLocInfo->roundRobinNode->next != NULL; j++)
 +                      relationLocInfo->roundRobinNode = relationLocInfo->roundRobinNode->next;
 +      }
 +
 +      systable_endscan(pcscan);
 +      heap_close(pcrel, AccessShareLock);
 +
 +      MemoryContextSwitchTo(oldContext);
 +}
 +
 +/*
 + * GetLocatorRelationInfo - Returns the locator information for relation,
 + * in a copy of the RelationLocatorInfo struct in relcache
 + */
 +RelationLocInfo *
 +GetRelationLocInfo(Oid relid)
 +{
 +      RelationLocInfo *ret_loc_info = NULL;
 +      Relation        rel = relation_open(relid, AccessShareLock);
 +
 +      /* Relation needs to be valid */
 +      Assert(rel->rd_isvalid);
 +
 +      if (rel->rd_locator_info)
 +              ret_loc_info = CopyRelationLocInfo(rel->rd_locator_info);
 +
 +      relation_close(rel, AccessShareLock);
 +
 +      return ret_loc_info;
 +}
 +
 +/*
 + * Get the distribution type of relation.
 + */
 +char
 +GetRelationLocType(Oid relid)
 +{
 +      RelationLocInfo *locinfo = GetRelationLocInfo(relid);
 +      if (!locinfo)
 +              return LOCATOR_TYPE_NONE;
 +
 +      return locinfo->locatorType;
 +}
 +
 +/*
 + * Copy the RelationLocInfo struct
 + */
 +RelationLocInfo *
 +CopyRelationLocInfo(RelationLocInfo * src_info)
 +{
 +      RelationLocInfo *dest_info;
 +
 +      Assert(src_info);
 +
 +      dest_info = (RelationLocInfo *) palloc0(sizeof(RelationLocInfo));
 +
 +      dest_info->relid = src_info->relid;
 +      dest_info->locatorType = src_info->locatorType;
 +      dest_info->partAttrNum = src_info->partAttrNum;
 +      if (src_info->partAttrName)
 +              dest_info->partAttrName = pstrdup(src_info->partAttrName);
 +
 +      if (src_info->rl_nodeList)
 +              dest_info->rl_nodeList = list_copy(src_info->rl_nodeList);
 +      /* Note, for round robin, we use the relcache entry */
 +
 +      return dest_info;
 +}
 +
 +
 +/*
 + * Free RelationLocInfo struct
 + */
 +void
 +FreeRelationLocInfo(RelationLocInfo *relationLocInfo)
 +{
 +      if (relationLocInfo)
 +      {
 +              if (relationLocInfo->partAttrName)
 +                      pfree(relationLocInfo->partAttrName);
 +              pfree(relationLocInfo);
 +      }
 +}
 +
 +
 +/*
 + * Free the contents of the ExecNodes expression */
 +void
 +FreeExecNodes(ExecNodes **exec_nodes)
 +{
 +      ExecNodes *tmp_en = *exec_nodes;
 +
 +      /* Nothing to do */
 +      if (!tmp_en)
 +              return;
 +      list_free(tmp_en->primarynodelist);
 +      list_free(tmp_en->nodeList);
 +      pfree(tmp_en);
 +      *exec_nodes = NULL;
 +}
 +
 +
 +#ifdef XCP
 +/*
 + * Determine value length in bytes for specified type for a module locator.
 + * Return -1 if module locator is not supported for the type.
 + */
 +static int
 +modulo_value_len(Oid dataType)
 +{
 +      switch (dataType)
 +      {
 +              case BOOLOID:
 +              case CHAROID:
 +                      return 1;
 +              case INT2OID:
 +                      return 2;
 +              case INT4OID:
 +              case ABSTIMEOID:
 +              case RELTIMEOID:
 +              case DATEOID:
 +                      return 4;
 +              default:
 +                      return -1;
 +      }
 +}
 +
 +
/*
 * hash_func_ptr
 *		Map a data type Oid to the hash function used by the HASH locator.
 *
 * Returns NULL when the type has no registered hash function; callers
 * treat that as "type is not hash distributable" (see
 * IsTypeHashDistributable and the HASH branch of createLocator).
 */
static LocatorHashFunc
hash_func_ptr(Oid dataType)
{
	switch (dataType)
	{
		case INT8OID:
		case CASHOID:
			return hashint8;
		case INT2OID:
			return hashint2;
		case OIDOID:
			return hashoid;
		case INT4OID:
		case ABSTIMEOID:
		case RELTIMEOID:
		case DATEOID:
			return hashint4;
		case BOOLOID:
		case CHAROID:
			return hashchar;
		case NAMEOID:
			return hashname;
		case VARCHAROID:
		case TEXTOID:
			return hashtext;
		case OIDVECTOROID:
			return hashoidvector;
		case BPCHAROID:
			return hashbpchar;
		case BYTEAOID:
			return hashvarlena;
		case TIMEOID:
			return time_hash;
		case TIMESTAMPOID:
		case TIMESTAMPTZOID:
			return timestamp_hash;
		case INTERVALOID:
			return interval_hash;
		case TIMETZOID:
			return timetz_hash;
		case NUMERICOID:
			return hash_numeric;
		case UUIDOID:
			return uuid_hash;
		default:
			return NULL;
	}
}
 +
 +
/*
 * createLocator
 *		Build a Locator object that maps values to nodes for the given
 *		locator type and access pattern.
 *
 * locatorType: LOCATOR_TYPE_REPLICATED / RROBIN / HASH / MODULO.
 * accessType:  how the relation is accessed; determines whether a single
 *              target node or the full node set is produced.
 * dataType:    type of the distribution column (for HASH/MODULO).
 * listType/nodeCount/nodeList: description of the candidate node set; a
 *              LOCATOR_LIST_LIST input is converted at creation time to
 *              the corresponding flat-array list type.
 * result:      if not NULL, receives a pointer to the locator's results
 *              array, which each locatefunc call refills.
 * primary:     currently unused here — NOTE(review): confirm whether
 *              primary-node handling is meant to be wired in later.
 *
 * Raises ERROR for unsupported data types or locator types.
 */
Locator *
createLocator(char locatorType, RelationAccessType accessType,
			  Oid dataType, LocatorListType listType, int nodeCount,
			  void *nodeList, void **result, bool primary)
{
	Locator    *locator;
	ListCell   *lc;
	void       *nodeMap = NULL;
	int			i;

	locator = (Locator *) palloc(sizeof(Locator));
	locator->dataType = dataType;
	locator->listType = listType;
	locator->nodeCount = nodeCount;
	/* Create node map */
	switch (listType)
	{
		case LOCATOR_LIST_NONE:
			/* No map, return indexes */
			break;
		case LOCATOR_LIST_INT:
			/* Copy integer array */
			nodeMap = palloc(nodeCount * sizeof(int));
			memcpy(nodeMap, nodeList, nodeCount * sizeof(int));
			break;
		case LOCATOR_LIST_OID:
			/* Copy array of Oids */
			nodeMap = palloc(nodeCount * sizeof(Oid));
			memcpy(nodeMap, nodeList, nodeCount * sizeof(Oid));
			break;
		case LOCATOR_LIST_POINTER:
			/* Copy array of pointers */
			nodeMap = palloc(nodeCount * sizeof(void *));
			memcpy(nodeMap, nodeList, nodeCount * sizeof(void *));
			break;
		case LOCATOR_LIST_LIST:
			/*
			 * Create map from list: flatten the List into an array and
			 * rewrite listType so later code only sees flat array kinds.
			 */
		{
			List *l = (List *) nodeList;
			locator->nodeCount = list_length(l);
			if (IsA(l, IntList))
			{
				int *intptr;
				nodeMap = palloc(locator->nodeCount * sizeof(int));
				intptr = (int *) nodeMap;
				foreach(lc, l)
					*intptr++ = lfirst_int(lc);
				locator->listType = LOCATOR_LIST_INT;
			}
			else if (IsA(l, OidList))
			{
				Oid *oidptr;
				nodeMap = palloc(locator->nodeCount * sizeof(Oid));
				oidptr = (Oid *) nodeMap;
				foreach(lc, l)
					*oidptr++ = lfirst_oid(lc);
				locator->listType = LOCATOR_LIST_OID;
			}
			else if (IsA(l, List))
			{
				void **voidptr;
				nodeMap = palloc(locator->nodeCount * sizeof(void *));
				voidptr = (void **) nodeMap;
				foreach(lc, l)
					*voidptr++ = lfirst(lc);
				locator->listType = LOCATOR_LIST_POINTER;
			}
			else
			{
				/* can not get here */
				Assert(false);
			}
			break;
		}
	}
	/*
	 * Determine locatefunc, allocate results, set up parameters
	 * specific to locator type.
	 *
	 * Two result layouts are used below:
	 *	 - single-target locators allocate room for exactly one entry;
	 *	 - whole-set locators alias results to nodeMap (or an array of
	 *	   nodeCount entries) so locate_static can return them all.
	 * freeLocator() relies on the results==nodeMap aliasing to avoid a
	 * double pfree.
	 */
	switch (locatorType)
	{
		case LOCATOR_TYPE_REPLICATED:
			if (accessType == RELATION_ACCESS_INSERT ||
					accessType == RELATION_ACCESS_UPDATE ||
					accessType == RELATION_ACCESS_READ_FQS)
			{
				/* Writes (and FQS reads) go to every replica */
				locator->locatefunc = locate_static;
				if (nodeMap == NULL)
				{
					/* no map, prepare array with indexes */
					int *intptr;
					nodeMap = palloc(locator->nodeCount * sizeof(int));
					intptr = (int *) nodeMap;
					for (i = 0; i < locator->nodeCount; i++)
						*intptr++ = i;
				}
				locator->nodeMap = nodeMap;
				locator->results = nodeMap;
			}
			else
			{
				/* SELECT, use random node.. */
				locator->locatefunc = locate_modulo_random;
				locator->nodeMap = nodeMap;
				switch (locator->listType)
				{
					case LOCATOR_LIST_NONE:
					case LOCATOR_LIST_INT:
						locator->results = palloc(sizeof(int));
						break;
					case LOCATOR_LIST_OID:
						locator->results = palloc(sizeof(Oid));
						break;
					case LOCATOR_LIST_POINTER:
						locator->results = palloc(sizeof(void *));
						break;
					case LOCATOR_LIST_LIST:
						/* Should never happen: rewritten above */
						Assert(false);
						break;
				}
				locator->roundRobinNode = -1;
			}
			break;
		case LOCATOR_TYPE_RROBIN:
			if (accessType == RELATION_ACCESS_INSERT)
			{
				/* Inserts rotate through the nodes one at a time */
				locator->locatefunc = locate_roundrobin;
				locator->nodeMap = nodeMap;
				switch (locator->listType)
				{
					case LOCATOR_LIST_NONE:
					case LOCATOR_LIST_INT:
						locator->results = palloc(sizeof(int));
						break;
					case LOCATOR_LIST_OID:
						locator->results = palloc(sizeof(Oid));
						break;
					case LOCATOR_LIST_POINTER:
						locator->results = palloc(sizeof(void *));
						break;
					case LOCATOR_LIST_LIST:
						/* Should never happen: rewritten above */
						Assert(false);
						break;
				}
				locator->roundRobinNode = -1;
			}
			else
			{
				/* Reads must visit every node of a round robin table */
				locator->locatefunc = locate_static;
				if (nodeMap == NULL)
				{
					/* no map, prepare array with indexes */
					int *intptr;
					nodeMap = palloc(locator->nodeCount * sizeof(int));
					intptr = (int *) nodeMap;
					for (i = 0; i < locator->nodeCount; i++)
						*intptr++ = i;
				}
				locator->nodeMap = nodeMap;
				locator->results = nodeMap;
			}
			break;
		case LOCATOR_TYPE_HASH:
			if (accessType == RELATION_ACCESS_INSERT)
			{
				/* Insert targets exactly one node */
				locator->locatefunc = locate_hash_insert;
				locator->nodeMap = nodeMap;
				switch (locator->listType)
				{
					case LOCATOR_LIST_NONE:
					case LOCATOR_LIST_INT:
						locator->results = palloc(sizeof(int));
						break;
					case LOCATOR_LIST_OID:
						locator->results = palloc(sizeof(Oid));
						break;
					case LOCATOR_LIST_POINTER:
						locator->results = palloc(sizeof(void *));
						break;
					case LOCATOR_LIST_LIST:
						/* Should never happen: rewritten above */
						Assert(false);
						break;
				}
			}
			else
			{
				/* Select may need room for the whole node set */
				locator->locatefunc = locate_hash_select;
				locator->nodeMap = nodeMap;
				switch (locator->listType)
				{
					case LOCATOR_LIST_NONE:
					case LOCATOR_LIST_INT:
						locator->results = palloc(locator->nodeCount * sizeof(int));
						break;
					case LOCATOR_LIST_OID:
						locator->results = palloc(locator->nodeCount * sizeof(Oid));
						break;
					case LOCATOR_LIST_POINTER:
						locator->results = palloc(locator->nodeCount * sizeof(void *));
						break;
					case LOCATOR_LIST_LIST:
						/* Should never happen: rewritten above */
						Assert(false);
						break;
				}
			}

			locator->hashfunc = hash_func_ptr(dataType);
			if (locator->hashfunc == NULL)
				ereport(ERROR, (errmsg("Error: unsupported data type for HASH locator: %d\n",
									   dataType)));
			break;
		case LOCATOR_TYPE_MODULO:
			if (accessType == RELATION_ACCESS_INSERT)
			{
				/* Insert targets exactly one node */
				locator->locatefunc = locate_modulo_insert;
				locator->nodeMap = nodeMap;
				switch (locator->listType)
				{
					case LOCATOR_LIST_NONE:
					case LOCATOR_LIST_INT:
						locator->results = palloc(sizeof(int));
						break;
					case LOCATOR_LIST_OID:
						locator->results = palloc(sizeof(Oid));
						break;
					case LOCATOR_LIST_POINTER:
						locator->results = palloc(sizeof(void *));
						break;
					case LOCATOR_LIST_LIST:
						/* Should never happen: rewritten above */
						Assert(false);
						break;
				}
			}
			else
			{
				/* Select may need room for the whole node set */
				locator->locatefunc = locate_modulo_select;
				locator->nodeMap = nodeMap;
				switch (locator->listType)
				{
					case LOCATOR_LIST_NONE:
					case LOCATOR_LIST_INT:
						locator->results = palloc(locator->nodeCount * sizeof(int));
						break;
					case LOCATOR_LIST_OID:
						locator->results = palloc(locator->nodeCount * sizeof(Oid));
						break;
					case LOCATOR_LIST_POINTER:
						locator->results = palloc(locator->nodeCount * sizeof(void *));
						break;
					case LOCATOR_LIST_LIST:
						/* Should never happen: rewritten above */
						Assert(false);
						break;
				}
			}

			locator->valuelen = modulo_value_len(dataType);
			if (locator->valuelen == -1)
				ereport(ERROR, (errmsg("Error: unsupported data type for MODULO locator: %d\n",
									   dataType)));
			break;
		default:
			ereport(ERROR, (errmsg("Error: no such supported locator type: %c\n",
									   locatorType)));
	}

	/* Expose the (reusable) results buffer to the caller if requested */
	if (result)
		*result = locator->results;

	return locator;
}
 +
 +
 +void
 +freeLocator(Locator *locator)
 +{
 +      pfree(locator->nodeMap);
 +      /*
 +       * locator->nodeMap and locator->results may point to the same memory,
 +       * do not free it twice
 +       */
 +      if (locator->results != locator->nodeMap)
 +              pfree(locator->results);
 +      pfree(locator);
 +}
 +
 +
/*
 * Each time return the same predefined results
 *
 * The results array was populated at createLocator() time (it aliases the
 * node map), so nothing is computed here; the return value tells the
 * caller how many entries of results[] to read.  The value/isnull
 * arguments are intentionally ignored.
 */
static int
locate_static(Locator *self, Datum value, bool isnull,
			  bool *hasprimary)
{
	/* TODO */
	if (hasprimary)
		*hasprimary = false;	/* NOTE(review): primary handling not implemented yet */
	return self->nodeCount;
}
 +
 +
 +/*
 + * Each time return one next node, in round robin manner.
 + *
 + * The round-robin counter is advanced before use, so successive calls
 + * cycle through all nodes; value and isnull are ignored.  Exactly one
 + * entry is written into the result array.
 + */
 +static int
 +locate_roundrobin(Locator *self, Datum value, bool isnull,
 +                                bool *hasprimary)
 +{
 +      /* TODO: primary-node detection is not implemented; report none */
 +      if (hasprimary)
 +              *hasprimary = false;
 +      if (++self->roundRobinNode >= self->nodeCount)
 +              self->roundRobinNode = 0;
 +      switch (self->listType)
 +      {
 +              case LOCATOR_LIST_NONE:
 +                      /* No node map: the result is the bare node index itself */
 +                      ((int *) self->results)[0] = self->roundRobinNode;
 +                      break;
 +              case LOCATOR_LIST_INT:
 +                      ((int *) self->results)[0] =
 +                                      ((int *) self->nodeMap)[self->roundRobinNode];
 +                      break;
 +              case LOCATOR_LIST_OID:
 +                      ((Oid *) self->results)[0] =
 +                                      ((Oid *) self->nodeMap)[self->roundRobinNode];
 +                      break;
 +              case LOCATOR_LIST_POINTER:
 +                      ((void **) self->results)[0] =
 +                                      ((void **) self->nodeMap)[self->roundRobinNode];
 +                      break;
 +              case LOCATOR_LIST_LIST:
 +                      /* Should never happen */
 +                      Assert(false);
 +                      break;
 +      }
 +      return 1;
 +}
 +
 +/*
 + * Each time return one node, in a random manner
 + * This is similar to locate_modulo_select, but that
 + * function does not use a random modulo..
 + */
 +static int
 +locate_modulo_random(Locator *self, Datum value, bool isnull,
 +                                bool *hasprimary)
 +{
 +      int offset;
 +
 +      if (hasprimary)
 +              *hasprimary = false;
 +
 +      Assert(self->nodeCount > 0);
 +      offset = compute_modulo(abs(rand()), self->nodeCount);
 +      switch (self->listType)
 +      {
 +              case LOCATOR_LIST_NONE:
 +                      ((int *) self->results)[0] = offset;
 +                      break;
 +              case LOCATOR_LIST_INT:
 +                      ((int *) self->results)[0] =
 +                                      ((int *) self->nodeMap)[offset];
 +                      break;
 +              case LOCATOR_LIST_OID:
 +                      ((Oid *) self->results)[0] =
 +                                      ((Oid *) self->nodeMap)[offset];
 +                      break;
 +              case LOCATOR_LIST_POINTER:
 +                      ((void **) self->results)[0] =
 +                                      ((void **) self->nodeMap)[offset];
 +                      break;
 +              case LOCATOR_LIST_LIST:
 +                      /* Should never happen */
 +                      Assert(false);
 +                      break;
 +      }
 +      return 1;
 +}
 +
 +/*
 + * Calculate hash from supplied value and use modulo by nodeCount as an index.
 + *
 + * A NULL value maps to index 0, so all NULL-keyed rows land on the first
 + * node of the map.  Exactly one entry is written into the result array.
 + */
 +static int
 +locate_hash_insert(Locator *self, Datum value, bool isnull,
 +                                 bool *hasprimary)
 +{
 +      int index;
 +      if (hasprimary)
 +              *hasprimary = false;
 +      if (isnull)
 +              index = 0;
 +      else
 +      {
 +              unsigned int hash32;
 +
 +              hash32 = (unsigned int) DatumGetInt32(DirectFunctionCall1(self->hashfunc, value));
 +
 +              index = compute_modulo(hash32, self->nodeCount);
 +      }
 +      switch (self->listType)
 +      {
 +              case LOCATOR_LIST_NONE:
 +                      /* No node map: the result is the bare node index itself */
 +                      ((int *) self->results)[0] = index;
 +                      break;
 +              case LOCATOR_LIST_INT:
 +                      ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
 +                      break;
 +              case LOCATOR_LIST_OID:
 +                      ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
 +                      break;
 +              case LOCATOR_LIST_POINTER:
 +                      ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
 +                      break;
 +              case LOCATOR_LIST_LIST:
 +                      /* Should never happen */
 +                      Assert(false);
 +                      break;
 +      }
 +      return 1;
 +}
 +
 +
 +/*
 + * Calculate hash from supplied value and use modulo by nodeCount as an index;
 + * if value is NULL assume no hint and return all the nodes.
 + *
 + * Note the asymmetry with locate_hash_insert: there a NULL maps to a single
 + * node (index 0), while here a NULL means the row could be anywhere, so the
 + * whole node map is copied into the result array.
 + */
 +static int
 +locate_hash_select(Locator *self, Datum value, bool isnull,
 +                                 bool *hasprimary)
 +{
 +      if (hasprimary)
 +              *hasprimary = false;
 +      if (isnull)
 +      {
 +              int i;
 +              switch (self->listType)
 +              {
 +                      case LOCATOR_LIST_NONE:
 +                              /* No node map: emit the bare node indexes 0..nodeCount-1 */
 +                              for (i = 0; i < self->nodeCount; i++)
 +                                      ((int *) self->results)[i] = i;
 +                              break;
 +                      case LOCATOR_LIST_INT:
 +                              memcpy(self->results, self->nodeMap,
 +                                         self->nodeCount * sizeof(int));
 +                              break;
 +                      case LOCATOR_LIST_OID:
 +                              memcpy(self->results, self->nodeMap,
 +                                         self->nodeCount * sizeof(Oid));
 +                              break;
 +                      case LOCATOR_LIST_POINTER:
 +                              memcpy(self->results, self->nodeMap,
 +                                         self->nodeCount * sizeof(void *));
 +                              break;
 +                      case LOCATOR_LIST_LIST:
 +                              /* Should never happen */
 +                              Assert(false);
 +                              break;
 +              }
 +              return self->nodeCount;
 +      }
 +      else
 +      {
 +              unsigned int hash32;
 +              int              index;
 +
 +              hash32 = (unsigned int) DatumGetInt32(DirectFunctionCall1(self->hashfunc, value));
 +
 +              index = compute_modulo(hash32, self->nodeCount);
 +              switch (self->listType)
 +              {
 +                      case LOCATOR_LIST_NONE:
 +                              ((int *) self->results)[0] = index;
 +                              break;
 +                      case LOCATOR_LIST_INT:
 +                              ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
 +                              break;
 +                      case LOCATOR_LIST_OID:
 +                              ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
 +                              break;
 +                      case LOCATOR_LIST_POINTER:
 +                              ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
 +                              break;
 +                      case LOCATOR_LIST_LIST:
 +                              /* Should never happen */
 +                              Assert(false);
 +                              break;
 +              }
 +              return 1;
 +      }
 +}
 +
 +
 +/*
 + * Use modulo of supplied value by nodeCount as an index.
 + *
 + * A NULL value maps to index 0.  self->valuelen was derived from the
 + * distribution column's type at locator creation; an unexpected length
 + * also degrades to index 0 (mod32 = 0) rather than erroring out.
 + */
 +static int
 +locate_modulo_insert(Locator *self, Datum value, bool isnull,
 +                                 bool *hasprimary)
 +{
 +      int index;
 +      if (hasprimary)
 +              *hasprimary = false;
 +      if (isnull)
 +              index = 0;
 +      else
 +      {
 +              unsigned int mod32;
 +
 +              if (self->valuelen == 4)
 +                      mod32 = (unsigned int) (GET_4_BYTES(value));
 +              else if (self->valuelen == 2)
 +                      mod32 = (unsigned int) (GET_2_BYTES(value));
 +              else if (self->valuelen == 1)
 +                      mod32 = (unsigned int) (GET_1_BYTE(value));
 +              else
 +                      mod32 = 0;
 +
 +              index = compute_modulo(mod32, self->nodeCount);
 +      }
 +      switch (self->listType)
 +      {
 +              case LOCATOR_LIST_NONE:
 +                      /* No node map: the result is the bare node index itself */
 +                      ((int *) self->results)[0] = index;
 +                      break;
 +              case LOCATOR_LIST_INT:
 +                      ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
 +                      break;
 +              case LOCATOR_LIST_OID:
 +                      ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
 +                      break;
 +              case LOCATOR_LIST_POINTER:
 +                      ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
 +                      break;
 +              case LOCATOR_LIST_LIST:
 +                      /* Should never happen */
 +                      Assert(false);
 +                      break;
 +      }
 +      return 1;
 +}
 +
 +
 +/*
 + * Use modulo of supplied value by nodeCount as an index;
 + * if value is NULL assume no hint and return all the nodes.
 + *
 + * Mirrors locate_hash_select, but uses the raw column bytes (per
 + * self->valuelen) instead of a hash function.  A NULL value yields the
 + * entire node map, since the matching rows could be on any node.
 + */
 +static int
 +locate_modulo_select(Locator *self, Datum value, bool isnull,
 +                                 bool *hasprimary)
 +{
 +      if (hasprimary)
 +              *hasprimary = false;
 +      if (isnull)
 +      {
 +              int i;
 +              switch (self->listType)
 +              {
 +                      case LOCATOR_LIST_NONE:
 +                              /* No node map: emit the bare node indexes 0..nodeCount-1 */
 +                              for (i = 0; i < self->nodeCount; i++)
 +                                      ((int *) self->results)[i] = i;
 +                              break;
 +                      case LOCATOR_LIST_INT:
 +                              memcpy(self->results, self->nodeMap,
 +                                         self->nodeCount * sizeof(int));
 +                              break;
 +                      case LOCATOR_LIST_OID:
 +                              memcpy(self->results, self->nodeMap,
 +                                         self->nodeCount * sizeof(Oid));
 +                              break;
 +                      case LOCATOR_LIST_POINTER:
 +                              memcpy(self->results, self->nodeMap,
 +                                         self->nodeCount * sizeof(void *));
 +                              break;
 +                      case LOCATOR_LIST_LIST:
 +                              /* Should never happen */
 +                              Assert(false);
 +                              break;
 +              }
 +              return self->nodeCount;
 +      }
 +      else
 +      {
 +              unsigned int mod32;
 +              int              index;
 +
 +              /* Unexpected valuelen degrades to 0, same as locate_modulo_insert */
 +              if (self->valuelen == 4)
 +                      mod32 = (unsigned int) (GET_4_BYTES(value));
 +              else if (self->valuelen == 2)
 +                      mod32 = (unsigned int) (GET_2_BYTES(value));
 +              else if (self->valuelen == 1)
 +                      mod32 = (unsigned int) (GET_1_BYTE(value));
 +              else
 +                      mod32 = 0;
 +
 +              index = compute_modulo(mod32, self->nodeCount);
 +
 +              switch (self->listType)
 +              {
 +                      case LOCATOR_LIST_NONE:
 +                              ((int *) self->results)[0] = index;
 +                              break;
 +                      case LOCATOR_LIST_INT:
 +                              ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
 +                              break;
 +                      case LOCATOR_LIST_OID:
 +                              ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
 +                              break;
 +                      case LOCATOR_LIST_POINTER:
 +                              ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
 +                              break;
 +                      case LOCATOR_LIST_LIST:
 +                              /* Should never happen */
 +                              Assert(false);
 +                              break;
 +              }
 +              return 1;
 +      }
 +}
 +
 +
 +/*
 + * GET_NODES
 + *    Dispatch to the locator's per-strategy function; returns the number
 + *    of target nodes written into the locator's result array.
 + */
 +int
 +GET_NODES(Locator *self, Datum value, bool isnull, bool *hasprimary)
 +{
 +      return (*self->locatefunc) (self, value, isnull, hasprimary);
 +}
 +
 +
 +/* Accessor: the locator's result array (type depends on its listType) */
 +void *
 +getLocatorResults(Locator *self)
 +{
 +      return self->results;
 +}
 +
 +
 +/* Accessor: the locator's node map (may alias the result array) */
 +void *
 +getLocatorNodeMap(Locator *self)
 +{
 +      return self->nodeMap;
 +}
 +
 +
 +/* Accessor: number of nodes the locator distributes over */
 +int
 +getLocatorNodeCount(Locator *self)
 +{
 +      return self->nodeCount;
 +}
 +#endif
 +
 +/*
 + * GetRelationNodes
 + *
 + * Get list of relation nodes
 + * If the table is replicated and we are reading, we can just pick one.
 + * If the table is partitioned, we apply partitioning column value, if possible.
 + *
 + * If the relation is partitioned, partValue will be applied if present
 + * (indicating a value appears for partitioning column), otherwise it
 + * is ignored.
 + *
 + * NOTE(review): an earlier revision of this function accepted a
 + * preferredNodes hint for replicated tables; no such parameter exists in
 + * this signature, so that behavior no longer applies here.
 + *
 + * The returned List is a copy, so it should be freed when finished.
 + */
 +ExecNodes *
 +GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol,
 +                              bool isValueNull,
 +                              RelationAccessType accessType)
 +{
 +      ExecNodes       *exec_nodes;
 +      int                     *nodenums;
 +      int                     i, count;
 +      Locator         *locator;
 +      Oid typeOfValueForDistCol = InvalidOid;
 +
 +      if (rel_loc_info == NULL)
 +              return NULL;
 +
 +
 +      if (IsLocatorDistributedByValue(rel_loc_info->locatorType))
 +      {
 +              /* A sufficient lock level needs to be taken at a higher level */
 +              Relation rel = relation_open(rel_loc_info->relid, NoLock);
 +              TupleDesc       tupDesc = RelationGetDescr(rel);
 +              Form_pg_attribute *attr = tupDesc->attrs;
 +              /* Get the hash type of relation */
 +              typeOfValueForDistCol = attr[rel_loc_info->partAttrNum - 1]->atttypid;
 +              relation_close(rel, NoLock);
 +      }
 +
 +      exec_nodes = makeNode(ExecNodes);
 +      exec_nodes->baselocatortype = rel_loc_info->locatorType;
 +      exec_nodes->accesstype = accessType;
 +
 +      /* Build a throwaway locator over the relation's node list ... */
 +      locator = createLocator(rel_loc_info->locatorType,
 +                                                      accessType,
 +                                                      typeOfValueForDistCol,
 +                                                      LOCATOR_LIST_LIST,
 +                                                      0,
 +                                                      (void *)rel_loc_info->rl_nodeList,
 +                                                      (void **)&nodenums,
 +                                                      false);
 +      /* ... and ask it which node indexes the value maps to */
 +      count = GET_NODES(locator, valueForDistCol, isValueNull, NULL);
 +
 +      for (i = 0; i < count; i++)
 +              exec_nodes->nodeList = lappend_int(exec_nodes->nodeList, nodenums[i]);
 +
 +      freeLocator(locator);
 +      return exec_nodes;
 +}
 +
 +/*
 + * GetRelationNodesByQuals
 + * A wrapper around GetRelationNodes to reduce the node list by looking at the
 + * quals. varno is assumed to be the varno of reloid inside the quals. No check
 + * is made to see if that's correct.
 + *
 + * If no constant distribution-column value can be extracted from the quals,
 + * a NULL/unknown value is passed down, which makes GetRelationNodes return
 + * all candidate nodes.
 + */
 +ExecNodes *
 +GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info,
 +                      Index varno, Node *quals, RelationAccessType relaccess)
 +{
 +      Expr                    *distcol_expr = NULL;
 +      ExecNodes               *exec_nodes;
 +      Datum                   distcol_value;
 +      bool                    distcol_isnull;
 +
 +      if (!rel_loc_info)
 +              return NULL;
 +      /*
 +       * If the table distributed by value, check if we can reduce the Datanodes
 +       * by looking at the qualifiers for this relation
 +       */
 +      if (IsRelationDistributedByValue(rel_loc_info))
 +      {
 +              Oid             disttype = get_atttype(reloid, rel_loc_info->partAttrNum);
 +              int32   disttypmod = get_atttypmod(reloid, rel_loc_info->partAttrNum);
 +              distcol_expr = pgxc_find_distcol_expr(varno, rel_loc_info->partAttrNum,
 +                                                                                                      quals);
 +              /*
 +               * If the type of expression used to find the Datanode, is not same as
 +               * the distribution column type, try casting it. This is same as what
 +               * will happen in case of inserting that type of expression value as the
 +               * distribution column value.
 +               */
 +              if (distcol_expr)
 +              {
 +                      distcol_expr = (Expr *)coerce_to_target_type(NULL,
 +                                                                                                      (Node *)distcol_expr,
 +                                                                                                      exprType((Node *)distcol_expr),
 +                                                                                                      disttype, disttypmod,
 +                                                                                                      COERCION_ASSIGNMENT,
 +                                                                                                      COERCE_IMPLICIT_CAST, -1);
 +                      /*
 +                       * PGXC_FQS_TODO: We should set the bound parameters here, but we don't have
 +                       * PlannerInfo struct and we don't handle them right now.
 +                       * Even if constant expression mutator changes the expression, it will
 +                       * only simplify it, keeping the semantics same
 +                       */
 +                      distcol_expr = (Expr *)eval_const_expressions(NULL,
 +                                                                                                                      (Node *)distcol_expr);
 +              }
 +      }
 +
 +      /* Only a simplified Const is usable as a node-selection hint */
 +      if (distcol_expr && IsA(distcol_expr, Const))
 +      {
 +              Const *const_expr = (Const *)distcol_expr;
 +              distcol_value = const_expr->constvalue;
 +              distcol_isnull = const_expr->constisnull;
 +      }
 +      else
 +      {
 +              distcol_value = (Datum) 0;
 +              distcol_isnull = true;
 +      }
 +
 +      exec_nodes = GetRelationNodes(rel_loc_info, distcol_value,
 +                                                                                              distcol_isnull,
 +                                                                                              relaccess);
 +      return exec_nodes;
 +}
 +
 +/*
 + * GetRelationDistribColumn
 + * Return distribution column name for relation, or NULL if the relation is
 + * not distributed with a key (e.g. replicated or round-robin).
 + */
 +char *
 +GetRelationDistribColumn(RelationLocInfo *locInfo)
 +{
 +      /* No relation, so simply leave */
 +      if (!locInfo)
 +              return NULL;
 +
 +      /* No distribution column if relation is not distributed with a key */
 +      if (!IsRelationDistributedByValue(locInfo))
 +              return NULL;
 +
 +      /* Return column name */
 +      return get_attname(locInfo->relid, locInfo->partAttrNum);
 +}
 +
 +/*
 + * pgxc_find_distcol_expr
 + * Search through the quals provided and find out an expression which will give
 + * us value of distribution column if exists in the quals. Say for a table
 + * tab1 (val int, val2 int) distributed by hash(val), a query "SELECT * FROM
 + * tab1 WHERE val = fn(x, y, z) and val2 = 3", fn(x,y,z) is the expression which
 + * decides the distribution column value in the rows qualified by this query.
 + * Hence return fn(x, y, z). But for a query "SELECT * FROM tab1 WHERE val =
 + * fn(x, y, z) || val2 = 3", there is no expression which decides the values
 + * distribution column val can take in the qualified rows. So, in such cases
 + * this function returns NULL.
 + */
 +static Expr *
 +pgxc_find_distcol_expr(Index varno,
 +                                         AttrNumber attrNum,
 +                                         Node *quals)
 +{
 +      List *lquals;
 +      ListCell *qual_cell;
 +
 +      /* If no quals, no distribution column expression */
 +      if (!quals)
 +              return NULL;
 +
 +      /* Convert the qualification into List if it's not already so */
 +      if (!IsA(quals, List))
 +              lquals = make_ands_implicit((Expr *)quals);
 +      else
 +              lquals = (List *)quals;
 +
 +      /*
 +       * For every ANDed expression, check if that expression is of the form
 +       * <distribution_col> = <expr>. If so return expr.
 +       */
 +      foreach(qual_cell, lquals)
 +      {
 +              Expr *qual_expr = (Expr *)lfirst(qual_cell);
 +              OpExpr *op;
 +              Expr *lexpr;
 +              Expr *rexpr;
 +              Var *var_expr;
 +              Expr *distcol_expr;
 +
 +              if (!IsA(qual_expr, OpExpr))
 +                      continue;
 +              op = (OpExpr *)qual_expr;
 +              /* If not a binary operator, it can not be '='. */
 +              if (list_length(op->args) != 2)
 +                      continue;
 +
 +              lexpr = linitial(op->args);
 +              rexpr = lsecond(op->args);
 +
 +              /*
 +               * If either of the operands is a RelabelType, extract the Var in the RelabelType.
 +               * A RelabelType represents a "dummy" type coercion between two binary compatible datatypes.
 +               * If we do not handle these then our optimization does not work in case of varchar
 +               * For example if col is of type varchar and is the dist key then
 +               * select * from vc_tab where col = 'abcdefghijklmnopqrstuvwxyz';
 +               * should be shipped to one of the nodes only
 +               */
 +              if (IsA(lexpr, RelabelType))
 +                      lexpr = ((RelabelType*)lexpr)->arg;
 +              if (IsA(rexpr, RelabelType))
 +                      rexpr = ((RelabelType*)rexpr)->arg;
 +
 +              /*
 +               * If either of the operands is a Var expression, assume the other
 +               * one is distribution column expression. If none is Var check next
 +               * qual.
 +               */
 +              if (IsA(lexpr, Var))
 +              {
 +                      var_expr = (Var *)lexpr;
 +                      distcol_expr = rexpr;
 +              }
 +              else if (IsA(rexpr, Var))
 +              {
 +                      var_expr = (Var *)rexpr;
 +                      distcol_expr = lexpr;
 +              }
 +              else
 +                      continue;
 +              /*
 +               * If Var found is not the distribution column of required relation,
 +               * check next qual
 +               */
 +              if (var_expr->varno != varno || var_expr->varattno != attrNum)
 +                      continue;
 +              /*
 +               * If the operator is not an assignment operator, check next
 +               * constraint. An operator is an assignment operator if it's
 +               * mergejoinable or hashjoinable. Beware that not every assignment
 +               * operator is mergejoinable or hashjoinable, so we might leave some
 +               * opportunity. But then we have to rely on the opname which may not
 +               * be something we know to be equality operator as well.
 +               */
 +              if (!op_mergejoinable(op->opno, exprType((Node *)lexpr)) &&
 +                      !op_hashjoinable(op->opno, exprType((Node *)lexpr)))
 +                      continue;
 +              /* Found the distribution column expression return it */
 +              return distcol_expr;
 +      }
 +      /* Exhausted all quals, but no distribution column expression */
 +      return NULL;
 +}
index b63b8cf44fd0c118df7daefb64acae4a79cd16c8,0000000000000000000000000000000000000000..8104e5ba4c5c8c0705efb78335d9fb4e24daf79b
mode 100644,000000..100644
--- /dev/null
@@@ -1,156 -1,0 +1,153 @@@
-       /* Do the insertion */
-       (void) simple_heap_insert(rel, tup);
-       CatalogUpdateIndexes(rel, tup);
 +/*-------------------------------------------------------------------------
 + *
 + * groupmgr.c
 + *      Routines to support manipulation of the pgxc_group catalog
 + *      This includes support for DDL on objects NODE GROUP
 + *
 + * Copyright (c) 1996-2010, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
 + *
 + *-------------------------------------------------------------------------
 + */
 +
 +#include "postgres.h"
 +#include "miscadmin.h"
 +
 +#include "access/heapam.h"
 +#include "access/htup_details.h"
 +#include "catalog/catalog.h"
 +#include "catalog/indexing.h"
 +#include "catalog/pg_type.h"
 +#include "catalog/pgxc_node.h"
 +#include "catalog/pgxc_group.h"
 +#include "nodes/parsenodes.h"
 +#include "nodes/pg_list.h"
 +#include "utils/builtins.h"
 +#include "utils/rel.h"
 +#include "utils/syscache.h"
 +#include "utils/lsyscache.h"
 +#include "utils/array.h"
 +#include "pgxc/groupmgr.h"
 +
 +/*
 + * PgxcGroupCreate
 + *
 + * Create a PGXC node group: validate the member node list (Datanodes only)
 + * and insert a new pgxc_group catalog tuple.  Superuser-only.
 + */
 +void
 +PgxcGroupCreate(CreateGroupStmt *stmt)
 +{
 +      const char *group_name = stmt->group_name;
 +      List       *nodes = stmt->nodes;
 +      oidvector  *nodes_array;
 +      Oid                *inTypes;
 +      Relation        rel;
 +      HeapTuple       tup;
 +      bool            nulls[Natts_pgxc_group];
 +      Datum           values[Natts_pgxc_group];
 +      int                     member_count = list_length(stmt->nodes);
 +      ListCell   *lc;
 +      int                     i = 0;
 +
 +      /* Only a DB administrator can add cluster node groups */
 +      if (!superuser())
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 +                               errmsg("must be superuser to create cluster node groups")));
 +
 +      /* Check if given group already exists */
 +      if (OidIsValid(get_pgxc_groupoid(group_name)))
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DUPLICATE_OBJECT),
 +                               errmsg("PGXC Group %s: group already defined",
 +                                              group_name)));
 +
 +      inTypes = (Oid *) palloc(member_count * sizeof(Oid));
 +
 +      /* Build list of Oids for each node listed */
 +      foreach(lc, nodes)
 +      {
 +              char   *node_name = strVal(lfirst(lc));
 +              Oid     noid = get_pgxc_nodeoid(node_name);
 +
 +              if (!OidIsValid(noid))
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_UNDEFINED_OBJECT),
 +                                       errmsg("PGXC Node %s: object not defined",
 +                                                      node_name)));
 +
 +              /*
 +               * NOTE(review): ERRCODE_SYNTAX_ERROR looks off for an
 +               * object-type restriction; ERRCODE_WRONG_OBJECT_TYPE would be
 +               * more conventional -- confirm before changing.
 +               */
 +              if (get_pgxc_nodetype(noid) != PGXC_NODE_DATANODE)
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_SYNTAX_ERROR),
 +                                       errmsg("PGXC node %s: only Datanodes can be group members",
 +                                                      node_name)));
 +
 +              /* OK to pick up Oid of this node */
 +              inTypes[i] = noid;
 +              i++;
 +      }
 +
 +      /* Build array of Oids to be inserted */
 +      nodes_array = buildoidvector(inTypes, member_count);
 +
 +      /* Iterate through all attributes initializing nulls and values */
 +      for (i = 0; i < Natts_pgxc_group; i++)
 +      {
 +              nulls[i]  = false;
 +              values[i] = (Datum) 0;
 +      }
 +
 +      /* Insert Data correctly */
 +      values[Anum_pgxc_group_name - 1] =
 +              DirectFunctionCall1(namein, CStringGetDatum(group_name));
 +      values[Anum_pgxc_group_members - 1] = PointerGetDatum(nodes_array);
 +
 +      /* Open the relation for insertion */
 +      rel = heap_open(PgxcGroupRelationId, RowExclusiveLock);
 +      tup = heap_form_tuple(rel->rd_att, values, nulls);
 +
 ++      CatalogTupleInsert(rel, tup);
 +
 +      heap_close(rel, RowExclusiveLock);
 +}
 +
 +
 +/*
 + * PgxcNodeGroupsRemove():
 + *
 + * Remove a PGXC node group
 + */
 +void
 +PgxcGroupRemove(DropGroupStmt *stmt)
 +{
 +      Relation        relation;
 +      HeapTuple       tup;
 +      const char *group_name = stmt->group_name;
 +      Oid                     group_oid = get_pgxc_groupoid(group_name);
 +
 +      /* Only a DB administrator can remove cluster node groups */
 +      if (!superuser())
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 +                               errmsg("must be superuser to remove cluster node groups")));
 +
 +      /* Check if group exists */
 +      if (!OidIsValid(group_oid))
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DUPLICATE_OBJECT),
 +                               errmsg("PGXC Group %s: group not defined",
 +                                              group_name)));
 +
 +      /* Delete the pgxc_group tuple */
 +      relation = heap_open(PgxcGroupRelationId, RowExclusiveLock);
 +      tup = SearchSysCache(PGXCGROUPOID, ObjectIdGetDatum(group_oid), 0, 0, 0);
 +
 +      if (!HeapTupleIsValid(tup)) /* should not happen */
 +              elog(ERROR, "PGXC Group %s: group not defined", group_name);
 +
 +      simple_heap_delete(relation, &tup->t_self);
 +
 +      ReleaseSysCache(tup);
 +
 +      heap_close(relation, RowExclusiveLock);
 +}
index 5ae6fe5f0567ca210006a015d92f69f8ebbdd1f8,0000000000000000000000000000000000000000..e6cc9af14b438f664d068cd244af04c2dd0148c7
mode 100644,000000..100644
--- /dev/null
@@@ -1,1032 -1,0 +1,1028 @@@
-       /* Insert tuple in catalog */
-       simple_heap_insert(pgxcnodesrel, htup);
-       CatalogUpdateIndexes(pgxcnodesrel, htup);
 +/*-------------------------------------------------------------------------
 + *
 + * nodemgr.c
 + *      Routines to support manipulation of the pgxc_node catalog
 + *      Support concerns CREATE/ALTER/DROP on NODE object.
 + *
 + * Copyright (c) 2010-2012 Postgres-XC Development Group
 + *
 + *-------------------------------------------------------------------------
 + */
 +
 +#include "postgres.h"
 +#include "miscadmin.h"
 +
 +#include "access/hash.h"
 +#include "access/heapam.h"
 +#include "access/htup_details.h"
 +#include "catalog/catalog.h"
 +#include "catalog/indexing.h"
 +#include "catalog/pgxc_node.h"
 +#include "commands/defrem.h"
 +#include "nodes/parsenodes.h"
 +#include "utils/builtins.h"
 +#include "utils/rel.h"
 +#include "utils/syscache.h"
 +#include "utils/lsyscache.h"
 +#include "utils/tqual.h"
 +#include "pgxc/locator.h"
 +#include "pgxc/nodemgr.h"
 +#include "pgxc/pgxc.h"
++#include "storage/lwlock.h"
++#include "storage/shmem.h"
 +
 +/*
 + * How many times should we try to find a unique indetifier
 + * in case hash of the node name comes out to be duplicate
 + */
 +
 +#define MAX_TRIES_FOR_NID     200
 +
 +static Datum generate_node_id(const char *node_name);
 +static void count_coords_datanodes(Relation rel, int *num_coord, int *num_dns);
 +
 +/*
 + * GUC parameters.
 + * Shared memory block can not be resized dynamically, so we should have some
 + * limits set at startup time to calculate amount of shared memory to store
 + * node table. Nodes can be added to running cluster until that limit is reached
 + * if cluster needs grow beyond the configuration value should be changed and
 + * cluster restarted.
 + */
 +int                           MaxCoords = 16;
 +int                           MaxDataNodes = 16;
 +
 +/* Global number of nodes. Point to a shared memory block */
 +static int       *shmemNumCoords;
 +static int       *shmemNumDataNodes;
 +
 +/* Shared memory tables of node definitions */
 +NodeDefinition *coDefs;
 +NodeDefinition *dnDefs;
 +
 +/*
 + * NodeTablesShmemInit
 + *    Initializes shared memory tables of Coordinators and Datanodes.
 + *
 + * Layout of each table: one int holding the current number of nodes,
 + * immediately followed by an array of NodeDefinition entries (up to
 + * MaxCoords / MaxDataNodes of them).  No locking is taken here;
 + * NOTE(review): assumes this runs during shared-memory creation with
 + * no concurrent access -- confirm against callers.
 + */
 +void
 +NodeTablesShmemInit(void)
 +{
 +      bool found;
 +      int i;
 +
 +      /*
 +       * Initialize the table of Coordinators: first sizeof(int) bytes are to
 +       * store actual number of Coordinators, remaining data in the structure is
 +       * array of NodeDefinition that can contain up to MaxCoords entries.
 +       * That is a bit weird and probably it would be better have these in
 +       * separate structures, but I am unsure about cost of having shmem structure
 +       * containing just single integer.
 +       */
 +      shmemNumCoords = ShmemInitStruct("Coordinator Table",
 +                                                              sizeof(int) +
 +                                                                      sizeof(NodeDefinition) * MaxCoords,
 +                                                              &found);
 +
 +      /* Have coDefs pointing right behind shmemNumCoords */
 +      coDefs = (NodeDefinition *) (shmemNumCoords + 1);
 +
 +      /*
 +       * Mark it empty upon creation; if the structure already existed
 +       * (found == true) its contents are preserved.
 +       */
 +      if (!found)
 +      {
 +              *shmemNumCoords = 0;
 +              /* Mark nodeishealthy true at init time for all */
 +              for (i = 0; i < MaxCoords; i++)
 +                      coDefs[i].nodeishealthy = true;
 +      }
 +
 +      /* Same for Datanodes */
 +      shmemNumDataNodes = ShmemInitStruct("Datanode Table",
 +                                                                 sizeof(int) +
 +                                                                         sizeof(NodeDefinition) * MaxDataNodes,
 +                                                                 &found);
 +
 +      /* Have dnDefs pointing right behind shmemNumDataNodes */
 +      dnDefs = (NodeDefinition *) (shmemNumDataNodes + 1);
 +
 +      /* Mark it empty upon creation */
 +      if (!found)
 +      {
 +              *shmemNumDataNodes = 0;
 +              /* Mark nodeishealthy true at init time for all */
 +              for (i = 0; i < MaxDataNodes; i++)
 +                      dnDefs[i].nodeishealthy = true;
 +      }
 +}
 +
 +
 +/*
 + * NodeTablesShmemSize
 + *    Get the size of shared memory dedicated to node definitions
 + *
 + * Must agree with the two ShmemInitStruct() requests made in
 + * NodeTablesShmemInit(): for each table, one int counter plus an
 + * array of NodeDefinition entries.
 + */
 +Size
 +NodeTablesShmemSize(void)
 +{
 +      Size co_size;
 +      Size dn_size;
 +
 +      /* Coordinator table: counter + MaxCoords definitions */
 +      co_size = mul_size(sizeof(NodeDefinition), MaxCoords);
 +      co_size = add_size(co_size, sizeof(int));
 +      /* Datanode table: counter + MaxDataNodes definitions */
 +      dn_size = mul_size(sizeof(NodeDefinition), MaxDataNodes);
 +      dn_size = add_size(dn_size, sizeof(int));
 +
 +      return add_size(co_size, dn_size);
 +}
 +
 +/*
 + * Check list of options and return things filled.
 + * This includes check on option values.
 + */
 +static void
 +check_node_options(const char *node_name, List *options, char **node_host,
 +                      int *node_port, char *node_type,
 +                      bool *is_primary, bool *is_preferred)
 +{
 +      ListCell   *option;
 +
 +      if (!options)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                               errmsg("No options specified")));
 +
 +      /* Filter options */
 +      foreach(option, options)
 +      {
 +              DefElem    *defel = (DefElem *) lfirst(option);
 +
 +              if (strcmp(defel->defname, "port") == 0)
 +              {
 +                      *node_port = defGetTypeLength(defel);
 +
 +                      if (*node_port < 1 || *node_port > 65535)
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
 +                                               errmsg("port value is out of range")));
 +              }
 +              else if (strcmp(defel->defname, "host") == 0)
 +              {
 +                      *node_host = defGetString(defel);
 +              }
 +              else if (strcmp(defel->defname, "type") == 0)
 +              {
 +                      char *type_loc;
 +
 +                      type_loc = defGetString(defel);
 +
 +                      if (strcmp(type_loc, "coordinator") != 0 &&
 +                              strcmp(type_loc, "datanode") != 0)
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 +                                               errmsg("type value is incorrect, specify 'coordinator or 'datanode'")));
 +
 +                      if (strcmp(type_loc, "coordinator") == 0)
 +                              *node_type = PGXC_NODE_COORDINATOR;
 +                      else
 +                              *node_type = PGXC_NODE_DATANODE;
 +              }
 +              else if (strcmp(defel->defname, "primary") == 0)
 +              {
 +                      *is_primary = defGetBoolean(defel);
 +              }
 +              else if (strcmp(defel->defname, "preferred") == 0)
 +              {
 +                      *is_preferred = defGetBoolean(defel);
 +              }
 +              else
 +              {
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_SYNTAX_ERROR),
 +                                       errmsg("incorrect option: %s", defel->defname)));
 +              }
 +      }
 +
 +      /* A primary node has to be a Datanode */
 +      if (*is_primary && *node_type != PGXC_NODE_DATANODE)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                               errmsg("PGXC node %s: cannot be a primary node, it has to be a Datanode",
 +                                              node_name)));
 +
 +      /* A preferred node has to be a Datanode */
 +      if (*is_preferred && *node_type != PGXC_NODE_DATANODE)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                               errmsg("PGXC node %s: cannot be a preferred node, it has to be a Datanode",
 +                                              node_name)));
 +
 +      /* Node type check */
 +      if (*node_type == PGXC_NODE_NONE)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                               errmsg("PGXC node %s: Node type not specified",
 +                                              node_name)));
 +
 +#ifdef XCP
 +      if (*node_type == PGXC_NODE_DATANODE && NumDataNodes >= MaxDataNodes)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 +                               errmsg("Too many datanodes, current value of max_datanodes is %d",
 +                                              MaxDataNodes)));
 +
 +#endif
 +}
 +
 +/*
 + * generate_node_id
 + *
 + * Given a node name, compute its hash to generate the identifier.
 + * If the hash turns out to be a duplicate, try some other values
 + * (linear probing up or down); give up after MAX_TRIES_FOR_NID tries.
 + */
 +static Datum
 +generate_node_id(const char *node_name)
 +{
 +      Datum           node_id;
 +      uint32          n;
 +      bool            inc;
 +      int             i;
 +
 +      /* Compute node identifier by computing hash of node name */
 +      node_id = hash_any((unsigned char *)node_name, strlen(node_name));
 +
 +      /*
 +       * Check if the hash is near the overflow limit: then we will
 +       * decrement on collisions, otherwise we will increment.  This
 +       * guarantees the probe sequence cannot wrap around uint32.
 +       */
 +      inc = true;
 +      n = DatumGetUInt32(node_id);
 +      if (n >= UINT_MAX - MAX_TRIES_FOR_NID)
 +              inc = false;
 +
 +      /*
 +       * Check if the identifier is clashing with an existing one
 +       * (syscache lookup on PGXCNODEIDENTIFIER), and if it is,
 +       * try the next candidate.
 +       */
 +      for (i = 0; i < MAX_TRIES_FOR_NID; i++)
 +      {
 +              HeapTuple       tup;
 +
 +              tup = SearchSysCache1(PGXCNODEIDENTIFIER, node_id);
 +              if (tup == NULL)
 +                      break;
 +
 +              ReleaseSysCache(tup);
 +
 +              n = DatumGetUInt32(node_id);
 +              if (inc)
 +                      n++;
 +              else
 +                      n--;
 +
 +              node_id = UInt32GetDatum(n);
 +      }
 +
 +      /*
 +       * This has really few chances to happen, but inform backend that node
 +       * has not been registered correctly in this case.
 +       */
 +      if (i >= MAX_TRIES_FOR_NID)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
 +                               errmsg("Please choose different node name."),
 +                               errdetail("Name \"%s\" produces a duplicate identifier node_name",
 +                                                 node_name)));
 +
 +      return node_id;
 +}
 +
 +/* --------------------------------
 + *  cmp_nodes
 + *
 + *  Compare the Oids of two XC nodes
 + *  to sort them in ascending order by their names
 + * --------------------------------
 + */
 +static int
 +cmp_nodes(const void *p1, const void *p2)
 +{
 +      Oid n1 = *((Oid *)p1);
 +      Oid n2 = *((Oid *)p2);
 +
 +      if (strcmp(get_pgxc_nodename(n1), get_pgxc_nodename(n2)) < 0)
 +              return -1;
 +
 +      if (strcmp(get_pgxc_nodename(n1), get_pgxc_nodename(n2)) == 0)
 +              return 0;
 +
 +      return 1;
 +}
 +
 +/*
 + * Count the number of coordinators and datanodes configured so far.
 + *
 + * Scans the given pgxc_node relation (caller must already hold an
 + * adequate lock on it) and returns the per-type counts through
 + * *num_coord and *num_dns.
 + */
 +static void
 +count_coords_datanodes(Relation rel, int *num_coord, int *num_dns)
 +{
 +      int                     coordCount = 0, dnCount = 0;
 +      HeapScanDesc scan;
 +      HeapTuple   tuple;
 +
 +      /*
 +       * NOTE(review): SnapshotSelf makes rows changed earlier in this
 +       * command visible -- presumably so a just-inserted node is counted;
 +       * confirm against callers.
 +       */
 +      scan = heap_beginscan(rel, SnapshotSelf, 0, NULL);
 +      while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 +      {
 +              Form_pgxc_node  nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
 +
 +              /* Take definition for given node type */
 +              switch (nodeForm->node_type)
 +              {
 +                      case PGXC_NODE_COORDINATOR:
 +                              coordCount++;
 +                              break;
 +                      case PGXC_NODE_DATANODE:
 +                              dnCount++;
 +                              break;
 +                      default:
 +                              /* rows of any other type are ignored */
 +                              break;
 +              }
 +      }
 +      heap_endscan(scan);
 +
 +      *num_coord = coordCount;
 +      *num_dns = dnCount;
 +}
 +
 +/*
 + * PgxcNodeListAndCount
 + *
 + * Update node definitions in the shared memory tables from the catalog.
 + * Takes NodeTableLock exclusively for the whole rebuild; pre-existing
 + * health status is carried over by nodeoid.
 + */
 +void
 +PgxcNodeListAndCount(void)
 +{
 +      Relation rel;
 +      HeapScanDesc scan;
 +      HeapTuple   tuple;
 +      NodeDefinition *nodes = NULL;
 +      int     numNodes;
 +
 +      LWLockAcquire(NodeTableLock, LW_EXCLUSIVE);
 +
 +      numNodes = *shmemNumCoords + *shmemNumDataNodes;
 +
 +      Assert((*shmemNumCoords >= 0) && (*shmemNumDataNodes >= 0));
 +
 +      /*
 +       * Save the existing health status values because nodes
 +       * might get added or deleted here. We will save
 +       * nodeoid, status. No need to differentiate between
 +       * coords and datanodes since oids will be unique anyways
 +       */
 +      if (numNodes > 0)
 +      {
 +              nodes = (NodeDefinition*)palloc(numNodes * sizeof(NodeDefinition));
 +
 +              /* Snapshot both shared tables into the local array */
 +              if (*shmemNumCoords > 0)
 +                      memcpy(nodes, coDefs, *shmemNumCoords * sizeof(NodeDefinition));
 +
 +              if (*shmemNumDataNodes > 0)
 +                      memcpy(nodes + *shmemNumCoords, dnDefs,
 +                                 *shmemNumDataNodes * sizeof(NodeDefinition));
 +      }
 +
 +      /* Empty the shared tables; they are rebuilt from the catalog below */
 +      *shmemNumCoords = 0;
 +      *shmemNumDataNodes = 0;
 +
 +      /*
 +       * Node information initialization is made in one scan:
 +       * 1) Scan pgxc_node catalog to find the number of nodes for
 +       *      each node type and make proper allocations
 +       * 2) Then extract the node Oid
 +       * 3) Complete primary/preferred node information
 +       */
 +      rel = heap_open(PgxcNodeRelationId, AccessShareLock);
 +      scan = heap_beginscan(rel, SnapshotSelf, 0, NULL);
 +      while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 +      {
 +              Form_pgxc_node  nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
 +              NodeDefinition *node;
 +              int i;
 +
 +              /* Take definition for given node type */
 +              switch (nodeForm->node_type)
 +              {
 +                      case PGXC_NODE_COORDINATOR:
 +                              node = &coDefs[(*shmemNumCoords)++];
 +                              break;
 +                      case PGXC_NODE_DATANODE:
 +                      default:
 +                              node = &dnDefs[(*shmemNumDataNodes)++];
 +                              break;
 +              }
 +
 +              /* Populate the definition */
 +              node->nodeoid = HeapTupleGetOid(tuple);
 +              memcpy(&node->nodename, &nodeForm->node_name, NAMEDATALEN);
 +              memcpy(&node->nodehost, &nodeForm->node_host, NAMEDATALEN);
 +              node->nodeport = nodeForm->node_port;
 +              node->nodeisprimary = nodeForm->nodeis_primary;
 +              node->nodeispreferred = nodeForm->nodeis_preferred;
 +              /*
 +               * Copy over the health status from above for nodes that
 +               * existed before and after the refresh. If we do not find
 +               * entry for a nodeoid, we mark it as healthy.
 +               * (Linear search per node; fine for the small node counts
 +               * bounded by MaxCoords/MaxDataNodes.)
 +               */
 +              node->nodeishealthy = true;
 +              for (i = 0; i < numNodes; i++)
 +              {
 +                      if (nodes[i].nodeoid == node->nodeoid)
 +                      {
 +                              node->nodeishealthy = nodes[i].nodeishealthy;
 +                              break;
 +                      }
 +              }
 +      }
 +      heap_endscan(scan);
 +      heap_close(rel, AccessShareLock);
 +
 +      elog(DEBUG1, "Done pgxc_nodes scan: %d coordinators and %d datanodes",
 +                      *shmemNumCoords, *shmemNumDataNodes);
 +
 +      /* nodes is only allocated when there was something to save */
 +      if (numNodes)
 +              pfree(nodes);
 +
 +      /* Finally sort the lists */
 +      if (*shmemNumCoords > 1)
 +              qsort(coDefs, *shmemNumCoords, sizeof(NodeDefinition), cmp_nodes);
 +      if (*shmemNumDataNodes > 1)
 +              qsort(dnDefs, *shmemNumDataNodes, sizeof(NodeDefinition), cmp_nodes);
 +
 +      LWLockRelease(NodeTableLock);
 +}
 +
 +
 +/*
 + * PgxcNodeGetOids
 + *
 + * List into palloc'ed arrays Oids of Coordinators and Datanodes currently
 + * presented in the node table, as well as number of Coordinators and Datanodes.
 + * Any parameter may be NULL if caller is not interested in receiving
 + * appropriate results. Preferred and primary node information can be updated
 + * in session if requested (update_preferred).
 + *
 + * The output arrays are palloc'ed in the caller's memory context and
 + * sized to the counts observed under NodeTableLock.
 + */
 +void
 +PgxcNodeGetOids(Oid **coOids, Oid **dnOids,
 +                              int *num_coords, int *num_dns, bool update_preferred)
 +{
 +      /* Shared lock: we only read the node tables here */
 +      LWLockAcquire(NodeTableLock, LW_SHARED);
 +
 +      elog(DEBUG1, "Get OIDs from table: %d coordinators and %d datanodes",
 +                      *shmemNumCoords, *shmemNumDataNodes);
 +
 +      if (num_coords)
 +              *num_coords = *shmemNumCoords;
 +      if (num_dns)
 +              *num_dns = *shmemNumDataNodes;
 +
 +      if (coOids)
 +      {
 +              int i;
 +
 +              *coOids = (Oid *) palloc(*shmemNumCoords * sizeof(Oid));
 +              for (i = 0; i < *shmemNumCoords; i++)
 +                      (*coOids)[i] = coDefs[i].nodeoid;
 +      }
 +
 +      if (dnOids)
 +      {
 +              int i;
 +
 +              *dnOids = (Oid *) palloc(*shmemNumDataNodes * sizeof(Oid));
 +              for (i = 0; i < *shmemNumDataNodes; i++)
 +                      (*dnOids)[i] = dnDefs[i].nodeoid;
 +      }
 +
 +      /* Update also preferred and primary node informations if requested */
 +      if (update_preferred)
 +      {
 +              int i;
 +
 +              /* Initialize primary and preferred node information */
 +              primary_data_node = InvalidOid;
 +              num_preferred_data_nodes = 0;
 +
 +              for (i = 0; i < *shmemNumDataNodes; i++)
 +              {
 +                      if (dnDefs[i].nodeisprimary)
 +                              primary_data_node = dnDefs[i].nodeoid;
 +
 +                      if (dnDefs[i].nodeispreferred)
 +                      {
 +                              preferred_data_node[num_preferred_data_nodes] = dnDefs[i].nodeoid;
 +                              num_preferred_data_nodes++;
 +                      }
 +              }
 +      }
 +
 +      LWLockRelease(NodeTableLock);
 +}
 +
 +/*
 + * PgxcNodeGetHealthMap
 + *
 + * List into palloc'ed arrays Oids of Coordinators and Datanodes currently
 + * presented in the node table, as well as number of Coordinators and Datanodes.
 + * Any parameter may be NULL if caller is not interested in receiving
 + * appropriate results for either the Coordinators or Datanodes.
 + */
 +void
 +PgxcNodeGetHealthMap(Oid *coOids, Oid *dnOids,
 +                              int *num_coords, int *num_dns, bool *coHealthMap,
 +                              bool *dnHealthMap)
 +{
 +      elog(DEBUG1, "Get HealthMap from table: %d coordinators and %d datanodes",
 +                      *shmemNumCoords, *shmemNumDataNodes);
 +
 +      LWLockAcquire(NodeTableLock, LW_SHARED);
 +
 +      if (num_coords)
 +              *num_coords = *shmemNumCoords;
 +      if (num_dns)
 +              *num_dns = *shmemNumDataNodes;
 +
 +      if (coOids)
 +      {
 +              int i;
 +              for (i = 0; i < *shmemNumCoords; i++)
 +              {
 +                      coOids[i] = coDefs[i].nodeoid;
 +                      if (coHealthMap)
 +                              coHealthMap[i] = coDefs[i].nodeishealthy;
 +              }
 +      }
 +
 +      if (dnOids)
 +      {
 +              int i;
 +
 +              for (i = 0; i < *shmemNumDataNodes; i++)
 +              {
 +                      dnOids[i] = dnDefs[i].nodeoid;
 +                      if (dnHealthMap)
 +                              dnHealthMap[i] = dnDefs[i].nodeishealthy;
 +              }
 +      }
 +
 +      LWLockRelease(NodeTableLock);
 +}
 +
 +/*
 + * Consult the shared memory NodeDefinition structures and
 + * fetch the nodeishealthy value and return it back
 + *
 + * We will probably need a similar function for coordinators
 + * in the future..
 + */
 +void
 +PgxcNodeDnListHealth(List *nodeList, bool *healthmap)
 +{
 +      ListCell *lc;
 +      int index = 0;
 +
 +      elog(DEBUG1, "Get healthmap from datanodeList");
 +
 +      if (!nodeList || !list_length(nodeList))
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_UNDEFINED_OBJECT),
 +                               errmsg("NIL or empty nodeList passed")));
 +
 +      LWLockAcquire(NodeTableLock, LW_SHARED);
 +      foreach(lc, nodeList)
 +      {
 +              int node = lfirst_int(lc);
 +
 +              if (node >= *shmemNumDataNodes)
 +              {
 +                      LWLockRelease(NodeTableLock);
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_UNDEFINED_OBJECT),
 +                                       errmsg("PGXC health status not found for datanode with oid (%d)",
 +                                               node)));
 +              }
 +              healthmap[index++] = dnDefs[node].nodeishealthy;
 +      }
 +      LWLockRelease(NodeTableLock);
 +}
 +
 +/*
 + * Find node definition in the shared memory node table.
 + * The structure is a copy palloc'ed in current memory context.
 + * Returns NULL if the Oid is not present in either table.
 + */
 +NodeDefinition *
 +PgxcNodeGetDefinition(Oid node)
 +{
 +      NodeDefinition *result = NULL;
 +      int                             i;
 +
 +      /* Shared lock is enough: we only read and copy out */
 +      LWLockAcquire(NodeTableLock, LW_SHARED);
 +
 +      /* search through the Datanodes first */
 +      for (i = 0; i < *shmemNumDataNodes; i++)
 +      {
 +              if (dnDefs[i].nodeoid == node)
 +              {
 +                      result = (NodeDefinition *) palloc(sizeof(NodeDefinition));
 +
 +                      memcpy(result, dnDefs + i, sizeof(NodeDefinition));
 +
 +                      /* release before returning the private copy */
 +                      LWLockRelease(NodeTableLock);
 +
 +                      return result;
 +              }
 +      }
 +
 +      /* if not found, search through the Coordinators */
 +      for (i = 0; i < *shmemNumCoords; i++)
 +      {
 +              if (coDefs[i].nodeoid == node)
 +              {
 +                      result = (NodeDefinition *) palloc(sizeof(NodeDefinition));
 +
 +                      memcpy(result, coDefs + i, sizeof(NodeDefinition));
 +
 +                      LWLockRelease(NodeTableLock);
 +
 +                      return result;
 +              }
 +      }
 +
 +      /* not found, return NULL */
 +      LWLockRelease(NodeTableLock);
 +      return NULL;
 +}
 +
 +/*
 + * Update health status of a node in the shared memory node table.
 + *
 + * Returns true if the node was found (in either table) and its flag set,
 + * false if no entry matches the given Oid.
 + *
 + * We could try to optimize this by checking if the ishealthy value
 + * is already the same as the passed in one.. but if the cluster is
 + * impaired, dunno how much such optimizations are worth. So keeping
 + * it simple for now
 + */
 +bool
 +PgxcNodeUpdateHealth(Oid node, bool status)
 +{
 +      int                             i;
 +
 +      /* Exclusive lock: we write to the shared tables */
 +      LWLockAcquire(NodeTableLock, LW_EXCLUSIVE);
 +
 +      /* search through the Datanodes first */
 +      for (i = 0; i < *shmemNumDataNodes; i++)
 +      {
 +              if (dnDefs[i].nodeoid == node)
 +              {
 +                      dnDefs[i].nodeishealthy = status;
 +
 +                      LWLockRelease(NodeTableLock);
 +
 +                      return true;
 +              }
 +      }
 +
 +      /* if not found, search through the Coordinators */
 +      for (i = 0; i < *shmemNumCoords; i++)
 +      {
 +              if (coDefs[i].nodeoid == node)
 +              {
 +                      coDefs[i].nodeishealthy = status;
 +
 +                      LWLockRelease(NodeTableLock);
 +
 +                      return true;
 +              }
 +      }
 +
 +      /* not found, return false */
 +      LWLockRelease(NodeTableLock);
 +      return false;
 +}
 +
 +/*
 + * PgxcNodeCreate
 + *
 + * Add a PGXC node
 + */
 +void
 +PgxcNodeCreate(CreateNodeStmt *stmt)
 +{
 +      Relation        pgxcnodesrel;
 +      HeapTuple       htup;
 +      bool            nulls[Natts_pgxc_node];
 +      Datum           values[Natts_pgxc_node];
 +      const char *node_name = stmt->node_name;
 +      int             i;
 +      /* Options with default values */
 +      char       *node_host = NULL;
 +      char            node_type = PGXC_NODE_NONE;
 +      int                     node_port = 0;
 +      bool            is_primary = false;
 +      bool            is_preferred = false;
 +      Datum           node_id;
 +      int                     coordCount = 0, dnCount = 0;
 +
 +      /* Only a DB administrator can add nodes */
 +      if (!superuser())
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 +                               errmsg("must be superuser to create cluster nodes")));
 +
 +      /* Check that node name is node in use */
 +      if (OidIsValid(get_pgxc_nodeoid(node_name)))
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DUPLICATE_OBJECT),
 +                               errmsg("PGXC Node %s: object already defined",
 +                                              node_name)));
 +
 +      /* Check length of node name */
 +      if (strlen(node_name) > PGXC_NODENAME_LENGTH)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
 +                               errmsg("Node name \"%s\" is too long",
 +                                              node_name)));
 +
 +      /* Filter options */
 +      check_node_options(node_name, stmt->options, &node_host,
 +                              &node_port, &node_type,
 +                              &is_primary, &is_preferred);
 +
 +      /* Compute node identifier */
 +      node_id = generate_node_id(node_name);
 +
 +      /*
 +       * Check that this node is not created as a primary if one already
 +       * exists.
 +       */
 +      if (is_primary && OidIsValid(primary_data_node))
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                               errmsg("PGXC node %s: two nodes cannot be primary",
 +                                              node_name)));
 +
 +      /*
 +       * Then assign default values if necessary
 +       * First for port.
 +       */
 +      if (node_port == 0)
 +      {
 +              node_port = 5432;
 +              elog(DEBUG1, "PGXC node %s: Applying default port value: %d",
 +                       node_name, node_port);
 +      }
 +
 +      /* Then apply default value for host */
 +      if (!node_host)
 +      {
 +              node_host = strdup("localhost");
 +              elog(DEBUG1, "PGXC node %s: Applying default host value: %s",
 +                       node_name, node_host);
 +      }
 +
 +      /* Iterate through all attributes initializing nulls and values */
 +      for (i = 0; i < Natts_pgxc_node; i++)
 +      {
 +              nulls[i]  = false;
 +              values[i] = (Datum) 0;
 +      }
 +
 +      /*
 +       * Open the relation for insertion
 +       * This is necessary to generate a unique Oid for the new node
 +       * There could be a relation race here if a similar Oid
 +       * being created before the heap is inserted.
 +       */
 +      pgxcnodesrel = heap_open(PgxcNodeRelationId, AccessExclusiveLock);
 +
 +      /*
 +       * Get the count of datanodes and coordinators added so far and make sure
 +       * we're not exceeding the configured limits
 +       *
 +       * XXX This is not full proof because someone may first set
 +       * max_coordinators or max_datanodes to a high value, add nodes and then
 +       * lower the value again.
 +       */
 +      count_coords_datanodes(pgxcnodesrel, &coordCount, &dnCount);
 +
 +      if ((node_type == PGXC_NODE_DATANODE && dnCount >= MaxDataNodes) ||
 +              (node_type == PGXC_NODE_COORDINATOR && coordCount >= MaxCoords))
 +      {
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
 +                               errmsg("cannot add more than %d %s",
 +                                       node_type == PGXC_NODE_COORDINATOR ?
 +                                       MaxCoords : MaxDataNodes,
 +                                       node_type == PGXC_NODE_COORDINATOR ?
 +                                       "coordinators" : "datanodes"),
 +                               errhint("increase the value of %s GUC and restart the cluster",
 +                                       node_type == PGXC_NODE_COORDINATOR ?
 +                                       "max_coordinators" : "max_datanodes"
 +                                       )));
 +
 +      }
 +
 +      /* Build entry tuple */
 +      values[Anum_pgxc_node_name - 1] = DirectFunctionCall1(namein, CStringGetDatum(node_name));
 +      values[Anum_pgxc_node_type - 1] = CharGetDatum(node_type);
 +      values[Anum_pgxc_node_port - 1] = Int32GetDatum(node_port);
 +      values[Anum_pgxc_node_host - 1] = DirectFunctionCall1(namein, CStringGetDatum(node_host));
 +      values[Anum_pgxc_node_is_primary - 1] = BoolGetDatum(is_primary);
 +      values[Anum_pgxc_node_is_preferred - 1] = BoolGetDatum(is_preferred);
 +      values[Anum_pgxc_node_id - 1] = node_id;
 +
 +      htup = heap_form_tuple(pgxcnodesrel->rd_att, values, nulls);
 +
-       simple_heap_update(rel, &oldtup->t_self, newtup);
-       /* Update indexes */
-       CatalogUpdateIndexes(rel, newtup);
++      CatalogTupleInsert(pgxcnodesrel, htup);
 +
 +      heap_close(pgxcnodesrel, AccessExclusiveLock);
 +}
 +
 +/*
 + * PgxcNodeAlter
 + *
 + * Alter a PGXC node
 + */
 +void
 +PgxcNodeAlter(AlterNodeStmt *stmt)
 +{
 +      const char *node_name = stmt->node_name;
 +      char       *node_host;
 +      char            node_type;
 +      int                     node_port;
 +      bool            is_preferred;
 +      bool            is_primary;
 +      HeapTuple       oldtup, newtup;
 +      Oid                     nodeOid = get_pgxc_nodeoid(node_name);
 +      Relation        rel;
 +      Datum           new_record[Natts_pgxc_node];
 +      bool            new_record_nulls[Natts_pgxc_node];
 +      bool            new_record_repl[Natts_pgxc_node];
 +      uint32          node_id;
 +      int                     coordCount = 0, dnCount = 0;
 +
 +      /* Only a DB administrator can alter cluster nodes */
 +      if (!superuser())
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 +                               errmsg("must be superuser to change cluster nodes")));
 +
 +      /* Look at the node tuple, and take exclusive lock on it */
 +      rel = heap_open(PgxcNodeRelationId, AccessExclusiveLock);
 +
 +      /* Check that node exists */
 +      if (!OidIsValid(nodeOid))
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_UNDEFINED_OBJECT),
 +                               errmsg("PGXC Node %s: object not defined",
 +                                              node_name)));
 +
 +      /* Open new tuple, checks are performed on it and new values */
 +      oldtup = SearchSysCacheCopy1(PGXCNODEOID, ObjectIdGetDatum(nodeOid));
 +      if (!HeapTupleIsValid(oldtup))
 +              elog(ERROR, "cache lookup failed for object %u", nodeOid);
 +
 +      /*
 +       * check_options performs some internal checks on option values
 +       * so set up values.
 +       */
 +      node_host = get_pgxc_nodehost(nodeOid);
 +      node_port = get_pgxc_nodeport(nodeOid);
 +      is_preferred = is_pgxc_nodepreferred(nodeOid);
 +      is_primary = is_pgxc_nodeprimary(nodeOid);
 +      node_type = get_pgxc_nodetype(nodeOid);
 +      node_id = get_pgxc_node_id(nodeOid);
 +
 +      /* Filter options */
 +      check_node_options(node_name, stmt->options, &node_host,
 +                              &node_port, &node_type,
 +                              &is_primary, &is_preferred);
 +
 +      /*
 +       * Two nodes cannot be primary at the same time. If the primary
 +       * node is this node itself, well there is no point in having an
 +       * error.
 +       */
 +      if (is_primary &&
 +              OidIsValid(primary_data_node) &&
 +              nodeOid != primary_data_node)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                               errmsg("PGXC node %s: two nodes cannot be primary",
 +                                              node_name)));
 +
 +      /*
 +       * Get the count of datanodes and coordinators added so far and make sure
 +       * we're not exceeding the configured limits
 +       */
 +      count_coords_datanodes(rel, &coordCount, &dnCount);
 +
 +      if ((node_type == PGXC_NODE_DATANODE && dnCount >= MaxDataNodes) ||
 +              (node_type == PGXC_NODE_COORDINATOR && coordCount >= MaxCoords))
 +      {
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
 +                               errmsg("cannot add more than %d %s",
 +                                       node_type == PGXC_NODE_COORDINATOR ?
 +                                       MaxCoords : MaxDataNodes,
 +                                       node_type == PGXC_NODE_COORDINATOR ?
 +                                       "coordinators" : "datanodes"),
 +                               errhint("increase the value of %s GUC and restart the cluster",
 +                                       node_type == PGXC_NODE_COORDINATOR ?
 +                                       "max_coordinators" : "max_datanodes"
 +                                       )));
 +
 +      }
 +
 +      /* Update values for catalog entry */
 +      MemSet(new_record, 0, sizeof(new_record));
 +      MemSet(new_record_nulls, false, sizeof(new_record_nulls));
 +      MemSet(new_record_repl, false, sizeof(new_record_repl));
 +      new_record[Anum_pgxc_node_port - 1] = Int32GetDatum(node_port);
 +      new_record_repl[Anum_pgxc_node_port - 1] = true;
 +      new_record[Anum_pgxc_node_host - 1] =
 +              DirectFunctionCall1(namein, CStringGetDatum(node_host));
 +      new_record_repl[Anum_pgxc_node_host - 1] = true;
 +      new_record[Anum_pgxc_node_type - 1] = CharGetDatum(node_type);
 +      new_record_repl[Anum_pgxc_node_type - 1] = true;
 +      new_record[Anum_pgxc_node_is_primary - 1] = BoolGetDatum(is_primary);
 +      new_record_repl[Anum_pgxc_node_is_primary - 1] = true;
 +      new_record[Anum_pgxc_node_is_preferred - 1] = BoolGetDatum(is_preferred);
 +      new_record_repl[Anum_pgxc_node_is_preferred - 1] = true;
 +      new_record[Anum_pgxc_node_id - 1] = UInt32GetDatum(node_id);
 +      new_record_repl[Anum_pgxc_node_id - 1] = true;
 +
 +      /* Update relation */
 +      newtup = heap_modify_tuple(oldtup, RelationGetDescr(rel),
 +                                                         new_record,
 +                                                         new_record_nulls, new_record_repl);
++      CatalogTupleUpdate(rel, &oldtup->t_self, newtup);
 +
 +      /* Release lock at Commit */
 +      heap_close(rel, NoLock);
 +}
 +
 +
 +/*
 + * PgxcNodeRemove
 + *
 + * Remove a PGXC node
 + */
 +void
 +PgxcNodeRemove(DropNodeStmt *stmt)
 +{
 +      Relation        relation;
 +      HeapTuple       tup;
 +      const char      *node_name = stmt->node_name;
 +      Oid             noid = get_pgxc_nodeoid(node_name);
 +
 +      /* Only a DB administrator can remove cluster nodes */
 +      if (!superuser())
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 +                               errmsg("must be superuser to remove cluster nodes")));
 +
 +      /* Check if node is defined */
 +      if (!OidIsValid(noid))
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_UNDEFINED_OBJECT),
 +                               errmsg("PGXC Node %s: object not defined",
 +                                              node_name)));
 +
 +      if (strcmp(node_name, PGXCNodeName) == 0)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                               errmsg("PGXC Node %s: cannot drop local node",
 +                                              node_name)));
 +
 +      /* PGXCTODO:
 +       * Is there any group which has this node as member
 +       * XC Tables will also have this as a member in their array
 +       * Do this search in the local data structure.
 +       * If a node is removed, it is necessary to check if there is a distributed
 +       * table on it. If there are only replicated table it is OK.
 +       * However, we have to be sure that there are no pooler agents in the cluster pointing to it.
 +       */
 +
 +      /* Delete the pgxc_node tuple */
 +      relation = heap_open(PgxcNodeRelationId, RowExclusiveLock);
 +      tup = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(noid));
 +      if (!HeapTupleIsValid(tup)) /* should not happen */
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_UNDEFINED_OBJECT),
 +                               errmsg("PGXC Node %s: object not defined",
 +                                              node_name)));
 +
 +      simple_heap_delete(relation, &tup->t_self);
 +
 +      ReleaseSysCache(tup);
 +
 +      heap_close(relation, RowExclusiveLock);
 +}
index 21f155f5f74c92921b690f22f716cb9d1519d1a8,0000000000000000000000000000000000000000..59c5d8e7c04a6ba9a9ba635ce5c4d7ab31879f03
mode 100644,000000..100644
--- /dev/null
@@@ -1,6483 -1,0 +1,6483 @@@
-                                                                                  &isnull,
-                                                                                  NULL);
 +/*-------------------------------------------------------------------------
 + *
 + * execRemote.c
 + *
 + *      Functions to execute commands on remote Datanodes
 + *
 + *
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
 + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
 + *
 + *
 + * IDENTIFICATION
 + *      src/backend/pgxc/pool/execRemote.c
 + *
 + *-------------------------------------------------------------------------
 + */
 +
 +#include <time.h>
 +#include "postgres.h"
 +#include "access/twophase.h"
 +#include "access/gtm.h"
 +#include "access/sysattr.h"
 +#include "access/transam.h"
 +#include "access/xact.h"
 +#include "access/relscan.h"
 +#include "catalog/pg_type.h"
 +#include "catalog/pgxc_node.h"
 +#include "commands/prepare.h"
 +#include "executor/executor.h"
 +#include "gtm/gtm_c.h"
 +#include "libpq/libpq.h"
 +#include "miscadmin.h"
 +#include "pgxc/execRemote.h"
 +#include "tcop/tcopprot.h"
 +#include "executor/nodeSubplan.h"
 +#include "nodes/nodeFuncs.h"
 +#include "pgstat.h"
 +#include "nodes/nodes.h"
 +#include "nodes/nodeFuncs.h"
 +#include "optimizer/var.h"
 +#include "pgxc/copyops.h"
 +#include "pgxc/nodemgr.h"
 +#include "pgxc/poolmgr.h"
 +#include "storage/ipc.h"
 +#include "storage/proc.h"
 +#include "utils/datum.h"
 +#include "utils/lsyscache.h"
 +#include "utils/memutils.h"
 +#include "utils/pg_rusage.h"
 +#include "utils/tuplesort.h"
 +#include "utils/snapmgr.h"
 +#include "utils/builtins.h"
 +#include "pgxc/locator.h"
 +#include "pgxc/pgxc.h"
 +#include "parser/parse_type.h"
 +#include "parser/parsetree.h"
 +#include "pgxc/xc_maintenance_mode.h"
 +
 +/*
 + * We do not want it too long, when query is terminating abnormally we just
 + * want to read in already available data, if datanode connection will reach a
 + * consistent state after that, we will go normal clean up procedure: send down
 + * ABORT etc., if data node is not responding we will signal pooler to drop
 + * the connection.
 + * It is better to drop and recreate datanode connection than wait for several
 + * seconds while it is being cleaned up when, for example, cancelling query.
 + */
 +#define END_QUERY_TIMEOUT     1000
 +
 +/* Declarations used by guc.c */
 +int PGXLRemoteFetchSize;
 +
 +typedef struct
 +{
 +      xact_callback function;
 +      void *fparams;
 +} abort_callback_type;
 +
 +/*
 + * Buffer size does not affect performance significantly, just do not allow
 + * connection buffer grows infinitely
 + */
 +#define COPY_BUFFER_SIZE 8192
 +#define PRIMARY_NODE_WRITEAHEAD 1024 * 1024
 +
 +/*
 + * Flag to track if a temporary object is accessed by the current transaction
 + */
 +static bool temp_object_included = false;
 +static abort_callback_type dbcleanup_info = { NULL, NULL };
 +
 +static int    pgxc_node_begin(int conn_count, PGXCNodeHandle ** connections,
 +                              GlobalTransactionId gxid, bool need_tran_block,
 +                              bool readOnly, char node_type);
 +
 +static PGXCNodeAllHandles *get_exec_connections(RemoteQueryState *planstate,
 +                                       ExecNodes *exec_nodes,
 +                                       RemoteQueryExecType exec_type,
 +                                       bool is_global_session);
 +
 +
 +static bool pgxc_start_command_on_connection(PGXCNodeHandle *connection,
 +                                      RemoteQueryState *remotestate, Snapshot snapshot);
 +
 +static void pgxc_node_remote_count(int *dnCount, int dnNodeIds[],
 +              int *coordCount, int coordNodeIds[]);
 +static char *pgxc_node_remote_prepare(char *prepareGID, bool localNode);
 +static bool pgxc_node_remote_finish(char *prepareGID, bool commit,
 +                                              char *nodestring, GlobalTransactionId gxid,
 +                                              GlobalTransactionId prepare_gxid);
 +static void pgxc_node_remote_commit(void);
 +static void pgxc_node_remote_abort(void);
 +static void pgxc_connections_cleanup(ResponseCombiner *combiner);
 +
 +static void pgxc_node_report_error(ResponseCombiner *combiner);
 +
 +#define REMOVE_CURR_CONN(combiner) \
 +      if ((combiner)->current_conn < --((combiner)->conn_count)) \
 +      { \
 +              (combiner)->connections[(combiner)->current_conn] = \
 +                              (combiner)->connections[(combiner)->conn_count]; \
 +      } \
 +      else \
 +              (combiner)->current_conn = 0
 +
 +#define MAX_STATEMENTS_PER_TRAN 10
 +
 +/* Variables to collect statistics */
 +static int    total_transactions = 0;
 +static int    total_statements = 0;
 +static int    total_autocommit = 0;
 +static int    nonautocommit_2pc = 0;
 +static int    autocommit_2pc = 0;
 +static int    current_tran_statements = 0;
 +static int *statements_per_transaction = NULL;
 +static int *nodes_per_transaction = NULL;
 +
 +/*
 + * statistics collection: count a statement
 + */
 +static void
 +stat_statement()
 +{
 +      total_statements++;
 +      current_tran_statements++;
 +}
 +
 +/*
 + * To collect statistics: count a transaction
 + */
 +static void
 +stat_transaction(int node_count)
 +{
 +      total_transactions++;
 +
 +      if (!statements_per_transaction)
 +      {
 +              statements_per_transaction = (int *) malloc((MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int));
 +              memset(statements_per_transaction, 0, (MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int));
 +      }
 +      if (current_tran_statements > MAX_STATEMENTS_PER_TRAN)
 +              statements_per_transaction[MAX_STATEMENTS_PER_TRAN]++;
 +      else
 +              statements_per_transaction[current_tran_statements]++;
 +      current_tran_statements = 0;
 +      if (node_count > 0 && node_count <= NumDataNodes)
 +      {
 +              if (!nodes_per_transaction)
 +              {
 +                      nodes_per_transaction = (int *) malloc(NumDataNodes * sizeof(int));
 +                      memset(nodes_per_transaction, 0, NumDataNodes * sizeof(int));
 +              }
 +              nodes_per_transaction[node_count - 1]++;
 +      }
 +}
 +
 +
 +/*
 + * Output collected statistics to the log
 + */
 +static void
 +stat_log()
 +{
 +      elog(DEBUG1, "Total Transactions: %d Total Statements: %d", total_transactions, total_statements);
 +      elog(DEBUG1, "Autocommit: %d 2PC for Autocommit: %d 2PC for non-Autocommit: %d",
 +               total_autocommit, autocommit_2pc, nonautocommit_2pc);
 +      if (total_transactions)
 +      {
 +              if (statements_per_transaction)
 +              {
 +                      int                     i;
 +
 +                      for (i = 0; i < MAX_STATEMENTS_PER_TRAN; i++)
 +                              elog(DEBUG1, "%d Statements per Transaction: %d (%d%%)",
 +                                       i, statements_per_transaction[i], statements_per_transaction[i] * 100 / total_transactions);
 +              }
 +              elog(DEBUG1, "%d+ Statements per Transaction: %d (%d%%)",
 +                       MAX_STATEMENTS_PER_TRAN, statements_per_transaction[MAX_STATEMENTS_PER_TRAN], statements_per_transaction[MAX_STATEMENTS_PER_TRAN] * 100 / total_transactions);
 +              if (nodes_per_transaction)
 +              {
 +                      int                     i;
 +
 +                      for (i = 0; i < NumDataNodes; i++)
 +                              elog(DEBUG1, "%d Nodes per Transaction: %d (%d%%)",
 +                                       i + 1, nodes_per_transaction[i], nodes_per_transaction[i] * 100 / total_transactions);
 +              }
 +      }
 +}
 +
 +
 +/*
 + * Create a structure to store parameters needed to combine responses from
 + * multiple connections as well as state information
 + */
 +void
 +InitResponseCombiner(ResponseCombiner *combiner, int node_count,
 +                                         CombineType combine_type)
 +{
 +      combiner->node_count = node_count;
 +      combiner->connections = NULL;
 +      combiner->conn_count = 0;
 +      combiner->combine_type = combine_type;
 +      combiner->command_complete_count = 0;
 +      combiner->request_type = REQUEST_TYPE_NOT_DEFINED;
 +      combiner->description_count = 0;
 +      combiner->copy_in_count = 0;
 +      combiner->copy_out_count = 0;
 +      combiner->copy_file = NULL;
 +      combiner->errorMessage = NULL;
 +      combiner->errorDetail = NULL;
 +      combiner->errorHint = NULL;
 +      combiner->tuple_desc = NULL;
 +      combiner->probing_primary = false;
 +      combiner->returning_node = InvalidOid;
 +      combiner->currentRow = NULL;
 +      combiner->rowBuffer = NIL;
 +      combiner->tapenodes = NULL;
 +      combiner->merge_sort = false;
 +      combiner->extended_query = false;
 +      combiner->tapemarks = NULL;
 +      combiner->tuplesortstate = NULL;
 +      combiner->cursor = NULL;
 +      combiner->update_cursor = NULL;
 +      combiner->cursor_count = 0;
 +      combiner->cursor_connections = NULL;
 +      combiner->remoteCopyType = REMOTE_COPY_NONE;
 +}
 +
 +
 +/*
 + * Parse out row count from the command status response and convert it to integer
 + */
 +static int
 +parse_row_count(const char *message, size_t len, uint64 *rowcount)
 +{
 +      int                     digits = 0;
 +      int                     pos;
 +
 +      *rowcount = 0;
 +      /* skip \0 string terminator */
 +      for (pos = 0; pos < len - 1; pos++)
 +      {
 +              if (message[pos] >= '0' && message[pos] <= '9')
 +              {
 +                      *rowcount = *rowcount * 10 + message[pos] - '0';
 +                      digits++;
 +              }
 +              else
 +              {
 +                      *rowcount = 0;
 +                      digits = 0;
 +              }
 +      }
 +      return digits;
 +}
 +
 +/*
 + * Convert RowDescription message to a TupleDesc
 + */
 +static TupleDesc
 +create_tuple_desc(char *msg_body, size_t len)
 +{
 +      TupleDesc       result;
 +      int             i, nattr;
 +      uint16          n16;
 +
 +      /* get number of attributes */
 +      memcpy(&n16, msg_body, 2);
 +      nattr = ntohs(n16);
 +      msg_body += 2;
 +
 +      result = CreateTemplateTupleDesc(nattr, false);
 +
 +      /* decode attributes */
 +      for (i = 1; i <= nattr; i++)
 +      {
 +              AttrNumber      attnum;
 +              char            *attname;
 +              char            *typname;
 +              Oid             oidtypeid;
 +              int32           typemode, typmod;
 +
 +              attnum = (AttrNumber) i;
 +
 +              /* attribute name */
 +              attname = msg_body;
 +              msg_body += strlen(attname) + 1;
 +
 +              /* type name */
 +              typname = msg_body;
 +              msg_body += strlen(typname) + 1;
 +
 +              /* table OID, ignored */
 +              msg_body += 4;
 +
 +              /* column no, ignored */
 +              msg_body += 2;
 +
 +              /* data type OID, ignored */
 +              msg_body += 4;
 +
 +              /* type len, ignored */
 +              msg_body += 2;
 +
 +              /* type mod */
 +              memcpy(&typemode, msg_body, 4);
 +              typmod = ntohl(typemode);
 +              msg_body += 4;
 +
 +              /* PGXCTODO text/binary flag? */
 +              msg_body += 2;
 +
 +              /* Get the OID type and mode type from typename */
 +              parseTypeString(typname, &oidtypeid, NULL, false);
 +
 +              TupleDescInitEntry(result, attnum, attname, oidtypeid, typmod, 0);
 +      }
 +      return result;
 +}
 +
 +/*
 + * Handle CopyOutCommandComplete ('c') message from a Datanode connection
 + */
 +static void
 +HandleCopyOutComplete(ResponseCombiner *combiner)
 +{
 +      if (combiner->request_type == REQUEST_TYPE_ERROR)
 +              return;
 +      if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
 +              combiner->request_type = REQUEST_TYPE_COPY_OUT;
 +      if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
 +              /* Inconsistent responses */
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                               errmsg("Unexpected response from the Datanodes for 'c' message, current request type %d", combiner->request_type)));
 +      /* Just do nothing, close message is managed by the Coordinator */
 +      combiner->copy_out_count++;
 +}
 +
 +/*
 + * Handle CommandComplete ('C') message from a Datanode connection
 + */
 +static void
 +HandleCommandComplete(ResponseCombiner *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn)
 +{
 +      int                     digits = 0;
 +      EState             *estate = combiner->ss.ps.state;
 +
 +      /*
 +       * If we did not receive description we are having rowcount or OK response
 +       */
 +      if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
 +              combiner->request_type = REQUEST_TYPE_COMMAND;
 +      /* Extract rowcount */
 +      if (combiner->combine_type != COMBINE_TYPE_NONE && estate)
 +      {
 +              uint64  rowcount;
 +              digits = parse_row_count(msg_body, len, &rowcount);
 +              if (digits > 0)
 +              {
 +                      /* Replicated write, make sure they are the same */
 +                      if (combiner->combine_type == COMBINE_TYPE_SAME)
 +                      {
 +                              if (combiner->command_complete_count)
 +                              {
 +                                      /*
 +                                      * Replicated command may succeed on one node and fail on
 +                                       * another. The example is if distributed table referenced
 +                                       * by a foreign key constraint defined on a partitioned
 +                                       * table. If command deletes rows from the replicated table
 +                                       * they may be referenced on one Datanode but not on other.
 +                                       * So, replicated command on each Datanode either affects
 +                                       * proper number of rows, or returns error. Here if
 +                                       * combiner got an error already, we allow to report it,
 +                                      * not the scary data corruption message.
 +                                       */
 +                                      if (combiner->errorMessage == NULL && rowcount != estate->es_processed)
 +                                              /* There is a consistency issue in the database with the replicated table */
 +                                              ereport(ERROR,
 +                                                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                                                               errmsg("Write to replicated table returned different results from the Datanodes")));
 +                              }
 +                              else
 +                                      /* first result */
 +                                      estate->es_processed = rowcount;
 +                      }
 +                      else
 +                              estate->es_processed += rowcount;
 +              }
 +              else
 +                      combiner->combine_type = COMBINE_TYPE_NONE;
 +      }
 +
 +      /* If response checking is enable only then do further processing */
 +      if (conn->ck_resp_rollback)
 +      {
 +              if (strcmp(msg_body, "ROLLBACK") == 0)
 +              {
 +                      /*
 +                       * Subsequent clean up routine will be checking this flag
 +                       * to determine nodes where to send ROLLBACK PREPARED.
 +                       * On current node PREPARE has failed and the two-phase record
 +                       * does not exist, so clean this flag as if PREPARE was not sent
 +                       * to that node and avoid erroneous command.
 +                       */
 +                      conn->ck_resp_rollback = false;
 +                      /*
 +                       * Set the error, if none, to force throwing.
 +                       * If there is error already, it will be thrown anyway, do not add
 +                       * this potentially confusing message
 +                       */
 +                      if (combiner->errorMessage == NULL)
 +                      {
 +                              MemoryContext oldcontext = MemoryContextSwitchTo(ErrorContext);
 +                              combiner->errorMessage =
 +                                                              pstrdup("unexpected ROLLBACK from remote node");
 +                              MemoryContextSwitchTo(oldcontext);
 +                              /*
 +                               * ERRMSG_PRODUCER_ERROR
 +                               * Messages with this code are replaced by others, if they are
 +                               * received, so if node will send relevant error message that
 +                               * one will be replaced.
 +                               */
 +                              combiner->errorCode[0] = 'X';
 +                              combiner->errorCode[1] = 'X';
 +                              combiner->errorCode[2] = '0';
 +                              combiner->errorCode[3] = '1';
 +                              combiner->errorCode[4] = '0';
 +                      }
 +              }
 +      }
 +      combiner->command_complete_count++;
 +}
 +
 +/*
 + * Handle RowDescription ('T') message from a Datanode connection
 + */
 +static bool
 +HandleRowDescription(ResponseCombiner *combiner, char *msg_body, size_t len)
 +{
 +      if (combiner->request_type == REQUEST_TYPE_ERROR)
 +              return false;
 +      if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
 +              combiner->request_type = REQUEST_TYPE_QUERY;
 +      if (combiner->request_type != REQUEST_TYPE_QUERY)
 +      {
 +              /* Inconsistent responses */
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                               errmsg("Unexpected response from the Datanodes for 'T' message, current request type %d", combiner->request_type)));
 +      }
 +      /* Increment counter and check if it was first */
 +      if (combiner->description_count++ == 0)
 +      {
 +              combiner->tuple_desc = create_tuple_desc(msg_body, len);
 +              return true;
 +      }
 +      return false;
 +}
 +
 +
 +/*
 + * Handle CopyInResponse ('G') message from a Datanode connection
 + */
 +static void
 +HandleCopyIn(ResponseCombiner *combiner)
 +{
 +      if (combiner->request_type == REQUEST_TYPE_ERROR)
 +              return;
 +      if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
 +              combiner->request_type = REQUEST_TYPE_COPY_IN;
 +      if (combiner->request_type != REQUEST_TYPE_COPY_IN)
 +      {
 +              /* Inconsistent responses */
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                               errmsg("Unexpected response from the Datanodes for 'G' message, current request type %d", combiner->request_type)));
 +      }
 +      /*
 +       * The normal PG code will output a G message when it runs in the
 +       * Coordinator, so do not proxy message here, just count it.
 +       */
 +      combiner->copy_in_count++;
 +}
 +
 +/*
 + * Handle CopyOutResponse ('H') message from a Datanode connection
 + */
 +static void
 +HandleCopyOut(ResponseCombiner *combiner)
 +{
 +      if (combiner->request_type == REQUEST_TYPE_ERROR)
 +              return;
 +      if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
 +              combiner->request_type = REQUEST_TYPE_COPY_OUT;
 +      if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
 +      {
 +              /* Inconsistent responses */
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                               errmsg("Unexpected response from the Datanodes for 'H' message, current request type %d", combiner->request_type)));
 +      }
 +      /*
 +       * The normal PG code will output an H message when it runs in the
 +       * Coordinator, so do not proxy message here, just count it.
 +       */
 +      combiner->copy_out_count++;
 +}
 +
 +/*
 + * Handle CopyOutDataRow ('d') message from a Datanode connection
 + */
 +static void
 +HandleCopyDataRow(ResponseCombiner *combiner, char *msg_body, size_t len)
 +{
 +      if (combiner->request_type == REQUEST_TYPE_ERROR)
 +              return;
 +      if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
 +              combiner->request_type = REQUEST_TYPE_COPY_OUT;
 +
 +      /* Inconsistent responses */
 +      if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                               errmsg("Unexpected response from the Datanodes for 'd' message, current request type %d", combiner->request_type)));
 +
 +      /* count the row */
 +      combiner->processed++;
 +
 +      /* Output remote COPY operation to correct location */
 +      switch (combiner->remoteCopyType)
 +      {
 +              case REMOTE_COPY_FILE:
 +                      /* Write data directly to file */
 +                      fwrite(msg_body, 1, len, combiner->copy_file);
 +                      break;
 +              case REMOTE_COPY_STDOUT:
 +                      /* Send back data to client */
 +                      pq_putmessage('d', msg_body, len);
 +                      break;
 +              case REMOTE_COPY_TUPLESTORE:
 +                      /*
 +                       * Do not store trailing \n character.
 +                       * When tuplestore data are loaded to a table it automatically
 +                       * inserts line ends.
 +                       */
 +                      tuplestore_putmessage(combiner->tuplestorestate, len-1, msg_body);
 +                      break;
 +              case REMOTE_COPY_NONE:
 +              default:
 +                      Assert(0); /* Should not happen */
 +      }
 +}
 +
 +/*
 + * Handle DataRow ('D') message from a Datanode connection
 + * The function returns true if data row is accepted and successfully stored
 + * within the combiner.
 + */
 +static bool
 +HandleDataRow(ResponseCombiner *combiner, char *msg_body, size_t len, Oid node)
 +{
 +      /* We expect previous message is consumed */
 +      Assert(combiner->currentRow == NULL);
 +
 +      if (combiner->request_type == REQUEST_TYPE_ERROR)
 +              return false;
 +
 +      if (combiner->request_type != REQUEST_TYPE_QUERY)
 +      {
 +              /* Inconsistent responses */
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                               errmsg("Unexpected response from the data nodes for 'D' message, current request type %d", combiner->request_type)));
 +      }
 +
 +      /*
 +       * If we got an error already ignore incoming data rows from other nodes
 +       * Still we want to continue reading until get CommandComplete
 +       */
 +      if (combiner->errorMessage)
 +              return false;
 +
 +      /*
 +       * Replicated INSERT/UPDATE/DELETE with RETURNING: receive only tuples
 +       * from one node, skip others as duplicates
 +       */
 +      if (combiner->combine_type == COMBINE_TYPE_SAME)
 +      {
 +              /* Do not return rows when probing primary, instead return when doing
 +               * first normal node. Just save some CPU and traffic in case if
 +               * probing fails.
 +               */
 +              if (combiner->probing_primary)
 +                      return false;
 +              if (OidIsValid(combiner->returning_node))
 +              {
 +                      if (combiner->returning_node != node)
 +                              return false;
 +              }
 +              else
 +                      combiner->returning_node = node;
 +      }
 +
 +      /*
 +       * We are copying message because it points into connection buffer, and
 +       * will be overwritten on next socket read
 +       */
 +      combiner->currentRow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + len);
 +      memcpy(combiner->currentRow->msg, msg_body, len);
 +      combiner->currentRow->msglen = len;
 +      combiner->currentRow->msgnode = node;
 +
 +      return true;
 +}
 +
 +/*
 + * Handle ErrorResponse ('E') message from a Datanode connection
 + */
 +static void
 +HandleError(ResponseCombiner *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn)
 +{
 +      /* parse error message */
 +      char *code = NULL;
 +      char *message = NULL;
 +      char *detail = NULL;
 +      char *hint = NULL;
 +      int   offset = 0;
 +
 +      /*
 +       * Scan until point to terminating \0
 +       */
 +      while (offset + 1 < len)
 +      {
 +              /* pointer to the field message */
 +              char *str = msg_body + offset + 1;
 +
 +              switch (msg_body[offset])
 +              {
 +                      case 'C':       /* code */
 +                              code = str;
 +                              break;
 +                      case 'M':       /* message */
 +                              message = str;
 +                              break;
 +                      case 'D':       /* details */
 +                              detail = str;
 +                              break;
 +
 +                      case 'H':       /* hint */
 +                              hint = str;
 +                              break;
 +
 +                      /* Fields not yet in use */
 +                      case 'S':       /* severity */
 +                      case 'R':       /* routine */
 +                      case 'P':       /* position string */
 +                      case 'p':       /* position int */
 +                      case 'q':       /* int query */
 +                      case 'W':       /* where */
 +                      case 'F':       /* file */
 +                      case 'L':       /* line */
 +                      default:
 +                              break;
 +              }
 +
 +              /* code, message and \0 */
 +              offset += strlen(str) + 2;
 +      }
 +
 +      /*
 +       * We may have special handling for some errors, default handling is to
 +       * throw out error with the same message. We can not ereport immediately
 +       * because we should read from this and other connections until
 +       * ReadyForQuery is received, so we just store the error message.
 +       * If multiple connections return errors only first one is reported.
 +       *
 +       * The producer error may be hiding primary error, so if previously received
 +       * error is a producer error allow it to be overwritten.
 +       */
 +      if (combiner->errorMessage == NULL ||
 +                      MAKE_SQLSTATE(combiner->errorCode[0], combiner->errorCode[1],
 +                                                combiner->errorCode[2], combiner->errorCode[3],
 +                                                combiner->errorCode[4]) == ERRCODE_PRODUCER_ERROR)
 +      {
 +              MemoryContext oldcontext = MemoryContextSwitchTo(ErrorContext);
 +              combiner->errorMessage = pstrdup(message);
 +              /* Error Code is exactly 5 significant bytes */
 +              if (code)
 +                      memcpy(combiner->errorCode, code, 5);
 +              if (detail)
 +                      combiner->errorDetail = pstrdup(detail);
 +              if (hint)
 +                      combiner->errorHint = pstrdup(hint);
 +              MemoryContextSwitchTo(oldcontext);
 +      }
 +
 +      /*
 +       * If the PREPARE TRANSACTION command fails for whatever reason, we don't
 +       * want to send down ROLLBACK PREPARED to this node. Otherwise, it may end
 +       * up rolling back an unrelated prepared transaction with the same GID as
 +       * used by this transaction
 +       */
 +      if (conn->ck_resp_rollback)
 +              conn->ck_resp_rollback = false;
 +
 +      /*
 +       * If Datanode have sent ErrorResponse it will never send CommandComplete.
 +       * Increment the counter to prevent endless waiting for it.
 +       */
 +      combiner->command_complete_count++;
 +}
 +
 +/*
 + * HandleCmdComplete -
 + *    combine deparsed sql statements execution results
 + *
 + * Input parameters:
 + *    commandType is dml command type
 + *    combineTag is used to combine the completion result
 + *    msg_body is execution result needed to combine
 + *    len is msg_body size
 + */
 +void
 +HandleCmdComplete(CmdType commandType, CombineTag *combine,
 +                                              const char *msg_body, size_t len)
 +{
 +      int     digits = 0;
 +      uint64  originrowcount = 0;
 +      uint64  rowcount = 0;
 +      uint64  total = 0;
 +
 +      if (msg_body == NULL)
 +              return;
 +
 +      /* if there's nothing in combine, just copy the msg_body */
 +      if (strlen(combine->data) == 0)
 +      {
 +              strcpy(combine->data, msg_body);
 +              combine->cmdType = commandType;
 +              return;
 +      }
 +      else
 +      {
 +              /* commandType is conflict */
 +              if (combine->cmdType != commandType)
 +                      return;
 +
 +              /* get the processed row number from msg_body */
 +              digits = parse_row_count(msg_body, len + 1, &rowcount);
 +              elog(DEBUG1, "digits is %d\n", digits);
 +              Assert(digits >= 0);
 +
 +              /* no need to combine */
 +              if (digits == 0)
 +                      return;
 +
 +              /* combine the processed row number */
 +              parse_row_count(combine->data, strlen(combine->data) + 1, &originrowcount);
 +              elog(DEBUG1, "originrowcount is %lu, rowcount is %lu\n", originrowcount, rowcount);
 +              total = originrowcount + rowcount;
 +
 +      }
 +
 +      /* output command completion tag */
 +      switch (commandType)
 +      {
 +              case CMD_SELECT:
 +                      strcpy(combine->data, "SELECT");
 +                      break;
 +              case CMD_INSERT:
 +                      snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
 +                         "INSERT %u %lu", 0, total);
 +                      break;
 +              case CMD_UPDATE:
 +                      snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
 +                                       "UPDATE %lu", total);
 +                      break;
 +              case CMD_DELETE:
 +                      snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
 +                                       "DELETE %lu", total);
 +                      break;
 +              default:
 +                      strcpy(combine->data, "");
 +                      break;
 +      }
 +
 +}
 +
 +/*
 + * HandleDatanodeCommandId ('M') message from a Datanode connection
 + */
 +static void
 +HandleDatanodeCommandId(ResponseCombiner *combiner, char *msg_body, size_t len)
 +{
 +      uint32          n32;
 +      CommandId       cid;
 +
 +      Assert(msg_body != NULL);
 +      Assert(len >= 2);
 +
 +      /* Get the command Id */
 +      memcpy(&n32, &msg_body[0], 4);
 +      cid = ntohl(n32);
 +
 +      /* If received command Id is higher than current one, set it to a new value */
 +      if (cid > GetReceivedCommandId())
 +              SetReceivedCommandId(cid);
 +}
 +
 +/*
 + * Record waited-for XIDs received from the remote nodes into the transaction
 + * state
 + */
 +static void
 +HandleWaitXids(char *msg_body, size_t len)
 +{
 +      int xid_count;
 +      uint32          n32;
 +      int cur;
 +      int i;
 +
 +      /* Get the xid count */
 +      xid_count = len / sizeof (TransactionId);
 +
 +      cur = 0;
 +      for (i = 0; i < xid_count; i++)
 +      {
 +              Assert(cur < len);
 +              memcpy(&n32, &msg_body[cur], sizeof (TransactionId));
 +              cur = cur + sizeof (TransactionId);
 +              TransactionRecordXidWait(ntohl(n32));
 +      }
 +}
 +
 +static void
 +HandleGlobalTransactionId(char *msg_body, size_t len)
 +{
 +      GlobalTransactionId xid;
 +
 +      Assert(len == sizeof (GlobalTransactionId));
 +      memcpy(&xid, &msg_body[0], sizeof (GlobalTransactionId));
 +
 +      SetTopTransactionId(xid);
 +}
 +
 +/*
 + * Examine the specified combiner state and determine if command was completed
 + * successfully
 + */
 +static bool
 +validate_combiner(ResponseCombiner *combiner)
 +{
 +      /* There was error message while combining */
 +      if (combiner->errorMessage)
 +              return false;
 +      /* Check if state is defined */
 +      if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
 +              return false;
 +
 +      /* Check all nodes completed */
 +      if ((combiner->request_type == REQUEST_TYPE_COMMAND
 +              || combiner->request_type == REQUEST_TYPE_QUERY)
 +              && combiner->command_complete_count != combiner->node_count)
 +              return false;
 +
 +      /* Check count of description responses */
 +      if (combiner->request_type == REQUEST_TYPE_QUERY
 +              && combiner->description_count != combiner->node_count)
 +              return false;
 +
 +      /* Check count of copy-in responses */
 +      if (combiner->request_type == REQUEST_TYPE_COPY_IN
 +              && combiner->copy_in_count != combiner->node_count)
 +              return false;
 +
 +      /* Check count of copy-out responses */
 +      if (combiner->request_type == REQUEST_TYPE_COPY_OUT
 +              && combiner->copy_out_count != combiner->node_count)
 +              return false;
 +
 +      /* Add other checks here as needed */
 +
 +      /* All is good if we are here */
 +      return true;
 +}
 +
 +/*
 + * Close combiner and free allocated memory, if it is not needed
 + */
 +void
 +CloseCombiner(ResponseCombiner *combiner)
 +{
 +      if (combiner->connections)
 +              pfree(combiner->connections);
 +      if (combiner->tuple_desc)
 +              FreeTupleDesc(combiner->tuple_desc);
 +      if (combiner->errorMessage)
 +              pfree(combiner->errorMessage);
 +      if (combiner->errorDetail)
 +              pfree(combiner->errorDetail);
 +      if (combiner->errorHint)
 +              pfree(combiner->errorHint);
 +      if (combiner->cursor_connections)
 +              pfree(combiner->cursor_connections);
 +      if (combiner->tapenodes)
 +              pfree(combiner->tapenodes);
 +      if (combiner->tapemarks)
 +              pfree(combiner->tapemarks);
 +}
 +
 +/*
 + * Validate combiner and release storage freeing allocated memory
 + */
 +static bool
 +ValidateAndCloseCombiner(ResponseCombiner *combiner)
 +{
 +      bool            valid = validate_combiner(combiner);
 +
 +      CloseCombiner(combiner);
 +
 +      return valid;
 +}
 +
 +/*
 + * It is possible if multiple steps share the same Datanode connection, when
 + * executor is running multi-step query or client is running multiple queries
 + * using Extended Query Protocol. After returning next tuple ExecRemoteQuery
 + * function passes execution control to the executor and then it can be given
 + * to the same RemoteQuery or to different one. It is possible that before
 + * returning a tuple the function do not read all Datanode responses. In this
 + * case pending responses should be read in context of original RemoteQueryState
 + * till ReadyForQuery message and data rows should be stored (buffered) to be
 + * available when fetch from that RemoteQueryState is requested again.
 + * BufferConnection function does the job.
 + * If a RemoteQuery is going to use connection it should check connection state.
 + * DN_CONNECTION_STATE_QUERY indicates query has data to read and combiner
 + * points to the original RemoteQueryState. If combiner differs from "this" the
 + * connection should be buffered.
 + */
 +void
 +BufferConnection(PGXCNodeHandle *conn)
 +{
 +      ResponseCombiner *combiner = conn->combiner;
 +      MemoryContext oldcontext;
 +
 +      /* Nothing to buffer unless the connection is mid-query on behalf of a combiner */
 +      if (combiner == NULL || conn->state != DN_CONNECTION_STATE_QUERY)
 +              return;
 +
 +      elog(DEBUG2, "Buffer connection %u to step %s", conn->nodeoid, combiner->cursor);
 +
 +      /*
 +       * When BufferConnection is invoked CurrentContext is related to other
 +       * portal, which is trying to control the connection.
 +       * TODO See if we can find better context to switch to
 +       */
 +      oldcontext = MemoryContextSwitchTo(combiner->ss.ps.ps_ResultTupleSlot->tts_mcxt);
 +
 +      /* Verify the connection is in use by the combiner */
 +      combiner->current_conn = 0;
 +      while (combiner->current_conn < combiner->conn_count)
 +      {
 +              if (combiner->connections[combiner->current_conn] == conn)
 +                      break;
 +              combiner->current_conn++;
 +      }
 +      Assert(combiner->current_conn < combiner->conn_count);
 +
 +      /* Lazily allocate one bookmark slot per connection */
 +      if (combiner->tapemarks == NULL)
 +              combiner->tapemarks = (ListCell**) palloc0(combiner->conn_count * sizeof(ListCell*));
 +
 +      /*
 +       * If current bookmark for the current tape is not set it means either
 +       * first row in the buffer is from the current tape or no rows from
 +       * the tape in the buffer, so if first row is not from current
 +       * connection bookmark the last cell in the list.
 +       */
 +      if (combiner->tapemarks[combiner->current_conn] == NULL &&
 +                      list_length(combiner->rowBuffer) > 0)
 +      {
 +              RemoteDataRow dataRow = (RemoteDataRow) linitial(combiner->rowBuffer);
 +              if (dataRow->msgnode != conn->nodeoid)
 +                      combiner->tapemarks[combiner->current_conn] = list_tail(combiner->rowBuffer);
 +      }
 +
 +      /*
 +       * Buffer data rows until data node return number of rows specified by the
 +       * fetch_size parameter of last Execute message (PortalSuspended message)
 +       * or end of result set is reached (CommandComplete message)
 +       */
 +      while (true)
 +      {
 +              int res;
 +
 +              /* Move to buffer currentRow (received from the data node) */
 +              if (combiner->currentRow)
 +              {
 +                      combiner->rowBuffer = lappend(combiner->rowBuffer,
 +                                                                                combiner->currentRow);
 +                      combiner->currentRow = NULL;
 +              }
 +
 +              res = handle_response(conn, combiner);
 +              /*
 +               * If response message is a DataRow it will be handled on the next
 +               * iteration.
 +               * PortalSuspended will cause connection state change and break the loop
 +               * The same is for CommandComplete, but we need additional handling -
 +               * remove connection from the list of active connections.
 +               * We may need to add handling error response
 +               */
 +
 +              /* Most often result check first */
 +              if (res == RESPONSE_DATAROW)
 +              {
 +                      /*
 +                       * The row is in the combiner->currentRow, on next iteration it will
 +                       * be moved to the buffer
 +                       */
 +                      continue;
 +              }
 +
 +              /* incomplete message, read more */
 +              if (res == RESPONSE_EOF)
 +              {
 +                      if (pgxc_node_receive(1, &conn, NULL))
 +                      {
 +                              /*
 +                               * NOTE(review): on receive failure the connection is marked
 +                               * fatal and an error message recorded, but the loop is not
 +                               * exited here -- presumably a subsequent handle_response()
 +                               * call surfaces the error; confirm this cannot spin forever.
 +                               */
 +                              PGXCNodeSetConnectionState(conn,
 +                                              DN_CONNECTION_STATE_ERROR_FATAL);
 +                              add_error_message(conn, "Failed to fetch from data node");
 +                      }
 +              }
 +
 +              /*
 +               * End of result set is reached, so either set the pointer to the
 +               * connection to NULL (combiner with sort) or remove it from the list
 +               * (combiner without sort)
 +               */
 +              else if (res == RESPONSE_COMPLETE)
 +              {
 +                      /*
 +                       * If combiner is doing merge sort we should set reference to the
 +                       * current connection to NULL in the array, indicating the end
 +                       * of the tape is reached. FetchTuple will try to access the buffer
 +                       * first anyway.
 +                       * Since we remove that reference we can not determine what node
 +                       * number was this connection, but we need this info to find proper
 +                       * tuple in the buffer if we are doing merge sort. So store node
 +                       * number in special array.
 +                       * NB: We can not test if combiner->tuplesortstate is set here:
 +                       * connection may require buffering inside tuplesort_begin_merge
 +                       * - while pre-read rows from the tapes, one of the tapes may be
 +                       * the local connection with RemoteSubplan in the tree. The
 +                       * combiner->tuplesortstate is set only after tuplesort_begin_merge
 +                       * returns.
 +                       */
 +                      if (combiner->merge_sort)
 +                      {
 +                              combiner->connections[combiner->current_conn] = NULL;
 +                              if (combiner->tapenodes == NULL)
 +                                      combiner->tapenodes = (Oid *)
 +                                                      palloc0(combiner->conn_count * sizeof(Oid));
 +                              combiner->tapenodes[combiner->current_conn] = conn->nodeoid;
 +                      }
 +                      else
 +                      {
 +                              /* Remove current connection, move last in-place, adjust current_conn */
 +                              if (combiner->current_conn < --combiner->conn_count)
 +                                      combiner->connections[combiner->current_conn] = combiner->connections[combiner->conn_count];
 +                              else
 +                                      combiner->current_conn = 0;
 +                      }
 +                      /*
 +                       * If combiner runs Simple Query Protocol we need to read in
 +                       * ReadyForQuery. In case of Extended Query Protocol it is not
 +                       * sent and we should quit.
 +                       */
 +                      if (combiner->extended_query)
 +                              break;
 +              }
 +              else if (res == RESPONSE_ERROR)
 +              {
 +                      if (combiner->extended_query)
 +                      {
 +                              /*
 +                               * Need to sync connection to enable receiving commands
 +                               * by the datanode
 +                               */
 +                              if (pgxc_node_send_sync(conn) != 0)
 +                              {
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Failed to sync msg to node %u", conn->nodeoid)));
 +                              }
 +                      }
 +              }
 +              else if (res == RESPONSE_SUSPENDED || res == RESPONSE_READY)
 +              {
 +                      /* Now it is OK to quit */
 +                      break;
 +              }
 +      }
 +      Assert(conn->state != DN_CONNECTION_STATE_QUERY);
 +      MemoryContextSwitchTo(oldcontext);
 +      /* Detach the combiner; this connection no longer feeds it directly */
 +      conn->combiner = NULL;
 +}
 +
 +/*
 + * copy the datarow from combiner to the given slot, in the slot's memory
 + * context
 + */
 +static void
 +CopyDataRowTupleToSlot(ResponseCombiner *combiner, TupleTableSlot *slot)
 +{
 +      RemoteDataRow   datarow;
 +      MemoryContext   oldcontext;
 +      oldcontext = MemoryContextSwitchTo(slot->tts_mcxt);
 +      datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + combiner->currentRow->msglen);
 +      datarow->msgnode = combiner->currentRow->msgnode;
 +      datarow->msglen = combiner->currentRow->msglen;
 +      memcpy(datarow->msg, combiner->currentRow->msg, datarow->msglen);
 +      ExecStoreDataRowTuple(datarow, slot, true);
 +      pfree(combiner->currentRow);
 +      combiner->currentRow = NULL;
 +      MemoryContextSwitchTo(oldcontext);
 +}
 +
 +
 +/*
 + * FetchTuple
 + *
 +              Get next tuple from one of the datanode connections.
 + * The connections should be in combiner->connections, if "local" dummy
 + * connection presents it should be the last active connection in the array.
 + *      If combiner is set up to perform merge sort function returns tuple from
 + * connection defined by combiner->current_conn, or NULL slot if no more tuple
 + * are available from the connection. Otherwise it returns tuple from any
 + * connection or NULL slot if no more available connections.
 + *            Function looks into combiner->rowBuffer before accessing connection
 + * and return a tuple from there if found.
 + *            Function may wait while more data arrive from the data nodes. If there
 + * is a locally executed subplan function advance it and buffer resulting rows
 + * instead of waiting.
 + */
 +TupleTableSlot *
 +FetchTuple(ResponseCombiner *combiner)
 +{
 +      PGXCNodeHandle *conn;
 +      TupleTableSlot *slot;
 +      Oid                     nodeOid = -1;
 +
 +      /*
 +       * Case if we run local subplan.
 +       * We do not have remote connections, so just get local tuple and return it
 +       */
 +      if (outerPlanState(combiner))
 +      {
 +              RemoteSubplanState *planstate = (RemoteSubplanState *) combiner;
 +              RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
 +              /* Advance subplan in a loop until we have something to return */
 +              for (;;)
 +              {
 +                      Datum   value = (Datum) 0;
 +                      bool    isnull = false;
 +                      int     numnodes;
 +                      int             i;
 +
 +                      slot = ExecProcNode(outerPlanState(combiner));
 +                      /* If locator is not defined deliver all the results */
 +                      if (planstate->locator == NULL)
 +                              return slot;
 +
 +                      /*
 +                       * If NULL tuple is returned we done with the subplan, finish it up and
 +                       * return NULL
 +                       */
 +                      if (TupIsNull(slot))
 +                              return NULL;
 +
 +                      /* Get partitioning value if defined */
 +                      if (plan->distributionKey != InvalidAttrNumber)
 +                              value = slot_getattr(slot, plan->distributionKey, &isnull);
 +
 +                      /* Determine target nodes */
 +                      numnodes = GET_NODES(planstate->locator, value, isnull, NULL);
 +                      for (i = 0; i < numnodes; i++)
 +                      {
 +                              /* Deliver the node */
 +                              if (planstate->dest_nodes[i] == PGXCNodeId-1)
 +                                      return slot;
 +                      }
 +              }
 +      }
 +
 +      /*
 +       * Get current connection
 +       */
 +      if (combiner->conn_count > combiner->current_conn)
 +              conn = combiner->connections[combiner->current_conn];
 +      else
 +              conn = NULL;
 +
 +      /*
 +       * If doing merge sort determine the node number.
 +       * It may be needed to get buffered row.
 +       */
 +      if (combiner->merge_sort)
 +      {
 +              Assert(conn || combiner->tapenodes);
 +              nodeOid = conn ? conn->nodeoid :
 +                                               combiner->tapenodes[combiner->current_conn];
 +              Assert(OidIsValid(nodeOid));
 +      }
 +
 +      /*
 +       * First look into the row buffer.
 +       * When we are performing merge sort we need to get from the buffer record
 +       * from the connection marked as "current". Otherwise get first.
 +       */
 +      if (list_length(combiner->rowBuffer) > 0)
 +      {
 +              RemoteDataRow dataRow;
 +
 +              Assert(combiner->currentRow == NULL);
 +
 +              if (combiner->merge_sort)
 +              {
 +                      ListCell *lc;
 +                      ListCell *prev;
 +
 +                      elog(DEBUG1, "Getting buffered tuple from node %x", nodeOid);
 +
 +                      prev = combiner->tapemarks[combiner->current_conn];
 +                      if (prev)
 +                      {
 +                              /*
 +                               * Start looking through the list from the bookmark.
 +                               * Probably the first cell we check contains row from the needed
 +                               * node. Otherwise continue scanning until we encounter one,
 +                               * advancing prev pointer as well.
 +                               */
 +                              while((lc = lnext(prev)) != NULL)
 +                              {
 +                                      dataRow = (RemoteDataRow) lfirst(lc);
 +                                      if (dataRow->msgnode == nodeOid)
 +                                      {
 +                                              combiner->currentRow = dataRow;
 +                                              break;
 +                                      }
 +                                      prev = lc;
 +                              }
 +                      }
 +                      else
 +                      {
 +                              /*
 +                               * Either needed row is the first in the buffer or no such row
 +                               */
 +                              lc = list_head(combiner->rowBuffer);
 +                              dataRow = (RemoteDataRow) lfirst(lc);
 +                              if (dataRow->msgnode == nodeOid)
 +                                      combiner->currentRow = dataRow;
 +                              else
 +                                      lc = NULL;
 +                      }
 +                      if (lc)
 +                      {
 +                              /*
 +                               * Delete cell from the buffer. Before we delete we must check
 +                               * the bookmarks, if the cell is a bookmark for any tape.
 +                               * If it is the case we are deleting last row of the current
 +                               * block from the current tape. That tape should have bookmark
 +                               * like current, and current bookmark will be advanced when we
 +                               * read the tape once again.
 +                               */
 +                              int i;
 +                              for (i = 0; i < combiner->conn_count; i++)
 +                              {
 +                                      if (combiner->tapemarks[i] == lc)
 +                                              combiner->tapemarks[i] = prev;
 +                              }
 +                              elog(DEBUG1, "Found buffered tuple from node %x", nodeOid);
 +                              combiner->rowBuffer = list_delete_cell(combiner->rowBuffer,
 +                                                                                                         lc, prev);
 +                      }
 +                      elog(DEBUG1, "Update tapemark");
 +                      combiner->tapemarks[combiner->current_conn] = prev;
 +              }
 +              else
 +              {
 +                      dataRow = (RemoteDataRow) linitial(combiner->rowBuffer);
 +                      combiner->currentRow = dataRow;
 +                      combiner->rowBuffer = list_delete_first(combiner->rowBuffer);
 +              }
 +      }
 +
 +      /* If we have node message in the currentRow slot, and it is from a proper
 +       * node, consume it.  */
 +      if (combiner->currentRow)
 +      {
 +              Assert(!combiner->merge_sort ||
 +                         combiner->currentRow->msgnode == nodeOid);
 +              slot = combiner->ss.ps.ps_ResultTupleSlot;
 +              CopyDataRowTupleToSlot(combiner, slot);
 +              return slot;
 +      }
 +
 +      while (conn)
 +      {
 +              int res;
 +
 +              /* Going to use a connection, buffer it if needed */
 +              CHECK_OWNERSHIP(conn, combiner);
 +
 +              /*
 +               * If current connection is idle it means portal on the data node is
 +               * suspended. Request more and try to get it
 +               */
 +              if (combiner->extended_query &&
 +                              conn->state == DN_CONNECTION_STATE_IDLE)
 +              {
 +                      /*
 +                       * We do not allow to suspend if querying primary node, so that
 +                       * only may mean the current node is secondary and subplan was not
 +                       * executed there yet. Return and go on with second phase.
 +                       */
 +                      if (combiner->probing_primary)
 +                      {
 +                              return NULL;
 +                      }
 +
 +                      if (pgxc_node_send_execute(conn, combiner->cursor, PGXLRemoteFetchSize) != 0)
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
 +                      }
 +
 +                      if (pgxc_node_send_flush(conn) != 0)
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid)));
 +                      }
 +
 +                      if (pgxc_node_receive(1, &conn, NULL))
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed receive data from node %u cursor '%s'", conn->nodeoid, combiner->cursor)));
 +                      }
 +              }
 +
 +              /* read messages */
 +              res = handle_response(conn, combiner);
 +              if (res == RESPONSE_DATAROW)
 +              {
 +                      slot = combiner->ss.ps.ps_ResultTupleSlot;
 +                      CopyDataRowTupleToSlot(combiner, slot);
 +                      return slot;
 +              }
 +              else if (res == RESPONSE_EOF)
 +              {
 +                      /* incomplete message, read more */
 +                      if (pgxc_node_receive(1, &conn, NULL))
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to receive more data from data node %u", conn->nodeoid)));
 +                      continue;
 +              }
 +              else if (res == RESPONSE_SUSPENDED)
 +              {
 +                      /*
 +                       * If we are doing merge sort or probing primary node we should
 +                       * remain on the same node, so query next portion immediately.
 +                       * Otherwise leave node suspended and fetch lazily.
 +                       */
 +                      if (combiner->merge_sort || combiner->probing_primary)
 +                      {
 +                              if (pgxc_node_send_execute(conn, combiner->cursor, PGXLRemoteFetchSize) != 0)
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
 +                              if (pgxc_node_send_flush(conn) != 0)
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid)));
 +                              if (pgxc_node_receive(1, &conn, NULL))
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Failed receive node from node %u cursor '%s'", conn->nodeoid, combiner->cursor)));
 +                              continue;
 +                      }
 +
 +                      /*
 +                       * Tell the node to fetch data in background, next loop when we 
 +                       * pgxc_node_receive, data is already there, so we can run faster
 +                       * */
 +                      if (pgxc_node_send_execute(conn, combiner->cursor, PGXLRemoteFetchSize) != 0)
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
 +                      }
 +
 +                      if (pgxc_node_send_flush(conn) != 0)
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid)));
 +                      }
 +
 +                      if (++combiner->current_conn >= combiner->conn_count)
 +                              combiner->current_conn = 0;
 +                      conn = combiner->connections[combiner->current_conn];
 +              }
 +              else if (res == RESPONSE_COMPLETE)
 +              {
 +                      /*
 +                       * In case of Simple Query Protocol we should receive ReadyForQuery
 +                       * before removing connection from the list. In case of Extended
 +                       * Query Protocol we may remove connection right away.
 +                       */
 +                      if (combiner->extended_query)
 +                      {
 +                              /* If we are doing merge sort clean current connection and return
 +                               * NULL, otherwise remove current connection, move last in-place,
 +                               * adjust current_conn and continue if it is not last connection */
 +                              if (combiner->merge_sort)
 +                              {
 +                                      combiner->connections[combiner->current_conn] = NULL;
 +                                      return NULL;
 +                              }
 +                              REMOVE_CURR_CONN(combiner);
 +                              if (combiner->conn_count > 0)
 +                                      conn = combiner->connections[combiner->current_conn];
 +                              else
 +                                      return NULL;
 +                      }
 +              }
 +              else if (res == RESPONSE_ERROR)
 +              {
 +                      /*
 +                       * If doing Extended Query Protocol we need to sync connection,
 +                       * otherwise subsequent commands will be ignored.
 +                       */
 +                      if (combiner->extended_query)
 +                      {
 +                              if (pgxc_node_send_sync(conn) != 0)
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Failed to sync msg to node %u", conn->nodeoid)));
 +                      }
 +                      /*
 +                       * Do not wait for response from primary, it needs to wait
 +                       * for other nodes to respond. Instead go ahead and send query to
 +                       * other nodes. It will fail there, but we can continue with
 +                       * normal cleanup.
 +                       */
 +                      if (combiner->probing_primary)
 +                      {
 +                              REMOVE_CURR_CONN(combiner);
 +                              return NULL;
 +                      }
 +              }
 +              else if (res == RESPONSE_READY)
 +              {
 +                      /* If we are doing merge sort clean current connection and return
 +                       * NULL, otherwise remove current connection, move last in-place,
 +                       * adjust current_conn and continue if it is not last connection */
 +                      if (combiner->merge_sort)
 +                      {
 +                              combiner->connections[combiner->current_conn] = NULL;
 +                              return NULL;
 +                      }
 +                      REMOVE_CURR_CONN(combiner);
 +                      if (combiner->conn_count > 0)
 +                              conn = combiner->connections[combiner->current_conn];
 +                      else
 +                              return NULL;
 +              }
 +              else if (res == RESPONSE_TUPDESC)
 +              {
 +                      ExecSetSlotDescriptor(combiner->ss.ps.ps_ResultTupleSlot,
 +                                                                combiner->tuple_desc);
 +                      /* Now slot is responsible for freeng the descriptor */
 +                      combiner->tuple_desc = NULL;
 +              }
 +              else if (res == RESPONSE_ASSIGN_GXID)
 +              {
 +                      /* Do nothing. It must have been handled in handle_response() */
 +              }
 +              else if (res == RESPONSE_WAITXIDS)
 +              {
 +                      /* Do nothing. It must have been handled in handle_response() */
 +              }
 +              else
 +              {
 +                      // Can not get here?
 +                      Assert(false);
 +              }
 +      }
 +
 +      return NULL;
 +}
 +
 +
 +/*
 + * Handle responses from the Datanode connections
 + *
 + * Drain messages from every connection in 'connections' until each one has
 + * finished its current request: ReadyForQuery, a completion on a connection
 + * already in fatal state, or the start of a COPY.  Per-message decoding is
 + * delegated to handle_response(); results accumulate in the shared
 + * 'combiner'.
 + *
 + * Returns 0 on success, or EOF if reading from the sockets failed.
 + */
 +static int
 +pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections,
 +                                               struct timeval * timeout, ResponseCombiner *combiner)
 +{
 +      int                     count = conn_count;
 +      /*
 +       * Variable-length array sized by the caller-supplied count.
 +       * NOTE(review): a zero-length VLA is undefined behavior; callers appear
 +       * to guard against conn_count == 0 — confirm.
 +       */
 +      PGXCNodeHandle *to_receive[conn_count];
 +
 +      /* make a copy of the pointers to the connections */
 +      memcpy(to_receive, connections, conn_count * sizeof(PGXCNodeHandle *));
 +
 +      /*
 +       * Read results.
 +       * Note we try and read from Datanode connections even if there is an error on one,
 +       * so as to avoid reading incorrect results on the next statement.
 +       * Other safeguards exist to avoid this, however.
 +       */
 +      while (count > 0)
 +      {
 +              int i = 0;
 +
 +              /* Block until data arrives on any still-tracked connection */
 +              if (pgxc_node_receive(count, to_receive, timeout))
 +                      return EOF;
 +              while (i < count)
 +              {
 +                      int result =  handle_response(to_receive[i], combiner);
 +                      elog(DEBUG5, "Received response %d on connection to node %s",
 +                                      result, to_receive[i]->nodename);
 +                      switch (result)
 +                      {
 +                              case RESPONSE_EOF: /* have something to read, keep receiving */
 +                                      i++;
 +                                      break;
 +                              case RESPONSE_COMPLETE:
 +                                      if (to_receive[i]->state != DN_CONNECTION_STATE_ERROR_FATAL)
 +                                              /* Continue read until ReadyForQuery */
 +                                              break;
 +                                      /* fallthru */
 +                              case RESPONSE_READY:
 +                                      /* fallthru */
 +                              case RESPONSE_COPY:
 +                                      /* Handling is done, do not track this connection */
 +                                      count--;
 +                                      /* Move last connection in place */
 +                                      if (i < count)
 +                                              to_receive[i] = to_receive[count];
 +                                      break;
 +                              case RESPONSE_ERROR:
 +                                      /* no handling needed, just wait for ReadyForQuery */
 +                                      break;
 +
 +                              case RESPONSE_WAITXIDS:
 +                              case RESPONSE_ASSIGN_GXID:
 +                              case RESPONSE_TUPDESC:
 +                                      /* Already consumed inside handle_response(); keep reading */
 +                                      break;
 +
 +                              case RESPONSE_DATAROW:
 +                                      /*
 +                                       * NOTE(review): the buffered row is dropped by resetting
 +                                       * currentRow — presumably callers of this routine never
 +                                       * expect data rows, and the row memory is owned by a
 +                                       * memory context; confirm no leak here.
 +                                       */
 +                                      combiner->currentRow = NULL;
 +                                      break;
 +
 +                              default:
 +                                      /* Inconsistent responses */
 +                                      add_error_message(to_receive[i], "Unexpected response from the Datanodes");
 +                                      elog(DEBUG1, "Unexpected response from the Datanodes, result = %d, request type %d", result, combiner->request_type);
 +                                      /* Stop tracking and move last connection in place */
 +                                      count--;
 +                                      if (i < count)
 +                                              to_receive[i] = to_receive[count];
 +                      }
 +              }
 +      }
 +
 +      return 0;
 +}
 +
 +/*
 + * Read next message from the connection and update the combiner
 + * and connection state accordingly
 + * If we are in an error state we just consume the messages, and do not proxy
 + * Long term, we should look into cancelling executing statements
 + * and closing the connections.
 + * It returns if states need to be handled
 + * Return values:
 + * RESPONSE_EOF - need to receive more data for the connection
 + * RESPONSE_READY - got ReadyForQuery
 + * RESPONSE_COMPLETE - done with the connection, but not yet ready for query.
 + * Also this result is output in case of error
 + * RESPONSE_SUSPENDED - got PortalSuspended
 + * RESPONSE_TUPDESC - got tuple description
 + * RESPONSE_DATAROW - got data row
 + * RESPONSE_COPY - got copy response
 + * RESPONSE_BARRIER_OK - barrier command completed successfully
 + * RESPONSE_ERROR - got ErrorResponse
 + * RESPONSE_WAITXIDS - got list of transaction ids to wait for
 + * RESPONSE_ASSIGN_GXID - got a global transaction id assignment
 + */
 +int
 +handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner)
 +{
 +      char       *msg;
 +      int                     msg_len;
 +      char            msg_type;
 +
 +      /* Loop over buffered messages until one produces a definitive result */
 +      for (;;)
 +      {
 +              /*
 +               * If we are in the process of shutting down, we
 +               * may be rolling back, and the buffer may contain other messages.
 +               * We want to avoid a procarray exception
 +               * as well as an error stack overflow.
 +               */
 +              if (proc_exit_inprogress)
 +                      PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL);
 +
 +              /*
 +               * Don't read from the connection if there is a fatal error.
 +               * We still return RESPONSE_COMPLETE, not RESPONSE_ERROR, since
 +               * Handling of RESPONSE_ERROR assumes sending SYNC message, but
 +               * State DN_CONNECTION_STATE_ERROR_FATAL indicates connection is
 +               * not usable.
 +               */
 +              if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
 +                      return RESPONSE_COMPLETE;
 +
 +              /* No data available, exit */
 +              if (!HAS_MESSAGE_BUFFERED(conn))
 +                      return RESPONSE_EOF;
 +
 +              Assert(conn->combiner == combiner || conn->combiner == NULL);
 +
 +              /* TODO handle other possible responses */
 +              msg_type = get_message(conn, &msg_len, &msg);
 +              elog(DEBUG5, "handle_response - received message %c, node %s, "
 +                              "current_state %d", msg_type, conn->nodename, conn->state);
 +              /* Message type bytes follow the frontend/backend protocol */
 +              switch (msg_type)
 +              {
 +                      case '\0':                      /* Not enough data in the buffer */
 +                              return RESPONSE_EOF;
 +                      case 'c':                       /* CopyToCommandComplete */
 +                              HandleCopyOutComplete(combiner);
 +                              break;
 +                      case 'C':                       /* CommandComplete */
 +                              HandleCommandComplete(combiner, msg, msg_len, conn);
 +                              conn->combiner = NULL;
 +                              /* 
 +                               * In case of simple query protocol, wait for the ReadyForQuery
 +                               * before marking connection as Idle; with the extended protocol
 +                               * the connection can go idle right after CommandComplete.
 +                               */
 +                              if (combiner->extended_query &&
 +                                      conn->state == DN_CONNECTION_STATE_QUERY)
 +                                      PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
 +                              return RESPONSE_COMPLETE;
 +                      case 'T':                       /* RowDescription */
 +#ifdef DN_CONNECTION_DEBUG
 +                              Assert(!conn->have_row_desc);
 +                              conn->have_row_desc = true;
 +#endif
 +                              if (HandleRowDescription(combiner, msg, msg_len))
 +                                      return RESPONSE_TUPDESC;
 +                              break;
 +                      case 'D':                       /* DataRow */
 +#ifdef DN_CONNECTION_DEBUG
 +                              Assert(conn->have_row_desc);
 +#endif
 +                              /* Do not return if data row has not been actually handled */
 +                              if (HandleDataRow(combiner, msg, msg_len, conn->nodeoid))
 +                                      return RESPONSE_DATAROW;
 +                              break;
 +                      case 's':                       /* PortalSuspended */
 +                              /* No activity is expected on the connection until next query */
 +                              PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
 +                              return RESPONSE_SUSPENDED;
 +                      case '1': /* ParseComplete */
 +                      case '2': /* BindComplete */
 +                      case '3': /* CloseComplete */
 +                      case 'n': /* NoData */
 +                              /* simple notifications, continue reading */
 +                              break;
 +                      case 'G': /* CopyInResponse */
 +                              PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_COPY_IN);
 +                              HandleCopyIn(combiner);
 +                              /* Done, return to caller to let it know the data can be passed in */
 +                              return RESPONSE_COPY;
 +                      case 'H': /* CopyOutResponse */
 +                              PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_COPY_OUT);
 +                              HandleCopyOut(combiner);
 +                              return RESPONSE_COPY;
 +                      case 'd': /* CopyOutDataRow */
 +                              PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_COPY_OUT);
 +                              HandleCopyDataRow(combiner, msg, msg_len);
 +                              break;
 +                      case 'E':                       /* ErrorResponse */
 +                              HandleError(combiner, msg, msg_len, conn);
 +                              add_error_message(conn, combiner->errorMessage);
 +                              /*
 +                               * In case the remote node was running an extended query
 +                               * protocol and reported an error, it will keep ignoring all
 +                               * subsequent commands until it sees a SYNC message. So make
 +                               * sure that we send down SYNC even before sending a ROLLBACK
 +                               * command
 +                               */
 +                              if (conn->in_extended_query)
 +                                      conn->needSync = true;
 +                              return RESPONSE_ERROR;
 +                      case 'A':                       /* NotificationResponse */
 +                      case 'N':                       /* NoticeResponse */
 +                      case 'S':                       /* SetCommandComplete */
 +                              /*
 +                               * Ignore these to prevent multiple messages, one from each
 +                               * node. Coordinator will send one for DDL anyway
 +                               */
 +                              break;
 +                      case 'Z':                       /* ReadyForQuery */
 +                      {
 +                              /*
 +                               * Return result depends on previous connection state.
 +                               * If it was PORTAL_SUSPENDED Coordinator want to send down
 +                               * another EXECUTE to fetch more rows, otherwise it is done
 +                               * with the connection
 +                               */
 +                              /* first byte of the payload is the transaction status: I/T/E */
 +                              conn->transaction_status = msg[0];
 +                              PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
 +                              conn->combiner = NULL;
 +#ifdef DN_CONNECTION_DEBUG
 +                              conn->have_row_desc = false;
 +#endif
 +                              return RESPONSE_READY;
 +                      }
 +                      case 'M':                       /* Command Id */
 +                              HandleDatanodeCommandId(combiner, msg, msg_len);
 +                              break;
 +                      case 'b':                       /* Barrier completed */
 +                              PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
 +                              return RESPONSE_BARRIER_OK;
 +                      case 'I':                       /* EmptyQuery */
 +                              return RESPONSE_COMPLETE;
 +                      case 'W':                       /* Wait for transaction ids */
 +                              HandleWaitXids(msg, msg_len);   
 +                              return RESPONSE_WAITXIDS;
 +                      case 'x':                       /* Global transaction id assignment */
 +                              HandleGlobalTransactionId(msg, msg_len);
 +                              return RESPONSE_ASSIGN_GXID;
 +                      default:
 +                              /* sync lost? */
 +                              elog(WARNING, "Received unsupported message type: %c", msg_type);
 +                              PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL);
 +                              /* stop reading */
 +                              return RESPONSE_COMPLETE;
 +              }
 +      }
 +      /* never happen, but keep compiler quiet */
 +      return RESPONSE_EOF;
 +}
 +
 +/*
 + * Has the data node sent Ready For Query
 + *
 + * Consume buffered messages on 'conn' until a ReadyForQuery ('Z') is seen,
 + * silently discarding everything else.  Returns true once ReadyForQuery is
 + * consumed (or immediately if the connection is in a fatal error state);
 + * returns false when the buffer runs out before 'Z' arrives, meaning the
 + * caller must receive more data first.
 + */
 +
 +bool
 +is_data_node_ready(PGXCNodeHandle * conn)
 +{
 +      char            *msg;
 +      int             msg_len;
 +      char            msg_type;
 +
 +      for (;;)
 +      {
 +              /*
 +               * If we are in the process of shutting down, we
 +               * may be rolling back, and the buffer may contain other messages.
 +               * We want to avoid a procarray exception
 +               * as well as an error stack overflow.
 +               */
 +              if (proc_exit_inprogress)
 +                      PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL);
 +
 +              /* don't read from the connection if there is a fatal error */
 +              if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
 +                      return true;
 +
 +              /* No data available, exit */
 +              if (!HAS_MESSAGE_BUFFERED(conn))
 +                      return false;
 +
 +              /* Any message other than 'Z' is dropped on the floor here */
 +              msg_type = get_message(conn, &msg_len, &msg);
 +              if (msg_type == 'Z')
 +              {
 +                      /*
 +                       * ReadyForQuery: the node is done with the current command.
 +                       * Record the reported transaction status and mark the
 +                       * connection idle before handing it back to the caller.
 +                       */
 +                      conn->transaction_status = msg[0];
 +                      PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
 +                      conn->combiner = NULL;
 +                      return true;
 +              }
 +      }
 +      /* never happen, but keep compiler quiet */
 +      return false;
 +}
 +
 +
 +/*
 + * Send BEGIN command to the Datanodes or Coordinators and receive responses.
 + * Also send the GXID for the transaction.
 + *
 + * conn_count/connections - remote handles to start the transaction on
 + * gxid            - global transaction id to propagate (if valid)
 + * need_tran_block - whether an explicit BEGIN is required; may be overridden
 + *                   below based on the local node type
 + * readOnly        - when false, handles are flagged as writers
 + * node_type       - NOTE(review): currently unused in this body — confirm
 + *                   whether it is still needed
 + *
 + * Returns 0 on success, EOF on any send/receive failure.
 + */
 +static int
 +pgxc_node_begin(int conn_count, PGXCNodeHandle **connections,
 +                              GlobalTransactionId gxid, bool need_tran_block,
 +                              bool readOnly, char node_type)
 +{
 +      int                     i;
 +      struct timeval *timeout = NULL;
 +      ResponseCombiner combiner;
 +      TimestampTz timestamp = GetCurrentGTMStartTimestamp();
 +      PGXCNodeHandle *new_connections[conn_count];
 +      int new_count = 0;
 +      char               *init_str;
 +      /* buffer for a decimal 32-bit value plus terminating NUL */
 +      char                    lxid[13];
 +
 +      /*
 +       * If no remote connections, we don't have anything to do
 +       */
 +      if (conn_count == 0)
 +              return 0;
 +
 +      for (i = 0; i < conn_count; i++)
 +      {
 +              if (!readOnly && !IsConnFromDatanode())
 +                      connections[i]->read_only = false;
 +              /*
 +               * PGXC TODO - A connection should not be in DN_CONNECTION_STATE_QUERY
 +               * state when we are about to send a BEGIN TRANSACTION command to the
 +               * node. We should consider changing the following to an assert and fix
 +               * any bugs reported
 +               */
 +              if (connections[i]->state == DN_CONNECTION_STATE_QUERY)
 +                      BufferConnection(connections[i]);
 +
 +              /* Send GXID and check for errors */
 +              if (GlobalTransactionIdIsValid(gxid) && pgxc_node_send_gxid(connections[i], gxid))
 +                      return EOF;
 +
 +              /* Send timestamp and check for errors */
 +              if (GlobalTimestampIsValid(timestamp) && pgxc_node_send_timestamp(connections[i], timestamp))
 +                      return EOF;
 +
 +              /* Local node type decides whether an explicit BEGIN is required */
 +              if (IS_PGXC_DATANODE && GlobalTransactionIdIsValid(gxid))
 +                      need_tran_block = true;
 +              else if (IS_PGXC_REMOTE_COORDINATOR)
 +                      need_tran_block = false;
 +
 +              elog(DEBUG5, "need_tran_block %d, connections[%d]->transaction_status %c",
 +                              need_tran_block, i, connections[i]->transaction_status);
 +              /* Send BEGIN if not already in transaction ('I' = idle status) */
 +              if (need_tran_block && connections[i]->transaction_status == 'I')
 +              {
 +                      /* Send the BEGIN TRANSACTION command and check for errors */
 +                      if (pgxc_node_send_query(connections[i], "BEGIN"))
 +                              return EOF;
 +
 +                      new_connections[new_count++] = connections[i];
 +              }
 +      }
 +
 +      /*
 +       * If we did not send a BEGIN command to any node, we are done. Otherwise,
 +       * we need to check for any errors and report them
 +       */
 +      if (new_count == 0)
 +              return 0;
 +
 +      InitResponseCombiner(&combiner, new_count, COMBINE_TYPE_NONE);
 +      /*
 +       * Make sure there are zeroes in unused fields
 +       *
 +       * NOTE(review): this memset runs *after* InitResponseCombiner and zeroes
 +       * the leading ScanState portion of the combiner — verify the init call
 +       * sets nothing inside ss that must survive this wipe.
 +       */
 +      memset(&combiner, 0, sizeof(ScanState));
 +
 +      /* Receive responses */
 +      if (pgxc_node_receive_responses(new_count, new_connections, timeout, &combiner))
 +              return EOF;
 +
 +      /* Verify status */
 +      if (!ValidateAndCloseCombiner(&combiner))
 +              return EOF;
 +
 +      /* Send virtualXID to the remote nodes using SET command */
 +      sprintf(lxid, "%d", MyProc->lxid);
 +      PGXCNodeSetParam(true, "coordinator_lxid", lxid, 0);
 +
 +      /* after transactions are started send down local set commands */
 +      init_str = PGXCNodeGetTransactionParamStr();
 +      if (init_str)
 +      {
 +              for (i = 0; i < new_count; i++)
 +              {
 +                      pgxc_node_set_query(new_connections[i], init_str);
 +              }
 +      }
 +
 +      /* No problem, let's get going */
 +      return 0;
 +}
 +
 +
 +/*
 + * Execute DISCARD ALL command on all allocated nodes to remove all session
 + * specific stuff before releasing them to pool for reuse by other sessions.
 + *
 + * Errors are deliberately downgraded to WARNINGs: the connections are going
 + * back to the pool regardless, so a failed reset only marks the handle
 + * fatal rather than aborting the release.
 + */
 +static void
 +pgxc_node_remote_cleanup_all(void)
 +{
 +      PGXCNodeAllHandles *handles = get_current_handles();
 +      PGXCNodeHandle *new_connections[handles->co_conn_count + handles->dn_conn_count];
 +      int                             new_conn_count = 0;
 +      int                             i;
 +      /* single multi-statement reset sent to every node */
 +      char               *resetcmd = "RESET ALL;"
 +                                                         "RESET SESSION AUTHORIZATION;"
 +                                                         "RESET transaction_isolation;"
 +                                                         "RESET global_session";
 +
 +      elog(DEBUG5, "pgxc_node_remote_cleanup_all - handles->co_conn_count %d,"
 +                      "handles->dn_conn_count %d", handles->co_conn_count,
 +                      handles->dn_conn_count);
 +      /*
 +       * We must handle reader and writer connections both since even a read-only
 +       * needs to be cleaned up.
 +       */
 +      if (handles->co_conn_count + handles->dn_conn_count == 0)
 +              return;
 +
 +      /*
 +       * Send down snapshot followed by DISCARD ALL command.
 +       * The two loops below are identical except for the handle array:
 +       * first coordinators, then datanodes.
 +       */
 +      for (i = 0; i < handles->co_conn_count; i++)
 +      {
 +              PGXCNodeHandle *handle = handles->coord_handles[i];
 +
 +              /* At this point connection should be in IDLE state */
 +              if (handle->state != DN_CONNECTION_STATE_IDLE)
 +              {
 +                      PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
 +                      continue;
 +              }
 +
 +              /*
 +               * We must go ahead and release connections anyway, so do not throw
 +               * an error if we have a problem here.
 +               */
 +              if (pgxc_node_send_query(handle, resetcmd))
 +              {
 +                      ereport(WARNING,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to clean up data nodes")));
 +                      PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
 +                      continue;
 +              }
 +              new_connections[new_conn_count++] = handle;
 +              handle->combiner = NULL;
 +      }
 +      for (i = 0; i < handles->dn_conn_count; i++)
 +      {
 +              PGXCNodeHandle *handle = handles->datanode_handles[i];
 +
 +              /* At this point connection should be in IDLE state */
 +              if (handle->state != DN_CONNECTION_STATE_IDLE)
 +              {
 +                      PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
 +                      continue;
 +              }
 +
 +              /*
 +               * We must go ahead and release connections anyway, so do not throw
 +               * an error if we have a problem here.
 +               */
 +              if (pgxc_node_send_query(handle, resetcmd))
 +              {
 +                      ereport(WARNING,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to clean up data nodes")));
 +                      PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
 +                      continue;
 +              }
 +              new_connections[new_conn_count++] = handle;
 +              handle->combiner = NULL;
 +      }
 +
 +      /* Wait for all nodes that accepted the reset to report completion */
 +      if (new_conn_count)
 +      {
 +              ResponseCombiner combiner;
 +              InitResponseCombiner(&combiner, new_conn_count, COMBINE_TYPE_NONE);
 +              /* Receive responses */
 +              pgxc_node_receive_responses(new_conn_count, new_connections, NULL, &combiner);
 +              CloseCombiner(&combiner);
 +      }
 +      pfree_pgxc_all_handles(handles);
 +}
 +
 +/*
 + * Count how many coordinators and datanodes are involved in this transaction
 + * so that we can save that information in the GID
 + *
 + * Outputs: *dnCount/*coordCount are set to the number of writer nodes whose
 + * connection reports an open transaction ('T'); dnNodeIds[]/coordNodeIds[]
 + * receive the corresponding node ids.  Read-only connections and empty
 + * slots (no socket) are skipped.
 + *
 + * NOTE(review): unlike pgxc_node_remote_cleanup_all(), the handle list from
 + * get_current_handles() is not pfree'd here — confirm whether that is a
 + * deliberate choice or a leak.
 + */
 +static void
 +pgxc_node_remote_count(int *dnCount, int dnNodeIds[],
 +              int *coordCount, int coordNodeIds[])
 +{
 +      int i;
 +      PGXCNodeAllHandles *handles = get_current_handles();
 +
 +      *dnCount = *coordCount = 0;
 +      for (i = 0; i < handles->dn_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->datanode_handles[i];
 +              /*
 +               * Skip empty slots
 +               */
 +              if (conn->sock == NO_SOCKET)
 +                      continue;
 +              else if (conn->transaction_status == 'T')
 +              {
 +                      /* Only writers count towards the prepared-transaction GID */
 +                      if (!conn->read_only)
 +                      {
 +                              dnNodeIds[*dnCount] = conn->nodeid;
 +                              *dnCount = *dnCount + 1;
 +                      }
 +              }
 +      }
 +
 +      for (i = 0; i < handles->co_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->coord_handles[i];
 +              /*
 +               * Skip empty slots
 +               */
 +              if (conn->sock == NO_SOCKET)
 +                      continue;
 +              else if (conn->transaction_status == 'T')
 +              {
 +                      /* Only writers count towards the prepared-transaction GID */
 +                      if (!conn->read_only)
 +                      {
 +                              coordNodeIds[*coordCount] = conn->nodeid;
 +                              *coordCount = *coordCount + 1;
 +                      }
 +              }
 +      }
 +}
 +
 +/*
 + * Prepare nodes which ran write operations during the transaction.
 + * Read only remote transactions are committed and connections are released
 + * back to the pool.
 + * Function returns the list of nodes where transaction is prepared, including
 + * local node, if requested, in format expected by the GTM server.
 + * If something went wrong the function tries to abort prepared transactions on
 + * the nodes where it succeeded and throws error. A warning is emitted if abort
 + * prepared fails.
 + * After completion remote connection handles are released.
 + */
 +static char *
 +pgxc_node_remote_prepare(char *prepareGID, bool localNode)
 +{
 +      bool                    isOK = true;
 +      StringInfoData  nodestr;
 +      char                    *prepare_cmd = (char *) palloc (64 + strlen(prepareGID));
 +      char                    *abort_cmd;
 +      GlobalTransactionId auxXid;
 +      char               *commit_cmd = "COMMIT TRANSACTION";
 +      int                             i;
 +      ResponseCombiner combiner;
 +      PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
 +      int                             conn_count = 0;
 +      PGXCNodeAllHandles *handles = get_current_handles();
 +
 +      initStringInfo(&nodestr);
 +      if (localNode)
 +              appendStringInfoString(&nodestr, PGXCNodeName);
 +
 +      sprintf(prepare_cmd, "PREPARE TRANSACTION '%s'", prepareGID);
 +
 +      for (i = 0; i < handles->dn_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->datanode_handles[i];
 +
 +              /*
 +               * If something went wrong already we have nothing to do here. The error
 +               * will be reported at the end of the function, and we will rollback
 +               * remotes as part of the error handling.
 +               * Just skip to clean up section and check if we have already prepared
 +               * somewhere, we should abort that prepared transaction.
 +               */
 +              if (!isOK)
 +                      goto prepare_err;
 +
 +              /*
 +               * Skip empty slots
 +               */
 +              if (conn->sock == NO_SOCKET)
 +                      continue;
 +              else if (conn->transaction_status == 'T')
 +              {
 +                      /* Read in any pending input */
 +                      if (conn->state != DN_CONNECTION_STATE_IDLE)
 +                              BufferConnection(conn);
 +
 +                      if (conn->read_only)
 +                      {
 +                              /* Send down prepare command */
 +                              if (pgxc_node_send_query(conn, commit_cmd))
 +                              {
 +                                      /*
 +                                       * not a big deal, it was read only, the connection will be
 +                                       * abandoned later.
 +                                       */
 +                                      ereport(LOG,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("failed to send COMMIT command to "
 +                                                              "the node %u", conn->nodeoid)));
 +                              }
 +                              else
 +                              {
 +                                      /* Read responses from these */
 +                                      connections[conn_count++] = conn;
 +                              }
 +                      }
 +                      else
 +                      {
 +                              /* Send down prepare command */
 +                              if (pgxc_node_send_query(conn, prepare_cmd))
 +                              {
 +                                      /*
 +                                       * That is the trouble, we really want to prepare it.
 +                                       * Just emit warning so far and go to clean up.
 +                                       */
 +                                      isOK = false;
 +                                      ereport(WARNING,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("failed to send PREPARE TRANSACTION command to "
 +                                                              "the node %u", conn->nodeoid)));
 +                              }
 +                              else
 +                              {
 +                                      char *nodename = get_pgxc_nodename(conn->nodeoid);
 +                                      if (nodestr.len > 0)
 +                                              appendStringInfoChar(&nodestr, ',');
 +                                      appendStringInfoString(&nodestr, nodename);
 +                                      /* Read responses from these */
 +                                      connections[conn_count++] = conn;
 +                                      /*
 +                                       * If it fails on remote node it would just return ROLLBACK.
 +                                       * Set the flag for the message handler so the response is
 +                                       * verified.
 +                                       */
 +                                      conn->ck_resp_rollback = true;
 +                              }
 +                      }
 +              }
 +              else if (conn->transaction_status == 'E')
 +              {
 +                      /*
 +                       * Probably can not happen, if there was a error the engine would
 +                       * abort anyway, even in case of explicit PREPARE.
 +                       * Anyway, just in case...
 +                       */
 +                      isOK = false;
 +                      ereport(WARNING,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("remote node %u is in error state", conn->nodeoid)));
 +              }
 +      }
 +
 +      for (i = 0; i < handles->co_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->coord_handles[i];
 +
 +              /*
 +               * If something went wrong already we have nothing to do here. The error
 +               * will be reported at the end of the function, and we will rollback
 +               * remotes as part of the error handling.
 +               * Just skip to clean up section and check if we have already prepared
 +               * somewhere, we should abort that prepared transaction.
 +               */
 +              if (!isOK)
 +                      goto prepare_err;
 +
 +              /*
 +               * Skip empty slots
 +               */
 +              if (conn->sock == NO_SOCKET)
 +                      continue;
 +              else if (conn->transaction_status == 'T')
 +              {
 +                      if (conn->read_only)
 +                      {
 +                              /* Send down prepare command */
 +                              if (pgxc_node_send_query(conn, commit_cmd))
 +                              {
 +                                      /*
 +                                       * not a big deal, it was read only, the connection will be
 +                                       * abandoned later.
 +                                       */
 +                                      ereport(LOG,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("failed to send COMMIT command to "
 +                                                              "the node %u", conn->nodeoid)));
 +                              }
 +                              else
 +                              {
 +                                      /* Read responses from these */
 +                                      connections[conn_count++] = conn;
 +                              }
 +                      }
 +                      else
 +                      {
 +                              /* Send down prepare command */
 +                              if (pgxc_node_send_query(conn, prepare_cmd))
 +                              {
 +                                      /*
 +                                       * That is the trouble, we really want to prepare it.
 +                                       * Just emit warning so far and go to clean up.
 +                                       */
 +                                      isOK = false;
 +                                      ereport(WARNING,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("failed to send PREPARE TRANSACTION command to "
 +                                                              "the node %u", conn->nodeoid)));
 +                              }
 +                              else
 +                              {
 +                                      char *nodename = get_pgxc_nodename(conn->nodeoid);
 +                                      if (nodestr.len > 0)
 +                                              appendStringInfoChar(&nodestr, ',');
 +                                      appendStringInfoString(&nodestr, nodename);
 +                                      /* Read responses from these */
 +                                      connections[conn_count++] = conn;
 +                                      /*
 +                                       * If it fails on remote node it would just return ROLLBACK.
 +                                       * Set the flag for the message handler so the response is
 +                                       * verified.
 +                                       */
 +                                      conn->ck_resp_rollback = true;
 +                              }
 +                      }
 +              }
 +              else if (conn->transaction_status == 'E')
 +              {
 +                      /*
 +                       * Probably can not happen, if there was a error the engine would
 +                       * abort anyway, even in case of explicit PREPARE.
 +                       * Anyway, just in case...
 +                       */
 +                      isOK = false;
 +                      ereport(WARNING,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("remote node %u is in error state", conn->nodeoid)));
 +              }
 +      }
 +
 +      SetSendCommandId(false);
 +
 +      if (!isOK)
 +              goto prepare_err;
 +
 +      /* exit if nothing has been prepared */
 +      if (conn_count > 0)
 +      {
 +              int result;
 +              /*
 +               * Receive and check for any errors. In case of errors, we don't bail out
 +               * just yet. We first go through the list of connections and look for
 +               * errors on each connection. This is important to ensure that we run
 +               * an appropriate ROLLBACK command later on (prepared transactions must be
 +               * rolled back with ROLLBACK PREPARED commands).
 +               *
 +               * PGXCTODO - There doesn't seem to be a solid mechanism to track errors on
 +               * individual connections. The transaction_status field doesn't get set
 +               * every time there is an error on the connection. The combiner mechanism is
 +               * good for parallel proessing, but I think we should have a leak-proof
 +               * mechanism to track connection status
 +               */
 +              InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
 +              /* Receive responses */
 +              result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
 +              if (result || !validate_combiner(&combiner))
 +                      goto prepare_err;
 +              else
 +                      CloseCombiner(&combiner);
 +
 +              /* Before exit clean the flag, to avoid unnecessary checks */
 +              for (i = 0; i < conn_count; i++)
 +                      connections[i]->ck_resp_rollback = false;
 +
 +              pfree_pgxc_all_handles(handles);
 +              if (!temp_object_included && !PersistentConnections)
 +              {
 +                      /* Clean up remote sessions */
 +                      pgxc_node_remote_cleanup_all();
 +                      release_handles();
 +              }
 +      }
 +
 +      pfree(prepare_cmd);
 +      return nodestr.data;
 +
 +prepare_err:
 +      abort_cmd = (char *) palloc (64 + strlen(prepareGID));
 +      sprintf(abort_cmd, "ROLLBACK PREPARED '%s'", prepareGID);
 +
 +      auxXid = GetAuxilliaryTransactionId();
 +      conn_count = 0;
 +      for (i = 0; i < handles->dn_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->datanode_handles[i];
 +
 +              /*
 +               * PREPARE succeeded on that node, roll it back there
 +               */
 +              if (conn->ck_resp_rollback)
 +              {
 +                      conn->ck_resp_rollback = false;
 +
 +                      if (conn->state != DN_CONNECTION_STATE_IDLE)
 +                      {
 +                              ereport(WARNING,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Error while PREPARING transaction %s on "
 +                                                       "node %s. Administrative action may be required "
 +                                                       "to abort this transaction on the node",
 +                                                       prepareGID, conn->nodename)));
 +                              continue;
 +                      }
 +
 +                      /* sanity checks */
 +                      Assert(conn->sock != NO_SOCKET);
 +                      /* Send down abort prepared command */
 +                      if (pgxc_node_send_gxid(conn, auxXid))
 +                      {
 +                              /*
 +                               * Prepared transaction is left on the node, but we can not
 +                               * do anything with that except warn the user.
 +                               */
 +                              ereport(WARNING,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("failed to send xid to "
 +                                                              "the node %u", conn->nodeoid)));
 +                      }
 +                      if (pgxc_node_send_query(conn, abort_cmd))
 +                      {
 +                              /*
 +                               * Prepared transaction is left on the node, but we can not
 +                               * do anything with that except warn the user.
 +                               */
 +                              ereport(WARNING,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("failed to send ABORT PREPARED command to "
 +                                                              "the node %u", conn->nodeoid)));
 +                      }
 +                      else
 +                      {
 +                              /* Read responses from these */
 +                              connections[conn_count++] = conn;
 +                      }
 +              }
 +      }
 +      for (i = 0; i < handles->co_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->coord_handles[i];
 +
 +              if (conn->ck_resp_rollback)
 +              {
 +                      conn->ck_resp_rollback = false;
 +
 +                      if (conn->state != DN_CONNECTION_STATE_IDLE)
 +                      {
 +                              ereport(WARNING,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Error while PREPARING transaction %s on "
 +                                                       "node %s. Administrative action may be required "
 +                                                       "to abort this transaction on the node",
 +                                                       prepareGID, conn->nodename)));
 +                              continue;
 +                      }
 +
 +                      /* sanity checks */
 +                      Assert(conn->sock != NO_SOCKET);
 +                      /* Send down abort prepared command */
 +                      if (pgxc_node_send_gxid(conn, auxXid))
 +                      {
 +                              /*
 +                               * Prepared transaction is left on the node, but we can not
 +                               * do anything with that except warn the user.
 +                               */
 +                              ereport(WARNING,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("failed to send xid to "
 +                                                              "the node %u", conn->nodeoid)));
 +                      }
 +                      if (pgxc_node_send_query(conn, abort_cmd))
 +                      {
 +                              /*
 +                               * Prepared transaction is left on the node, but we can not
 +                               * do anything with that except warn the user.
 +                               */
 +                              ereport(WARNING,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("failed to send ABORT PREPARED command to "
 +                                                              "the node %u", conn->nodeoid)));
 +                      }
 +                      else
 +                      {
 +                              /* Read responses from these */
 +                              connections[conn_count++] = conn;
 +                      }
 +              }
 +      }
 +      if (conn_count > 0)
 +      {
 +              /* Just read out responses, throw error from the first combiner */
 +              ResponseCombiner combiner2;
 +              InitResponseCombiner(&combiner2, conn_count, COMBINE_TYPE_NONE);
 +              /* Receive responses */
 +              pgxc_node_receive_responses(conn_count, connections, NULL, &combiner2);
 +              CloseCombiner(&combiner2);
 +      }
 +
 +      if (!temp_object_included && !PersistentConnections)
 +      {
 +              /* Clean up remote sessions */
 +              pgxc_node_remote_cleanup_all();
 +              release_handles();
 +      }
 +
 +      pfree_pgxc_all_handles(handles);
 +      pfree(abort_cmd);
 +
 +      /*
 +       * If the flag is set we are here because combiner carries error message
 +       */
 +      if (isOK)
 +              pgxc_node_report_error(&combiner);
 +      else
 +              elog(ERROR, "failed to PREPARE transaction on one or more nodes");
 +      return NULL;
 +}
 +
 +
 +/*
 + * Commit transactions on remote nodes.
 + * If barrier lock is set wait while it is released.
 + * Release remote connection after completion.
 + */
 +static void
 +pgxc_node_remote_commit(void)
 +{
 +      int                             result = 0;
 +      char               *commitCmd = "COMMIT TRANSACTION";
 +      int                             i;
 +      ResponseCombiner combiner;
 +      PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
 +      int                             conn_count = 0;
 +      PGXCNodeAllHandles *handles = get_current_handles();
 +
 +      SetSendCommandId(false);
 +
 +      /*
 +       * Barrier:
 +       *
 +       * We should acquire the BarrierLock in SHARE mode here to ensure that
 +       * there are no in-progress barrier at this point. This mechanism would
 +       * work as long as LWLock mechanism does not starve a EXCLUSIVE lock
 +       * requester
 +       */
 +      LWLockAcquire(BarrierLock, LW_SHARED);
 +
 +      for (i = 0; i < handles->dn_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->datanode_handles[i];
 +
 +              /* Skip empty slots */
 +              if (conn->sock == NO_SOCKET)
 +                      continue;
 +
 +              /*
 +               * We do not need to commit remote node if it is not in transaction.
 +               * If transaction is in error state the commit command will cause
 +               * rollback, that is OK
 +               */
 +              if (conn->transaction_status != 'I')
 +              {
 +                      /* Read in any pending input */
 +                      if (conn->state != DN_CONNECTION_STATE_IDLE)
 +                              BufferConnection(conn);
 +
 +                      if (pgxc_node_send_query(conn, commitCmd))
 +                      {
 +                              /*
 +                               * Do not bother with clean up, just bomb out. The error handler
 +                               * will invoke RollbackTransaction which will do the work.
 +                               */
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("failed to send COMMIT command to the node %u",
 +                                                              conn->nodeoid)));
 +                      }
 +                      else
 +                      {
 +                              /* Read responses from these */
 +                              connections[conn_count++] = conn;
 +                      }
 +              }
 +      }
 +
 +      for (i = 0; i < handles->co_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->coord_handles[i];
 +
 +              /* Skip empty slots */
 +              if (conn->sock == NO_SOCKET)
 +                      continue;
 +
 +              /*
 +               * We do not need to commit remote node if it is not in transaction.
 +               * If transaction is in error state the commit command will cause
 +               * rollback, that is OK
 +               */
 +              if (conn->transaction_status != 'I')
 +              {
 +                      if (pgxc_node_send_query(conn, commitCmd))
 +                      {
 +                              /*
 +                               * Do not bother with clean up, just bomb out. The error handler
 +                               * will invoke RollbackTransaction which will do the work.
 +                               */
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("failed to send COMMIT command to the node %u",
 +                                                              conn->nodeoid)));
 +                      }
 +                      else
 +                      {
 +                              /* Read responses from these */
 +                              connections[conn_count++] = conn;
 +                      }
 +              }
 +      }
 +
 +      /*
 +       * Release the BarrierLock.
 +       */
 +      LWLockRelease(BarrierLock);
 +
 +      if (conn_count)
 +      {
 +              InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
 +              /* Receive responses */
 +              result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
 +              if (result || !validate_combiner(&combiner))
 +                      result = EOF;
 +              else
 +                      CloseCombiner(&combiner);
 +      }
 +
 +      stat_transaction(conn_count);
 +
 +      if (result)
 +      {
 +              if (combiner.errorMessage)
 +                      pgxc_node_report_error(&combiner);
 +              else
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to COMMIT the transaction on one or more nodes")));
 +      }
 +
 +      if (!temp_object_included && !PersistentConnections)
 +      {
 +              /* Clean up remote sessions */
 +              pgxc_node_remote_cleanup_all();
 +              release_handles();
 +      }
 +
 +      pfree_pgxc_all_handles(handles);
 +}
 +
 +
 +/*
 + * Rollback transactions on remote nodes.
 + * Release remote connection after completion.
 + */
 +static void
 +pgxc_node_remote_abort(void)
 +{
 +      int                             result = 0;
 +      char               *rollbackCmd = "ROLLBACK TRANSACTION";
 +      int                             i;
 +      ResponseCombiner combiner;
 +      PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
 +      int                             conn_count = 0;
 +      PGXCNodeAllHandles *handles = get_current_handles();
 +      struct timeval timeout;
 +
 +      SetSendCommandId(false);
 +
 +      elog(DEBUG5, "pgxc_node_remote_abort - dn_conn_count %d, co_conn_count %d",
 +                      handles->dn_conn_count, handles->co_conn_count);
 +
 +      timeout.tv_sec = 60;
 +      timeout.tv_usec = 0;
 +
 +      for (i = 0; i < handles->dn_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->datanode_handles[i];
 +
 +              /* Skip empty slots */
 +              if (conn->sock == NO_SOCKET)
 +                      continue;
 +
 +              elog(DEBUG5, "node %s, conn->transaction_status %c",
 +                              conn->nodename,
 +                              conn->transaction_status);
 +
 +              if (conn->transaction_status != 'I')
 +              {
 +                      /* Read in any pending input */
 +                      if (conn->state != DN_CONNECTION_STATE_IDLE)
 +                              BufferConnection(conn);
 +
 +                      /*
 +                       * If the remote session was running extended query protocol when
 +                       * it failed, it will expect a SYNC message before it accepts any
 +                       * other command
 +                       */
 +                      if (conn->needSync)
 +                      {
 +                              pgxc_node_send_sync(conn);
 +                              pgxc_node_receive(1, &conn, &timeout);
 +                      }
 +                      /*
 +                       * Do not matter, is there committed or failed transaction,
 +                       * just send down rollback to finish it.
 +                       */
 +                      if (pgxc_node_send_rollback(conn, rollbackCmd))
 +                      {
 +                              add_error_message(conn,
 +                                              "failed to send ROLLBACK TRANSACTION command");
 +                      }
 +                      else
 +                      {
 +                              /* Read responses from these */
 +                              connections[conn_count++] = conn;
 +                      }
 +              }
 +      }
 +
 +      for (i = 0; i < handles->co_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->coord_handles[i];
 +
 +              /* Skip empty slots */
 +              if (conn->sock == NO_SOCKET)
 +                      continue;
 +
 +              if (conn->transaction_status != 'I')
 +              {
 +                      /* Send SYNC if the remote session is expecting one */
 +                      if (conn->needSync)
 +                      {
 +                              pgxc_node_send_sync(conn);
 +                              pgxc_node_receive(1, &conn, &timeout);
 +                      }
 +                      /*
 +                       * Do not matter, is there committed or failed transaction,
 +                       * just send down rollback to finish it.
 +                       */
 +                      if (pgxc_node_send_rollback(conn, rollbackCmd))
 +                      {
 +                              add_error_message(conn,
 +                                              "failed to send ROLLBACK TRANSACTION command");
 +                      }
 +                      else
 +                      {
 +                              /* Read responses from these */
 +                              connections[conn_count++] = conn;
 +                      }
 +              }
 +      }
 +
 +      if (conn_count)
 +      {
 +              InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
 +              /* Receive responses */
 +              result = pgxc_node_receive_responses(conn_count, connections, &timeout, &combiner);
 +              if (result || !validate_combiner(&combiner))
 +                      result = EOF;
 +              else
 +                      CloseCombiner(&combiner);
 +      }
 +
 +      stat_transaction(conn_count);
 +
 +      if (result)
 +      {
 +              if (combiner.errorMessage)
 +                      pgxc_node_report_error(&combiner);
 +              else
 +                      ereport(LOG,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to ROLLBACK the transaction on one or more nodes")));
 +      }
 +
 +      pfree_pgxc_all_handles(handles);
 +}
 +
 +/*
 + * Begin COPY command
 + * The copy_connections array must have room for NumDataNodes items
 + */
 +void
 +DataNodeCopyBegin(RemoteCopyData *rcstate)
 +{
 +      int i;
 +      List *nodelist = rcstate->rel_loc->rl_nodeList;
 +      PGXCNodeHandle **connections;
 +      bool need_tran_block;
 +      GlobalTransactionId gxid;
 +      ResponseCombiner combiner;
 +      Snapshot snapshot = GetActiveSnapshot();
 +      int conn_count = list_length(nodelist);
 +
 +      /* Get needed datanode connections */
 +      if (!rcstate->is_from && IsLocatorReplicated(rcstate->rel_loc->locatorType))
 +      {
 +              /* Connections is a single handle to read from */
 +              connections = (PGXCNodeHandle **) palloc(sizeof(PGXCNodeHandle *));
 +              connections[0] = get_any_handle(nodelist);
 +              conn_count = 1;
 +      }
 +      else
 +      {
 +              PGXCNodeAllHandles *pgxc_handles;
 +              pgxc_handles = get_handles(nodelist, NULL, false, true);
 +              connections = pgxc_handles->datanode_handles;
 +              Assert(pgxc_handles->dn_conn_count == conn_count);
 +              pfree(pgxc_handles);
 +      }
 +
 +      /*
 +       * If more than one nodes are involved or if we are already in a
 +       * transaction block, we must the remote statements in a transaction block
 +       */
 +      need_tran_block = (conn_count > 1) || (TransactionBlockStatusCode() == 'T');
 +
 +      elog(DEBUG1, "conn_count = %d, need_tran_block = %s", conn_count,
 +                      need_tran_block ? "true" : "false");
 +
 +      /* Gather statistics */
 +      stat_statement();
 +      stat_transaction(conn_count);
 +
 +      gxid = GetCurrentTransactionId();
 +
 +      /* Start transaction on connections where it is not started */
 +      if (pgxc_node_begin(conn_count, connections, gxid, need_tran_block, false, PGXC_NODE_DATANODE))
 +      {
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                               errmsg("Could not begin transaction on data nodes.")));
 +      }
 +
 +      /*
 +       * COPY TO do not use locator, it just takes connections from it, and
 +       * we do not look up distribution data type in this case.
 +       * So always use LOCATOR_TYPE_RROBIN to avoid errors because of not
 +       * defined partType if real locator type is HASH or MODULO.
 +       * Create locator before sending down query, because createLocator may
 +       * fail and we leave with dirty connections.
 +       * If we get an error now datanode connection will be clean and error
 +       * handler will issue transaction abort.
 +       */
 +      rcstate->locator = createLocator(
 +                      rcstate->is_from ? rcstate->rel_loc->locatorType
 +                                      : LOCATOR_TYPE_RROBIN,
 +                      rcstate->is_from ? RELATION_ACCESS_INSERT : RELATION_ACCESS_READ,
 +                      rcstate->dist_type,
 +                      LOCATOR_LIST_POINTER,
 +                      conn_count,
 +                      (void *) connections,
 +                      NULL,
 +                      false);
 +
 +      /* Send query to nodes */
 +      for (i = 0; i < conn_count; i++)
 +      {
 +              CHECK_OWNERSHIP(connections[i], NULL);
 +
 +              if (snapshot && pgxc_node_send_snapshot(connections[i], snapshot))
 +              {
 +                      add_error_message(connections[i], "Can not send request");
 +                      pfree(connections);
 +                      freeLocator(rcstate->locator);
 +                      rcstate->locator = NULL;
 +                      return;
 +              }
 +              if (pgxc_node_send_query(connections[i], rcstate->query_buf.data) != 0)
 +              {
 +                      add_error_message(connections[i], "Can not send request");
 +                      pfree(connections);
 +                      freeLocator(rcstate->locator);
 +                      rcstate->locator = NULL;
 +                      return;
 +              }
 +      }
 +
 +      /*
 +       * We are expecting CopyIn response, but do not want to send it to client,
 +       * caller should take care about this, because here we do not know if
 +       * client runs console or file copy
 +       */
 +      InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
 +      /*
 +       * Make sure there are zeroes in unused fields
 +       */
 +      memset(&combiner, 0, sizeof(ScanState));
 +
 +      /* Receive responses */
 +      if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner)
 +                      || !ValidateAndCloseCombiner(&combiner))
 +      {
 +              DataNodeCopyFinish(conn_count, connections);
 +              freeLocator(rcstate->locator);
 +              rcstate->locator = NULL;
 +              return;
 +      }
 +      pfree(connections);
 +}
 +
 +
 +/*
 + * DataNodeCopyIn
 + *      Send a data row to the specified nodes taking part in a COPY FROM.
 + *
 + * Appends a CopyData ('d') protocol message carrying the len bytes of
 + * data_row to the output buffer of each of the conn_count handles in
 + * copy_connections.  In non-binary (text/CSV) mode a newline is appended
 + * as the row terminator.  If a handle's output buffer would grow beyond
 + * COPY_BUFFER_SIZE, buffered data is flushed first, after checking the
 + * connection for a pending error response from the datanode.
 + *
 + * Returns 0 on success, or EOF on failure (an error message is attached
 + * to the failing handle with add_error_message).
 + */
 +int
 +DataNodeCopyIn(char *data_row, int len,
 +              int conn_count, PGXCNodeHandle** copy_connections,
 +              bool binary)
 +{
 +      /* size + data row + \n in CSV mode */
 +      int msgLen = 4 + len + (binary ? 0 : 1);
 +      int nLen = htonl(msgLen);
 +      int i;
 +
 +      for(i = 0; i < conn_count; i++)
 +      {
 +              PGXCNodeHandle *handle = copy_connections[i];
 +              if (handle->state == DN_CONNECTION_STATE_COPY_IN)
 +              {
 +                      /* precalculate to speed up access */
 +                      int bytes_needed = handle->outEnd + 1 + msgLen;
 +
 +                      /* flush buffer if it is almost full */
 +                      if (bytes_needed > COPY_BUFFER_SIZE)
 +                      {
 +                              int to_send = handle->outEnd;
 +
 +                              /* First look if data node has sent an error message */
 +                              int read_status = pgxc_node_read_data(handle, true);
 +                              if (read_status == EOF || read_status < 0)
 +                              {
 +                                      add_error_message(handle, "failed to read data from data node");
 +                                      return EOF;
 +                              }
 +
 +                              if (handle->inStart < handle->inEnd)
 +                              {
 +                                      ResponseCombiner combiner;
 +                                      InitResponseCombiner(&combiner, 1, COMBINE_TYPE_NONE);
 +                                      /*
 +                                       * Make sure there are zeroes in unused fields
 +                                       */
 +                                      memset(&combiner, 0, sizeof(ScanState));
 +
 +                                      /*
 +                                       * Validate the combiner but only if we see a proper
 +                                       * response for our COPY message. The problem is that
 +                                       * sometimes we might receive async messages such as
 +                                       * 'M' which is used to send back command ID generated and
 +                                       * consumed by the datanode. While the message gets handled
 +                                       * in handle_response(), we don't want to declare receipt
 +                                       * of an invalid message below.
 +                                       *
 +                                       * If there is an actual error of some sort then the
 +                                       * connection state will be set appropriately and we
 +                                       * shall catch that subsequently.
 +                                       */
 +                                      if (handle_response(handle, &combiner) == RESPONSE_COPY &&
 +                                              !ValidateAndCloseCombiner(&combiner))
 +                                              return EOF;
 +                              }
 +
 +                              if (DN_CONNECTION_STATE_ERROR(handle))
 +                                      return EOF;
 +
 +                              /*
 +                               * Try to send down buffered data if we have
 +                               */
 +                              if (to_send && send_some(handle, to_send) < 0)
 +                              {
 +                                      add_error_message(handle, "failed to send data to data node");
 +                                      return EOF;
 +                              }
 +                      }
 +
 +                      if (ensure_out_buffer_capacity(bytes_needed, handle) != 0)
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_OUT_OF_MEMORY),
 +                                               errmsg("out of memory")));
 +                      }
 +
 +                      /* Append the CopyData message: type byte, 4-byte length, payload */
 +                      handle->outBuffer[handle->outEnd++] = 'd';
 +                      memcpy(handle->outBuffer + handle->outEnd, &nLen, 4);
 +                      handle->outEnd += 4;
 +                      memcpy(handle->outBuffer + handle->outEnd, data_row, len);
 +                      handle->outEnd += len;
 +                      if (!binary)
 +                              handle->outBuffer[handle->outEnd++] = '\n';
 +
 +                      handle->in_extended_query = false;
 +              }
 +              else
 +              {
 +                      add_error_message(handle, "Invalid data node connection");
 +                      return EOF;
 +              }
 +      }
 +      return 0;
 +}
 +
 +/*
 + * DataNodeCopyOut
 + *      Combine COPY TO output received from the given connections.
 + *
 + * Rows are written to copy_file when one is supplied, otherwise they are
 + * forwarded to the client (stdout).  Returns the total number of rows
 + * processed as summed by the combiner; raises an error if receiving the
 + * responses failed or the combiner saw an unexpected response.
 + */
 +uint64
 +DataNodeCopyOut(PGXCNodeHandle** copy_connections,
 +                                                        int conn_count, FILE* copy_file)
 +{
 +      ResponseCombiner combiner;
 +      uint64          processed;
 +      bool            error;
 +
 +      InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_SUM);
 +      /*
 +       * Make sure there are zeroes in unused fields
 +       */
 +      memset(&combiner, 0, sizeof(ScanState));
 +      combiner.processed = 0;
 +      /* If there is an existing file where to copy data, pass it to combiner */
 +      if (copy_file)
 +      {
 +              combiner.copy_file = copy_file;
 +              combiner.remoteCopyType = REMOTE_COPY_FILE;
 +      }
 +      else
 +      {
 +              combiner.copy_file = NULL;
 +              combiner.remoteCopyType = REMOTE_COPY_STDOUT;
 +      }
 +      error = (pgxc_node_receive_responses(conn_count, copy_connections, NULL, &combiner) != 0);
 +
 +      processed = combiner.processed;
 +
 +      if (!ValidateAndCloseCombiner(&combiner) || error)
 +      {
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                               errmsg("Unexpected response from the data nodes when combining, request type %d", combiner.request_type)));
 +      }
 +
 +      return processed;
 +}
 +
 +
 +/*
 + * DataNodeCopyStore
 + *      Combine COPY TO output from the given connections into a tuplestore.
 + *
 + * Like DataNodeCopyOut(), but the received rows are accumulated in the
 + * caller-supplied Tuplestorestate instead of being written to a file or
 + * sent to the client.  Returns the number of rows processed; raises an
 + * error on a receive failure or an unexpected response.
 + */
 +uint64
 +DataNodeCopyStore(PGXCNodeHandle** copy_connections,
 +                                                              int conn_count, Tuplestorestate* store)
 +{
 +      ResponseCombiner combiner;
 +      uint64          processed;
 +      bool            error;
 +
 +      InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_SUM);
 +      /*
 +       * Make sure there are zeroes in unused fields
 +       */
 +      memset(&combiner, 0, sizeof(ScanState));
 +      combiner.processed = 0;
 +      combiner.remoteCopyType = REMOTE_COPY_TUPLESTORE;
 +      combiner.tuplestorestate = store;
 +
 +      error = (pgxc_node_receive_responses(conn_count, copy_connections, NULL, &combiner) != 0);
 +
 +      processed = combiner.processed;
 +
 +      if (!ValidateAndCloseCombiner(&combiner) || error)
 +      {
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_DATA_CORRUPTED),
 +                               errmsg("Unexpected response from the data nodes when combining, request type %d", combiner.request_type)));
 +      }
 +
 +      return processed;
 +}
 +
 +
 +/*
 + * Finish copy process on all connections
 + */
 +void
 +DataNodeCopyFinish(int conn_count, PGXCNodeHandle** connections)
 +{
 +      int             i;
 +      ResponseCombiner combiner;
 +      bool            error = false;
 +      for (i = 0; i < conn_count; i++)
 +      {
 +              PGXCNodeHandle *handle = connections[i];
 +
 +              error = true;
 +              if (handle->state == DN_CONNECTION_STATE_COPY_IN || handle->state == DN_CONNECTION_STATE_COPY_OUT)
 +                      error = DataNodeCopyEnd(handle, false);
 +      }
 +
 +      InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
 +      /*
 +       * Make sure there are zeroes in unused fields
 +       */
 +      memset(&combiner, 0, sizeof(ScanState));
 +      error = (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) != 0) || error;
 +
 +      if (!validate_combiner(&combiner) || error)
 +      {
 +              if (combiner.errorMessage)
 +                      pgxc_node_report_error(&combiner);
 +              else
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Error while running COPY")));
 +      }
 +      else
 +              CloseCombiner(&combiner);
 +}
 +
 +/*
 + * DataNodeCopyEnd
 + *      End the COPY process on a connection.
 + *
 + * Queues a CopyFail ('f') message when is_error is true, otherwise a
 + * CopyDone ('c') message, then flushes the connection immediately since
 + * the caller expects a response right away.
 + *
 + * Returns true on failure (NULL handle, buffer allocation failure, or
 + * flush error) and false on success.
 + */
 +bool
 +DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error)
 +{
 +      int             nLen = htonl(4);
 +
 +      if (handle == NULL)
 +              return true;
 +
 +      /* msgType + msgLen */
 +      if (ensure_out_buffer_capacity(handle->outEnd + 1 + 4, handle) != 0)
 +              return true;
 +
 +      if (is_error)
 +              handle->outBuffer[handle->outEnd++] = 'f';
 +      else
 +              handle->outBuffer[handle->outEnd++] = 'c';
 +
 +      /* message carries no payload: length field covers only itself (4) */
 +      memcpy(handle->outBuffer + handle->outEnd, &nLen, 4);
 +      handle->outEnd += 4;
 +
 +      handle->in_extended_query = false;
 +      /* We need response right away, so send immediately */
 +      if (pgxc_node_flush(handle) < 0)
 +              return true;
 +
 +      return false;
 +}
 +
 +
 +/*
 + * get_exec_connections
 + *      Get Node connections depending on the connection type:
 + *      Datanodes Only, Coordinators only or both types.
 + *
 + * Builds the target Datanode/Coordinator lists from exec_nodes (falling
 + * back to all nodes of the requested kind when the lists are empty),
 + * acquires handles for them via get_handles(), acquires the primary node
 + * handle separately when one is needed, and records the resulting
 + * connection counts in the returned PGXCNodeAllHandles.
 + *
 + * NOTE(review): the ExecEvalExpr() call below still carries an unresolved
 + * merge-conflict fragment (the '-'/'++' prefixed lines) -- confirm the
 + * intended resolution against both merge parents.
 + */
 +static PGXCNodeAllHandles *
 +get_exec_connections(RemoteQueryState *planstate,
 +                                       ExecNodes *exec_nodes,
 +                                       RemoteQueryExecType exec_type,
 +                                       bool is_global_session)
 +{
 +      List       *nodelist = NIL;
 +      List       *primarynode = NIL;
 +      List       *coordlist = NIL;
 +      PGXCNodeHandle *primaryconnection;
 +      int                     co_conn_count, dn_conn_count;
 +      bool            is_query_coord_only = false;
 +      PGXCNodeAllHandles *pgxc_handles = NULL;
 +
 +      /*
 +       * If query is launched only on Coordinators, we have to inform get_handles
 +       * not to ask for Datanode connections even if list of Datanodes is NIL.
 +       */
 +      if (exec_type == EXEC_ON_COORDS)
 +              is_query_coord_only = true;
 +
 +      if (exec_type == EXEC_ON_CURRENT)
 +              return get_current_handles();
 +
 +      if (exec_nodes)
 +      {
 +              if (exec_nodes->en_expr)
 +              {
 +                      /* execution time determining of target Datanodes */
 +                      bool isnull;
 +                      ExprState *estate = ExecInitExpr(exec_nodes->en_expr,
 +                                                                                       (PlanState *) planstate);
 +                      Datum partvalue = ExecEvalExpr(estate,
 +                                                                                 planstate->combiner.ss.ps.ps_ExprContext,
-       if (IS_PGXC_LOCAL_COORDINATOR && MyXactAccessedTempRel)
++                                                                                 &isnull);
 +                      RelationLocInfo *rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid);
 +                      /* PGXCTODO what is the type of partvalue here */
 +                      ExecNodes *nodes = GetRelationNodes(rel_loc_info,
 +                                                                                              partvalue,
 +                                                                                              isnull,
 +                                                                                              exec_nodes->accesstype);
 +                      /*
 +                       * en_expr is set by pgxc_set_en_expr only for distributed
 +                       * relations while planning DMLs, hence a select for update
 +                       * on a replicated table here is an assertion
 +                       */
 +                      Assert(!(exec_nodes->accesstype == RELATION_ACCESS_READ_FOR_UPDATE &&
 +                                              IsRelationReplicated(rel_loc_info)));
 +
 +                      if (nodes)
 +                      {
 +                              nodelist = nodes->nodeList;
 +                              primarynode = nodes->primarynodelist;
 +                              pfree(nodes);
 +                      }
 +                      FreeRelationLocInfo(rel_loc_info);
 +              }
 +              else if (OidIsValid(exec_nodes->en_relid))
 +              {
 +                      RelationLocInfo *rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid);
 +                      ExecNodes *nodes = GetRelationNodes(rel_loc_info, 0, true, exec_nodes->accesstype);
 +
 +                      /*
 +                       * en_relid is set only for DMLs, hence a select for update on a
 +                       * replicated table here is an assertion
 +                       */
 +                      Assert(!(exec_nodes->accesstype == RELATION_ACCESS_READ_FOR_UPDATE &&
 +                                              IsRelationReplicated(rel_loc_info)));
 +
 +                      /* Use the obtained list for given table */
 +                      if (nodes)
 +                              nodelist = nodes->nodeList;
 +
 +                      /*
 +                       * Special handling for ROUND ROBIN distributed tables. The target
 +                       * node must be determined at the execution time
 +                       */
 +                      if (rel_loc_info->locatorType == LOCATOR_TYPE_RROBIN && nodes)
 +                      {
 +                              nodelist = nodes->nodeList;
 +                              primarynode = nodes->primarynodelist;
 +                      }
 +                      else if (nodes)
 +                      {
 +                              if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES)
 +                              {
 +                                      nodelist = exec_nodes->nodeList;
 +                                      primarynode = exec_nodes->primarynodelist;
 +                              }
 +                      }
 +
 +                      if (nodes)
 +                              pfree(nodes);
 +                      FreeRelationLocInfo(rel_loc_info);
 +              }
 +              else
 +              {
 +                      if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES)
 +                              nodelist = exec_nodes->nodeList;
 +                      else if (exec_type == EXEC_ON_COORDS)
 +                              coordlist = exec_nodes->nodeList;
 +
 +                      primarynode = exec_nodes->primarynodelist;
 +              }
 +      }
 +
 +      /* Set node list and DN number */
 +      if (list_length(nodelist) == 0 &&
 +              (exec_type == EXEC_ON_ALL_NODES ||
 +               exec_type == EXEC_ON_DATANODES))
 +      {
 +              /* Primary connection is included in this number of connections if it exists */
 +              dn_conn_count = NumDataNodes;
 +      }
 +      else
 +      {
 +              if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES)
 +              {
 +                      if (primarynode)
 +                              dn_conn_count = list_length(nodelist) + 1;
 +                      else
 +                              dn_conn_count = list_length(nodelist);
 +              }
 +              else
 +                      dn_conn_count = 0;
 +      }
 +
 +      /* Set Coordinator list and Coordinator number */
 +      if ((list_length(nodelist) == 0 && exec_type == EXEC_ON_ALL_NODES) ||
 +              (list_length(coordlist) == 0 && exec_type == EXEC_ON_COORDS))
 +      {
 +              coordlist = GetAllCoordNodes();
 +              co_conn_count = list_length(coordlist);
 +      }
 +      else
 +      {
 +              if (exec_type == EXEC_ON_COORDS)
 +                      co_conn_count = list_length(coordlist);
 +              else
 +                      co_conn_count = 0;
 +      }
 +
 +      /* Get other connections (non-primary) */
 +      pgxc_handles = get_handles(nodelist, coordlist, is_query_coord_only, is_global_session);
 +      if (!pgxc_handles)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                               errmsg("Could not obtain connection from pool")));
 +
 +      /* Get connection for primary node, if used */
 +      if (primarynode)
 +      {
 +              /* Let's assume primary connection is always a Datanode connection for the moment */
 +              PGXCNodeAllHandles *pgxc_conn_res;
 +              pgxc_conn_res = get_handles(primarynode, NULL, false, is_global_session);
 +
 +              /* primary connection is unique */
 +              primaryconnection = pgxc_conn_res->datanode_handles[0];
 +
 +              pfree(pgxc_conn_res);
 +
 +              if (!primaryconnection)
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Could not obtain connection from pool")));
 +              pgxc_handles->primary_handle = primaryconnection;
 +      }
 +
 +      /* Depending on the execution type, we still need to save the initial node counts */
 +      pgxc_handles->dn_conn_count = dn_conn_count;
 +      pgxc_handles->co_conn_count = co_conn_count;
 +
 +      return pgxc_handles;
 +}
 +
 +
 +/*
 + * pgxc_start_command_on_connection
 + *      Send the command of a RemoteQuery step down a single connection.
 + *
 + * Sends the current command ID and, when provided, the snapshot, and then
 + * the query itself.  The extended query protocol is used when a prepared
 + * statement, a cursor or parameters are involved; otherwise the simple
 + * query message is sent.
 + *
 + * Returns true on success, false if sending any of the messages failed.
 + */
 +static bool
 +pgxc_start_command_on_connection(PGXCNodeHandle *connection,
 +                                                                      RemoteQueryState *remotestate,
 +                                                                      Snapshot snapshot)
 +{
 +      CommandId       cid;
 +      ResponseCombiner *combiner = (ResponseCombiner *) remotestate;
 +      RemoteQuery     *step = (RemoteQuery *) combiner->ss.ps.plan;
 +      CHECK_OWNERSHIP(connection, combiner);
 +
 +      elog(DEBUG5, "pgxc_start_command_on_connection - node %s, state %d",
 +                      connection->nodename, connection->state);
 +
 +      /*
 +       * Scan descriptor would be valid and would contain a valid snapshot
 +       * in cases when we need to send out of order command id to data node
 +       * e.g. in case of a fetch
 +       */
 +      cid = GetCurrentCommandId(false);
 +
 +      if (pgxc_node_send_cmd_id(connection, cid) < 0 )
 +              return false;
 +
 +      if (snapshot && pgxc_node_send_snapshot(connection, snapshot))
 +              return false;
 +      if (step->statement || step->cursor || remotestate->rqs_num_params)
 +      {
 +              /* need to use Extended Query Protocol */
 +              int     fetch = 0;
 +              bool    prepared = false;
 +              char    nodetype = PGXC_NODE_DATANODE;
 +
 +              /* if a prepared statement is referenced, see if it already
 +               * exists on this node */
 +              if (step->statement)
 +                      prepared =
 +                              ActivateDatanodeStatementOnNode(step->statement,
 +                                              PGXCNodeGetNodeId(connection->nodeoid,
 +                                                      &nodetype));
 +
 +              /*
 +               * execute and fetch rows only if they will be consumed
 +               * immediately by the sorter
 +               */
 +              if (step->cursor)
 +                      fetch = 1;
 +
 +              combiner->extended_query = true;
 +
 +              if (pgxc_node_send_query_extended(connection,
 +                                                      prepared ? NULL : step->sql_statement,
 +                                                      step->statement,
 +                                                      step->cursor,
 +                                                      remotestate->rqs_num_params,
 +                                                      remotestate->rqs_param_types,
 +                                                      remotestate->paramval_len,
 +                                                      remotestate->paramval_data,
 +                                                      step->has_row_marks ? true : step->read_only,
 +                                                      fetch) != 0)
 +                      return false;
 +      }
 +      else
 +      {
 +              combiner->extended_query = false;
 +              if (pgxc_node_send_query(connection, step->sql_statement) != 0)
 +                      return false;
 +      }
 +      return true;
 +}
 +
 +/*
 + * Execute utility statement on multiple Datanodes
 + * It does approximately the same as
 + *
 + * RemoteQueryState *state = ExecInitRemoteQuery(plan, estate, flags);
 + * Assert(TupIsNull(ExecRemoteQuery(state));
 + * ExecEndRemoteQuery(state)
 + *
 + * But does not need an Estate instance and does not do some unnecessary work,
 + * like allocating tuple slots.
 + */
 +void
 +ExecRemoteUtility(RemoteQuery *node)
 +{
 +      RemoteQueryState *remotestate;
 +      ResponseCombiner *combiner;
 +      bool            force_autocommit = node->force_autocommit;
 +      RemoteQueryExecType exec_type = node->exec_type;
 +      GlobalTransactionId gxid = InvalidGlobalTransactionId;
 +      Snapshot snapshot = NULL;
 +      PGXCNodeAllHandles *pgxc_connections;
 +      int                     co_conn_count;
 +      int                     dn_conn_count;
 +      bool            need_tran_block;
 +      ExecDirectType          exec_direct_type = node->exec_direct_type;
 +      int                     i;
 +      CommandId       cid = GetCurrentCommandId(true);        
 +
 +      if (!force_autocommit)
 +              RegisterTransactionLocalNode(true);
 +
 +      remotestate = makeNode(RemoteQueryState);
 +      combiner = (ResponseCombiner *)remotestate;
 +      InitResponseCombiner(combiner, 0, node->combine_type);
 +
 +      /*
 +       * Do not set global_session if it is a utility statement. 
 +       * Avoids CREATE NODE error on cluster configuration.
 +       */
 +      pgxc_connections = get_exec_connections(NULL, node->exec_nodes, exec_type, 
 +                                                                                      exec_direct_type != EXEC_DIRECT_UTILITY);
 +
 +      dn_conn_count = pgxc_connections->dn_conn_count;
 +      co_conn_count = pgxc_connections->co_conn_count;
 +      /* exit right away if no nodes to run command on */
 +      if (dn_conn_count == 0 && co_conn_count == 0)
 +      {
 +              pfree_pgxc_all_handles(pgxc_connections);
 +              return;
 +      }
 +
 +      if (force_autocommit)
 +              need_tran_block = false;
 +      else
 +              need_tran_block = true;
 +
 +      /* Commands launched through EXECUTE DIRECT do not need start a transaction */
 +      if (exec_direct_type == EXEC_DIRECT_UTILITY)
 +      {
 +              need_tran_block = false;
 +
 +              /* This check is not done when analyzing to limit dependencies */
 +              if (IsTransactionBlock())
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
 +                                       errmsg("cannot run EXECUTE DIRECT with utility inside a transaction block")));
 +      }
 +
 +      gxid = GetCurrentTransactionId();
 +      if (ActiveSnapshotSet())
 +              snapshot = GetActiveSnapshot();
 +      if (!GlobalTransactionIdIsValid(gxid))
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                               errmsg("Failed to get next transaction ID")));
 +
 +      {
 +              if (pgxc_node_begin(dn_conn_count, pgxc_connections->datanode_handles,
 +                                      gxid, need_tran_block, false, PGXC_NODE_DATANODE))
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Could not begin transaction on Datanodes")));
 +              for (i = 0; i < dn_conn_count; i++)
 +              {
 +                      PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i];
 +
 +                      if (conn->state == DN_CONNECTION_STATE_QUERY)
 +                              BufferConnection(conn);
 +                      if (snapshot && pgxc_node_send_snapshot(conn, snapshot))
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to send snapshot to Datanodes")));
 +                      }
 +                      if (pgxc_node_send_cmd_id(conn, cid) < 0)
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to send command ID to Datanodes")));
 +                      }
 +
 +                      if (pgxc_node_send_query(conn, node->sql_statement) != 0)
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to send command to Datanodes")));
 +                      }
 +              }
 +      }
 +
 +      {
 +              if (pgxc_node_begin(co_conn_count, pgxc_connections->coord_handles,
 +                                      gxid, need_tran_block, false, PGXC_NODE_COORDINATOR))
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Could not begin transaction on coordinators")));
 +              /* Now send it to Coordinators if necessary */
 +              for (i = 0; i < co_conn_count; i++)
 +              {
 +                      if (snapshot && pgxc_node_send_snapshot(pgxc_connections->coord_handles[i], snapshot))
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to send command to coordinators")));
 +                      }
 +                      if (pgxc_node_send_cmd_id(pgxc_connections->coord_handles[i], cid) < 0)
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to send command ID to Datanodes")));
 +                      }
 +
 +                      if (pgxc_node_send_query(pgxc_connections->coord_handles[i], node->sql_statement) != 0)
 +                      {
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to send command to coordinators")));
 +                      }
 +              }
 +      }
 +
 +      /*
 +       * Stop if all commands are completed or we got a data row and
 +       * initialized state node for subsequent invocations
 +       */
 +      {
 +              while (dn_conn_count > 0)
 +              {
 +                      int i = 0;
 +
 +                      if (pgxc_node_receive(dn_conn_count, pgxc_connections->datanode_handles, NULL))
 +                              break;
 +                      /*
 +                       * Handle input from the Datanodes.
 +                       * We do not expect Datanodes returning tuples when running utility
 +                       * command.
 +                       * If we got EOF, move to the next connection, will receive more
 +                       * data on the next iteration.
 +                       */
 +                      while (i < dn_conn_count)
 +                      {
 +                              PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i];
 +                              int res = handle_response(conn, combiner);
 +                              if (res == RESPONSE_EOF)
 +                              {
 +                                      i++;
 +                              }
 +                              else if (res == RESPONSE_COMPLETE)
 +                              {
 +                                      /* Ignore, wait for ReadyForQuery */
 +                              }
 +                              else if (res == RESPONSE_ERROR)
 +                              {
 +                                      /* Ignore, wait for ReadyForQuery */
 +                              }
 +                              else if (res == RESPONSE_READY)
 +                              {
 +                                      if (i < --dn_conn_count)
 +                                              pgxc_connections->datanode_handles[i] =
 +                                                      pgxc_connections->datanode_handles[dn_conn_count];
 +                              }
 +                              else if (res == RESPONSE_TUPDESC)
 +                              {
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Unexpected response from Datanode")));
 +                              }
 +                              else if (res == RESPONSE_DATAROW)
 +                              {
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Unexpected response from Datanode")));
 +                              }
 +                      }
 +              }
 +      }
 +
 +      /* Make the same for Coordinators */
 +      {
 +              while (co_conn_count > 0)
 +              {
 +                      int i = 0;
 +
 +                      if (pgxc_node_receive(co_conn_count, pgxc_connections->coord_handles, NULL))
 +                              break;
 +
 +                      while (i < co_conn_count)
 +                      {
 +                              int res = handle_response(pgxc_connections->coord_handles[i], combiner);
 +                              if (res == RESPONSE_EOF)
 +                              {
 +                                      i++;
 +                              }
 +                              else if (res == RESPONSE_COMPLETE)
 +                              {
 +                                      /* Ignore, wait for ReadyForQuery */
 +                              }
 +                              else if (res == RESPONSE_ERROR)
 +                              {
 +                                      /* Ignore, wait for ReadyForQuery */
 +                              }
 +                              else if (res == RESPONSE_READY)
 +                              {
 +                                      if (i < --co_conn_count)
 +                                              pgxc_connections->coord_handles[i] =
 +                                                       pgxc_connections->coord_handles[co_conn_count];
 +                              }
 +                              else if (res == RESPONSE_TUPDESC)
 +                              {
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Unexpected response from coordinator")));
 +                              }
 +                              else if (res == RESPONSE_DATAROW)
 +                              {
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Unexpected response from coordinator")));
 +                              }
 +                      }
 +              }
 +      }
 +
 +      /*
 +       * We have processed all responses from nodes and if we have
 +       * error message pending we can report it. All connections should be in
 +       * consistent state now and so they can be released to the pool after ROLLBACK.
 +       */
 +      pfree_pgxc_all_handles(pgxc_connections);
 +      pgxc_node_report_error(combiner);
 +}
 +
 +
 +/*
 + * PGXCNodeCleanAndRelease
 + *
 + * Called when the backend is ending.  The (int code, Datum arg) signature
 + * matches the on_proc_exit callback convention — presumably registered as an
 + * exit callback; neither argument is used here (TODO confirm registration
 + * site).  Releases external resources in order: pooler, GTM, statistics.
 + */
 +void
 +PGXCNodeCleanAndRelease(int code, Datum arg)
 +{
 +
 +      /* Disconnect from Pooler, if any connection is still held Pooler close it */
 +      PoolManagerDisconnect();
 +
 +      /* Close connection with GTM */
 +      CloseGTM();
 +
 +      /* Dump collected statistics to the log */
 +      stat_log();
 +}
 +
 +void
 +ExecCloseRemoteStatement(const char *stmt_name, List *nodelist)
 +{
 +      PGXCNodeAllHandles *all_handles;
 +      PGXCNodeHandle    **connections;
 +      ResponseCombiner        combiner;
 +      int                                     conn_count;
 +      int                             i;
 +
 +      /* Exit if nodelist is empty */
 +      if (list_length(nodelist) == 0)
 +              return;
 +
 +      /* get needed Datanode connections */
 +      all_handles = get_handles(nodelist, NIL, false, true);
 +      conn_count = all_handles->dn_conn_count;
 +      connections = all_handles->datanode_handles;
 +
 +      for (i = 0; i < conn_count; i++)
 +      {
 +              if (connections[i]->state == DN_CONNECTION_STATE_QUERY)
 +                      BufferConnection(connections[i]);
 +              if (pgxc_node_send_close(connections[i], true, stmt_name) != 0)
 +              {
 +                      /*
 +                       * statements are not affected by statement end, so consider
 +                       * unclosed statement on the Datanode as a fatal issue and
 +                       * force connection is discarded
 +                       */
 +                      PGXCNodeSetConnectionState(connections[i],
 +                                      DN_CONNECTION_STATE_ERROR_FATAL);
 +                      ereport(WARNING,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to close Datanode statemrnt")));
 +              }
 +              if (pgxc_node_send_sync(connections[i]) != 0)
 +              {
 +                      PGXCNodeSetConnectionState(connections[i],
 +                                      DN_CONNECTION_STATE_ERROR_FATAL);
 +                      ereport(WARNING,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to close Datanode statement")));
 +              }
 +              PGXCNodeSetConnectionState(connections[i], DN_CONNECTION_STATE_CLOSE);
 +      }
 +
 +      InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
 +      /*
 +       * Make sure there are zeroes in unused fields
 +       */
 +      memset(&combiner, 0, sizeof(ScanState));
 +
 +      while (conn_count > 0)
 +      {
 +              if (pgxc_node_receive(conn_count, connections, NULL))
 +              {
 +                      for (i = 0; i < conn_count; i++)
 +                              PGXCNodeSetConnectionState(connections[i],
 +                                              DN_CONNECTION_STATE_ERROR_FATAL);
 +
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to close Datanode statement")));
 +              }
 +              i = 0;
 +              while (i < conn_count)
 +              {
 +                      int res = handle_response(connections[i], &combiner);
 +                      if (res == RESPONSE_EOF)
 +                      {
 +                              i++;
 +                      }
 +                      else if (res == RESPONSE_READY ||
 +                                      connections[i]->state == DN_CONNECTION_STATE_ERROR_FATAL)
 +                      {
 +                              if (--conn_count > i)
 +                                      connections[i] = connections[conn_count];
 +                      }
 +              }
 +      }
 +
 +      ValidateAndCloseCombiner(&combiner);
 +      pfree_pgxc_all_handles(all_handles);
 +}
 +
 +/*
 + * DataNodeCopyInBinaryForAll
 + *
 + * In a COPY TO, send to all Datanodes PG_HEADER for a COPY TO in binary mode.
 + */
 +int
 +DataNodeCopyInBinaryForAll(char *msg_buf, int len, int conn_count,
 +                                                                        PGXCNodeHandle** connections)
 +{
 +      int             i;
 +      int msgLen = 4 + len;
 +      int nLen = htonl(msgLen);
 +
 +      for (i = 0; i < conn_count; i++)
 +      {
 +              PGXCNodeHandle *handle = connections[i];
 +              if (handle->state == DN_CONNECTION_STATE_COPY_IN)
 +              {
 +                      /* msgType + msgLen */
 +                      if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
 +                      {
 +                              ereport(ERROR,
 +                                      (errcode(ERRCODE_OUT_OF_MEMORY),
 +                                      errmsg("out of memory")));
 +                      }
 +
 +                      handle->outBuffer[handle->outEnd++] = 'd';
 +                      memcpy(handle->outBuffer + handle->outEnd, &nLen, 4);
 +                      handle->outEnd += 4;
 +                      memcpy(handle->outBuffer + handle->outEnd, msg_buf, len);
 +                      handle->outEnd += len;
 +              }
 +              else
 +              {
 +                      add_error_message(handle, "Invalid Datanode connection");
 +                      return EOF;
 +              }
 +      }
 +
 +      return 0;
 +}
 +
 +/*
 + * Encode parameter values to format of DataRow message (the same format is
 + * used in Bind) to prepare for sending down to Datanodes.
 + * The data row is copied to RemoteQueryState.paramval_data.
 + *
 + * Wire format produced here: uint16 parameter count (network byte order),
 + * then per parameter an int32 length (-1 for NULL) followed by the value's
 + * textual output representation.  Also fills rqs_num_params and
 + * rqs_param_types on rq_state, taken from the plan node when it already
 + * carries them, otherwise inferred from paraminfo.
 + */
 +void
 +SetDataRowForExtParams(ParamListInfo paraminfo, RemoteQueryState *rq_state)
 +{
 +      StringInfoData buf;
 +      uint16 n16;
 +      int i;
 +      int real_num_params = 0;
 +      RemoteQuery *node = (RemoteQuery*) rq_state->combiner.ss.ps.plan;
 +
 +      /* If there are no parameters, there is no data to BIND. */
 +      if (!paraminfo)
 +              return;
 +
 +      /* Caller must not have built a param data row for this state already */
 +      Assert(!rq_state->paramval_data);
 +
 +      /*
 +       * It is necessary to fetch parameters
 +       * before looking at the output value.
 +       */
 +      for (i = 0; i < paraminfo->numParams; i++)
 +      {
 +              ParamExternData *param;
 +
 +              param = &paraminfo->params[i];
 +
 +              /* Let the paramFetch hook supply missing parameters lazily */
 +              if (!OidIsValid(param->ptype) && paraminfo->paramFetch != NULL)
 +                      (*paraminfo->paramFetch) (paraminfo, i + 1);
 +
 +              /*
 +               * This is the last parameter found as useful, so we need
 +               * to include all the previous ones to keep silent the remote
 +               * nodes. All the parameters prior to the last usable having no
 +               * type available will be considered as NULL entries.
 +               */
 +              if (OidIsValid(param->ptype))
 +                      real_num_params = i + 1;
 +      }
 +
 +      /*
 +       * If there are no parameters available, simply leave.
 +       * This is possible in the case of a query called through SPI
 +       * and using no parameters.
 +       */
 +      if (real_num_params == 0)
 +      {
 +              rq_state->paramval_data = NULL;
 +              rq_state->paramval_len = 0;
 +              return;
 +      }
 +
 +      initStringInfo(&buf);
 +
 +      /* Number of parameter values */
 +      n16 = htons(real_num_params);
 +      appendBinaryStringInfo(&buf, (char *) &n16, 2);
 +
 +      /* Parameter values */
 +      for (i = 0; i < real_num_params; i++)
 +      {
 +              ParamExternData *param = &paraminfo->params[i];
 +              uint32 n32;
 +
 +              /*
 +               * Parameters with no types are considered as NULL and treated as integer
 +               * The same trick is used for dropped columns for remote DML generation.
 +               */
 +              if (param->isnull || !OidIsValid(param->ptype))
 +              {
 +                      /* NULL is encoded as length -1 with no value bytes */
 +                      n32 = htonl(-1);
 +                      appendBinaryStringInfo(&buf, (char *) &n32, 4);
 +              }
 +              else
 +              {
 +                      Oid             typOutput;
 +                      bool    typIsVarlena;
 +                      Datum   pval;
 +                      char   *pstring;
 +                      int             len;
 +
 +                      /* Get info needed to output the value */
 +                      getTypeOutputInfo(param->ptype, &typOutput, &typIsVarlena);
 +
 +                      /*
 +                       * If we have a toasted datum, forcibly detoast it here to avoid
 +                       * memory leakage inside the type's output routine.
 +                       */
 +                      if (typIsVarlena)
 +                              pval = PointerGetDatum(PG_DETOAST_DATUM(param->value));
 +                      else
 +                              pval = param->value;
 +
 +                      /* Convert Datum to string */
 +                      pstring = OidOutputFunctionCall(typOutput, pval);
 +
 +                      /* copy data to the buffer */
 +                      len = strlen(pstring);
 +                      n32 = htonl(len);
 +                      appendBinaryStringInfo(&buf, (char *) &n32, 4);
 +                      appendBinaryStringInfo(&buf, pstring, len);
 +              }
 +      }
 +
 +
 +      /*
 +       * If parameter types are not already set, infer them from
 +       * the paraminfo.
 +       */
 +      if (node->rq_num_params > 0)
 +      {
 +              /*
 +               * Use the already known param types for BIND. Parameter types
 +               * can be already known when the same plan is executed multiple
 +               * times.
 +               */
 +              if (node->rq_num_params != real_num_params)
 +                      elog(ERROR, "Number of user-supplied parameters do not match "
 +                                              "the number of remote parameters");
 +              rq_state->rqs_num_params = node->rq_num_params;
 +              rq_state->rqs_param_types = node->rq_param_types;
 +      }
 +      else
 +      {
 +              rq_state->rqs_num_params = real_num_params;
 +              rq_state->rqs_param_types = (Oid *) palloc(sizeof(Oid) * real_num_params);
 +              for (i = 0; i < real_num_params; i++)
 +                      rq_state->rqs_param_types[i] = paraminfo->params[i].ptype;
 +      }
 +
 +      /* Assign the newly allocated data row to paramval */
 +      rq_state->paramval_data = buf.data;
 +      rq_state->paramval_len = buf.len;
 +}
 +
 +/*
 + * Clear per transaction remote information.
 + * Invoked at end of transaction; currently only resets the session parameter
 + * state tracked for remote nodes (the 'true' argument presumably forces a
 + * full reset — confirm against PGXCNodeResetParams).
 + */
 +void
 +AtEOXact_Remote(void)
 +{
 +      PGXCNodeResetParams(true);
 +}
 +
 +/*
 + * Invoked when local transaction is about to be committed.
 + * If nodestring is specified commit specified prepared transaction on remote
 + * nodes, otherwise commit remote nodes which are in transaction.
 + * When GTM statistics logging is enabled, usage is measured around the call.
 + */
 +void
 +PreCommit_Remote(char *prepareGID, char *nodestring, bool preparedLocalNode)
 +{
 +      struct rusage           start_r;
 +      struct timeval          start_t;
 +
 +      if (log_gtm_stats)
 +              ResetUsageCommon(&start_r, &start_t);
 +
 +      /*
 +       * Make node connections persistent if we are committing transaction
 +       * that touched temporary tables. We never drop that flag, so after some
 +       * transaction has created a temp table the session's remote connections
 +       * become persistent.
 +       * We do not need to set that flag if transaction that has created a temp
 +       * table finally aborts - remote connections are not holding temporary
 +       * objects in this case.
 +       */
 -       if (MyXactAccessedTempRel)
 ++      if (IS_PGXC_LOCAL_COORDINATOR &&
 ++              (MyXactFlags & XACT_FLAGS_ACCESSEDTEMPREL))
 +              temp_object_included = true;
 +
 +
 +      /*
 +       * OK, everything went fine. At least one remote node is in PREPARED state
 +       * and the transaction is successfully prepared on all the involved nodes.
 +       * Now we are ready to commit the transaction. We need a new GXID to send
 +       * down the remote nodes to execute the forthcoming COMMIT PREPARED
 +       * command. So grab one from the GTM and track it. It will be closed along
 +       * with the main transaction at the end.
 +       */
 +      if (nodestring)
 +      {
 +              /* A prepared transaction exists remotely: finish it with 2PC */
 +              Assert(preparedLocalNode);
 +              pgxc_node_remote_finish(prepareGID, true, nodestring,
 +                                                              GetAuxilliaryTransactionId(),
 +                                                              GetTopGlobalTransactionId());
 +
 +      }
 +      else
 +              pgxc_node_remote_commit();
 +
 +      if (log_gtm_stats)
 +              ShowUsageCommon("PreCommit_Remote", &start_r, &start_t);
 +}
 +
 +/*
 + * Do abort processing for the transaction. We must abort the transaction on
 + * all the involved nodes. If a node has already prepared a transaction, we run
 + * ROLLBACK PREPARED command on the node. Otherwise, a simple ROLLBACK command
 + * is sufficient.
 + *
 + * We must guard against the case when a transaction is prepared successfully
 + * on all the nodes and some error occurs after we send a COMMIT PREPARED
 + * message to at least one node. Such a transaction must not be aborted to
 + * preserve global consistency. We handle this case by recording the nodes
 + * involved in the transaction at the GTM and keep the transaction open at the
 + * GTM so that it's reported as "in-progress" on all the nodes until resolved.
 + */
 +bool
 +PreAbort_Remote(void)
 +{
 +      /*
 +       * We are about to abort current transaction, and there could be an
 +       * unexpected error leaving the node connection in some state requiring
 +       * clean up, like COPY or pending query results.
 +       * If we are running copy we should send down CopyFail message and read
 +       * all possible incoming messages, there could be copy rows (if running
 +       * COPY TO) ErrorResponse, ReadyForQuery.
 +       * If there are pending results (connection state is DN_CONNECTION_STATE_QUERY)
 +       * we just need to read them in and discard, all necessary commands are
 +       * already sent. The end of input could be CommandComplete or
 +       * PortalSuspended, in either case subsequent ROLLBACK closes the portal.
 +       */
 +      PGXCNodeAllHandles *all_handles;
 +      /* variable-length arrays sized by the current cluster node counts */
 +      PGXCNodeHandle     *clean_nodes[NumCoords + NumDataNodes];
 +      int                                     node_count = 0;
 +      int                                     cancel_dn_count = 0, cancel_co_count = 0;
 +      int                                     cancel_dn_list[NumDataNodes];
 +      int                                     cancel_co_list[NumCoords];
 +      int                             i;
 +      struct rusage           start_r;
 +      struct timeval          start_t;
 +
 +      if (log_gtm_stats)
 +              ResetUsageCommon(&start_r, &start_t);
 +
 +      all_handles = get_current_handles();
 +      /*
 +       * Find "dirty" coordinator connections.
 +       * COPY is never running on a coordinator connections, we just check for
 +       * pending data.
 +       */
 +      for (i = 0; i < all_handles->co_conn_count; i++)
 +      {
 +              PGXCNodeHandle *handle = all_handles->coord_handles[i];
 +
 +              if (handle->state == DN_CONNECTION_STATE_QUERY)
 +              {
 +                      /*
 +                       * Forget previous combiner if any since input will be handled by
 +                       * different one.
 +                       */
 +                      handle->combiner = NULL;
 +                      clean_nodes[node_count++] = handle;
 +                      cancel_co_list[cancel_co_count++] = i;
 +              }
 +      }
 +
 +      /*
 +       * The same for data nodes, but cancel COPY if it is running.
 +       */
 +      for (i = 0; i < all_handles->dn_conn_count; i++)
 +      {
 +              PGXCNodeHandle *handle = all_handles->datanode_handles[i];
 +
 +              if (handle->state == DN_CONNECTION_STATE_QUERY)
 +              {
 +                      /*
 +                       * Forget previous combiner if any since input will be handled by
 +                       * different one.
 +                       */
 +                      handle->combiner = NULL;
 +                      clean_nodes[node_count++] = handle;
 +                      cancel_dn_list[cancel_dn_count++] = i;
 +              }
 +              else if (handle->state == DN_CONNECTION_STATE_COPY_IN ||
 +                              handle->state == DN_CONNECTION_STATE_COPY_OUT)
 +              {
 +                      /* Terminate the in-progress COPY before draining the connection */
 +                      DataNodeCopyEnd(handle, true);
 +                      /*
 +                       * Forget previous combiner if any since input will be handled by
 +                       * different one.
 +                       */
 +                      handle->combiner = NULL;
 +                      clean_nodes[node_count++] = handle;
 +                      cancel_dn_list[cancel_dn_count++] = i;
 +              }
 +      }
 +
 +      /*
 +       * Cancel running queries on the datanodes and the coordinators.
 +       */
 +      PoolManagerCancelQuery(cancel_dn_count, cancel_dn_list, cancel_co_count,
 +                      cancel_co_list);
 +
 +      /*
 +       * Now read and discard any data from the connections found "dirty"
 +       */
 +      if (node_count > 0)
 +      {
 +              ResponseCombiner combiner;
 +
 +              InitResponseCombiner(&combiner, node_count, COMBINE_TYPE_NONE);
 +              /*
 +               * Make sure there are zeroes in unused fields
 +               *
 +               * NOTE(review): memset runs after Init and clears only the leading
 +               * sizeof(ScanState) bytes — relies on ss being the first member;
 +               * confirm InitResponseCombiner leaves ss untouched.
 +               */
 +              memset(&combiner, 0, sizeof(ScanState));
 +              combiner.connections = clean_nodes;
 +              combiner.conn_count = node_count;
 +              combiner.request_type = REQUEST_TYPE_ERROR;
 +
 +              pgxc_connections_cleanup(&combiner);
 +
 +              /* prevent pfree'ing local variable */
 +              combiner.connections = NULL;
 +
 +              CloseCombiner(&combiner);
 +      }
 +
 +      pgxc_node_remote_abort();
 +
 +      /*
 +       * Drop the connections to ensure aborts are handled properly.
 +       *
 +       * XXX We should really be consulting PersistentConnections parameter and
 +       * keep the connections if its set. But as a short term measure, to address
 +       * certain issues for aborted transactions, we drop the connections.
 +       * Revisit and fix the issue
 +       */
 +      elog(DEBUG5, "temp_object_included %d", temp_object_included);
 +      if (!temp_object_included)
 +      {
 +              /* Clean up remote sessions */
 +              pgxc_node_remote_cleanup_all();
 +              release_handles();
 +      }
 +
 +      pfree_pgxc_all_handles(all_handles);
 +
 +      if (log_gtm_stats)
 +              ShowUsageCommon("PreAbort_Remote", &start_r, &start_t);
 +
 +      return true;
 +}
 +
 +
 +/*
 + * Invoked when local transaction is about to be prepared.
 + * If invoked on a Datanode just commit transaction on remote connections,
 + * since secondary sessions are read only and never need to be prepared.
 + * Otherwise run PREPARE on remote connections, where writable commands were
 + * sent (connections marked as not read-only).
 + * If that is explicit PREPARE (issued by client) notify GTM.
 + * In case of implicit PREPARE not involving local node (ex. caused by
 + * INSERT, UPDATE or DELETE) commit prepared transaction immediately.
 + * Return list of node names where transaction was actually prepared, include
 + * the name of the local node if localNode is true.  Returns NULL when nothing
 + * remains for the caller to commit.
 + */
 +char *
 +PrePrepare_Remote(char *prepareGID, bool localNode, bool implicit)
 +{
 +      /* Always include local node if running explicit prepare */
 +      char *nodestring;
 +      struct rusage           start_r;
 +      struct timeval          start_t;
 +
 +      if (log_gtm_stats)
 +              ResetUsageCommon(&start_r, &start_t);
 +
 +      /*
 +       * Primary session is doing 2PC, just commit secondary processes and exit
 +       */
 +      if (IS_PGXC_DATANODE)
 +      {
 +              pgxc_node_remote_commit();
 +              return NULL;
 +      }
 +
 +      /* Prepare remotely; include local node for explicit PREPARE */
 +      nodestring = pgxc_node_remote_prepare(prepareGID,
 +                                                                                              !implicit || localNode);
 +
 +      if (!implicit && IS_PGXC_LOCAL_COORDINATOR)
 +              /* Save the node list and gid on GTM. */
 +              StartPreparedTranGTM(GetTopGlobalTransactionId(), prepareGID,
 +                                                       nodestring);
 +
 +      /*
 +       * If no need to commit on local node go ahead and commit prepared
 +       * transaction right away.
 +       */
 +      if (implicit && !localNode && nodestring)
 +      {
 +              pgxc_node_remote_finish(prepareGID, true, nodestring,
 +                                                              GetAuxilliaryTransactionId(),
 +                                                              GetTopGlobalTransactionId());
 +              pfree(nodestring);
 +              nodestring = NULL;
 +      }
 +
 +      if (log_gtm_stats)
 +              ShowUsageCommon("PrePrepare_Remote", &start_r, &start_t);
 +
 +      return nodestring;
 +}
 +
 +/*
 + * Invoked immediately after local node is prepared.
 + * Notify GTM about completed prepare.  Only explicit PREPARE (issued by the
 + * client) is reported to GTM; implicit 2PC skips the notification.
 + */
 +void
 +PostPrepare_Remote(char *prepareGID, bool implicit)
 +{
 +      struct rusage           start_r;
 +      struct timeval          start_t;
 +
 +      if (log_gtm_stats)
 +              ResetUsageCommon(&start_r, &start_t);
 +
 +      if (!implicit)
 +              PrepareTranGTM(GetTopGlobalTransactionId());
 +
 +      if (log_gtm_stats)
 +              ShowUsageCommon("PostPrepare_Remote", &start_r, &start_t);
 +}
 +
 +/*
 + * Returns true if 2PC is required for consistent commit: if there was write
 + * activity on two or more nodes within current transaction.
 + * localWrite counts the local node as one writer.  Returns false on
 + * Datanodes, for temp-object transactions, and when no XID was assigned.
 + */
 +bool
 +IsTwoPhaseCommitRequired(bool localWrite)
 +{
 +      PGXCNodeAllHandles *handles;
 +      bool                            found = localWrite;
 +      int                             i;
 +
 +      /* Never run 2PC on Datanode-to-Datanode connection */
 +      if (IS_PGXC_DATANODE)
 +              return false;
 +
 -       combiner->ss.ps.qual = NIL;
 ++      if (MyXactFlags & XACT_FLAGS_ACCESSEDTEMPREL)
 +      {
 +              elog(DEBUG1, "Transaction accessed temporary objects - "
 +                              "2PC will not be used and that can lead to data inconsistencies "
 +                              "in case of failures");
 +              return false;
 +      }
 +
 +      /*
 +       * If no XID assigned, no need to run 2PC since neither coordinator nor any
 +       * remote nodes did write operation
 +       */
 +      if (!TransactionIdIsValid(GetTopTransactionIdIfAny()))
 +              return false;
 +
 +      /* 'T' transaction status means the connection is inside a transaction */
 +      handles = get_current_handles();
 +      for (i = 0; i < handles->dn_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->datanode_handles[i];
 +              if (conn->sock != NO_SOCKET && !conn->read_only &&
 +                              conn->transaction_status == 'T')
 +              {
 +                      if (found)
 +                              return true; /* second found */
 +                      else
 +                              found = true; /* first found */
 +              }
 +      }
 +      for (i = 0; i < handles->co_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = handles->coord_handles[i];
 +              if (conn->sock != NO_SOCKET && !conn->read_only &&
 +                              conn->transaction_status == 'T')
 +              {
 +                      if (found)
 +                              return true; /* second found */
 +                      else
 +                              found = true; /* first found */
 +              }
 +      }
 +      return false;
 +}
 +
 +/*
 + * Execute COMMIT/ABORT PREPARED issued by the remote client on remote nodes.
 + * Contacts GTM for the list of involved nodes and for work complete
 + * notification. Returns true if prepared transaction on local node needs to be
 + * finished too.
 + */
 +bool
 +FinishRemotePreparedTransaction(char *prepareGID, bool commit)
 +{
 +      char                               *nodestring;
 +      GlobalTransactionId             gxid, prepare_gxid;
 +      bool                                    prepared_local = false;
 +
 +      /*
 +       * Get the list of nodes involved in this transaction.
 +       *
 +       * This function returns the GXID of the prepared transaction. It also
 +       * returns a fresh GXID which can be used for running COMMIT PREPARED
 +       * commands on the remote nodes. Both these GXIDs can then be either
 +       * committed or aborted together.
 +       *
 +       * XXX While I understand that we get the prepared and a new GXID with a
 +       * single call, it doesn't look nice and creates confusion. We should
 +       * probably split them into two parts. This is used only for explicit 2PC
 +       * which should not be very common in XC
 +       *
 +       * In xc_maintenance_mode mode, we don't fail if the GTM does not have
 +       * knowledge about the prepared transaction. That may happen for various
 +       * reasons such that an earlier attempt cleaned it up from GTM or GTM was
 +       * restarted in between. The xc_maintenance_mode is a kludge to come out of
 +       * such situations. So it seems alright to not be too strict about the
 +       * state
 +       */
 +      if ((GetGIDDataGTM(prepareGID, &gxid, &prepare_gxid, &nodestring) < 0) &&
 +              !xc_maintenance_mode)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                               errmsg("prepared transaction with identifier \"%s\" does not exist",
 +                                              prepareGID)));
 +
 +      /*
 +       * Please note that with xc_maintenance_mode = on, COMMIT/ROLLBACK PREPARED will not
 +       * propagate to remote nodes. Only GTM status is cleaned up.
 +       */
 +      if (xc_maintenance_mode)
 +      {
 +              if (commit)
 +              {
 +                      pgxc_node_remote_commit();
 +                      CommitPreparedTranGTM(prepare_gxid, gxid, 0, NULL);
 +              }
 +              else
 +              {
 +                      pgxc_node_remote_abort();
 +                      /* Both the prepared GXID and the auxiliary GXID are rolled back */
 +                      RollbackTranGTM(prepare_gxid);
 +                      RollbackTranGTM(gxid);
 +              }
 +              return false;
 +      }
 +
 +      /* Finish on remote nodes; tells us if the local node was also involved */
 +      prepared_local = pgxc_node_remote_finish(prepareGID, commit, nodestring,
 +                                                                                       gxid, prepare_gxid);
 +
 +      if (commit)
 +      {
 +              /*
 +               * XXX For explicit 2PC, there will be enough delay for any
 +               * waited-committed transactions to send a final COMMIT message to the
 +               * GTM.
 +               */
 +              CommitPreparedTranGTM(prepare_gxid, gxid, 0, NULL);
 +      }
 +      else
 +      {
 +              RollbackTranGTM(prepare_gxid);
 +              RollbackTranGTM(gxid);
 +      }
 +
 +      return prepared_local;
 +}
 +
 +
 +/*
 + * pgxc_node_remote_finish
 + *
 + * Complete previously prepared transactions on remote nodes by running
 + * COMMIT PREPARED or ROLLBACK PREPARED on every node named in nodestring.
 + * Release remote connections after completion.
 + *
 + * prepareGID   - GID the transaction was prepared under; interpolated into
 + *                the finish command.
 + * commit       - true: COMMIT PREPARED; false: ROLLBACK PREPARED.
 + * nodestring   - comma-separated node names; NB: modified in place by strtok().
 + * gxid         - GXID of the finishing transaction; sent to each node before
 + *                the finish command.
 + * prepare_gxid - GXID assigned at PREPARE time; not referenced in this body
 + *                (presumably kept for symmetry with callers — TODO confirm).
 + *
 + * Returns true if the local node is itself listed in nodestring, i.e. the
 + * caller must also finish the prepared transaction locally.
 + */
 +static bool
 +pgxc_node_remote_finish(char *prepareGID, bool commit,
 +                                              char *nodestring, GlobalTransactionId gxid,
 +                                              GlobalTransactionId prepare_gxid)
 +{
 +      char                       *finish_cmd;
 +      PGXCNodeHandle     *connections[MaxCoords + MaxDataNodes];
 +      int                                     conn_count = 0;
 +      ResponseCombiner        combiner;
 +      PGXCNodeAllHandles *pgxc_handles;
 +      bool                            prepared_local = false;
 +      char                       *nodename;
 +      List                       *nodelist = NIL;
 +      List                       *coordlist = NIL;
 +      int                                     i;
 +      /*
 +       * Now based on the nodestring, run COMMIT/ROLLBACK PREPARED command on the
 +       * remote nodes and also finish the transaction locally if required
 +       */
 +      nodename = strtok(nodestring, ",");
 +      while (nodename != NULL)
 +      {
 +              int             nodeIndex;
 +              char    nodetype;
 +
 +              /* Get node type and index */
 +              nodetype = PGXC_NODE_NONE;
 +              nodeIndex = PGXCNodeGetNodeIdFromName(nodename, &nodetype);
 +              if (nodetype == PGXC_NODE_NONE)
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_UNDEFINED_OBJECT),
 +                                       errmsg("PGXC Node %s: object not defined",
 +                                                      nodename)));
 +
 +              /* Check whether the requested node is the self-node or not */
 +              if (nodetype == PGXC_NODE_COORDINATOR)
 +              {
 +                      if (nodeIndex == PGXCNodeId - 1)
 +                              prepared_local = true;
 +                      else
 +                              coordlist = lappend_int(coordlist, nodeIndex);
 +              }
 +              else
 +                      nodelist = lappend_int(nodelist, nodeIndex);
 +
 +              nodename = strtok(NULL, ",");
 +      }
 +
 +      /* No remote nodes involved; nothing to send */
 +      if (nodelist == NIL && coordlist == NIL)
 +              return prepared_local;
 +
 +      pgxc_handles = get_handles(nodelist, coordlist, false, true);
 +
 +      /* 64 bytes comfortably covers the command text surrounding the GID */
 +      finish_cmd = (char *) palloc(64 + strlen(prepareGID));
 +
 +      if (commit)
 +              sprintf(finish_cmd, "COMMIT PREPARED '%s'", prepareGID);
 +      else
 +              sprintf(finish_cmd, "ROLLBACK PREPARED '%s'", prepareGID);
 +
 +      /* Ship GXID and finish command to each involved datanode */
 +      for (i = 0; i < pgxc_handles->dn_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = pgxc_handles->datanode_handles[i];
 +
 +              if (pgxc_node_send_gxid(conn, gxid))
 +              {
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("failed to send GXID for %s PREPARED command",
 +                                                      commit ? "COMMIT" : "ROLLBACK")));
 +              }
 +
 +              if (pgxc_node_send_query(conn, finish_cmd))
 +              {
 +                      /*
 +                       * Do not bother with clean up, just bomb out. The error handler
 +                       * will invoke RollbackTransaction which will do the work.
 +                       */
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("failed to send %s PREPARED command to the node %u",
 +                                                      commit ? "COMMIT" : "ROLLBACK", conn->nodeoid)));
 +              }
 +              else
 +              {
 +                      /* Read responses from these */
 +                      connections[conn_count++] = conn;
 +              }
 +      }
 +
 +      /* Likewise for each involved remote coordinator */
 +      for (i = 0; i < pgxc_handles->co_conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = pgxc_handles->coord_handles[i];
 +
 +              if (pgxc_node_send_gxid(conn, gxid))
 +              {
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("failed to send GXID for %s PREPARED command",
 +                                                      commit ? "COMMIT" : "ROLLBACK")));
 +              }
 +
 +              if (pgxc_node_send_query(conn, finish_cmd))
 +              {
 +                      /*
 +                       * Do not bother with clean up, just bomb out. The error handler
 +                       * will invoke RollbackTransaction which will do the work.
 +                       */
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("failed to send %s PREPARED command to the node %u",
 +                                                      commit ? "COMMIT" : "ROLLBACK", conn->nodeoid)));
 +              }
 +              else
 +              {
 +                      /* Read responses from these */
 +                      connections[conn_count++] = conn;
 +              }
 +      }
 +
 +      /* Collect and validate responses from all nodes that accepted the command */
 +      if (conn_count)
 +      {
 +              InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
 +              /* Receive responses */
 +              if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) ||
 +                              !validate_combiner(&combiner))
 +              {
 +                      if (combiner.errorMessage)
 +                              pgxc_node_report_error(&combiner);
 +                      else
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to COMMIT the transaction on one or more nodes")));
 +              }
 +              else
 +                      CloseCombiner(&combiner);
 +      }
 +
 +      if (!temp_object_included && !PersistentConnections)
 +      {
 +              /* Clean up remote sessions */
 +              pgxc_node_remote_cleanup_all();
 +              release_handles();
 +      }
 +
 +      pfree_pgxc_all_handles(pgxc_handles);
 +      pfree(finish_cmd);
 +
 +      return prepared_local;
 +}
 +
 +/*****************************************************************************
 + *
 + * Simplified versions of ExecInitRemoteQuery, ExecRemoteQuery and
 + * ExecEndRemoteQuery: in XCP they are only used to execute simple queries.
 + *
 + *****************************************************************************/
 +
 +/*
 + * ExecInitRemoteQuery
 + *              Build a RemoteQueryState for the given RemoteQuery plan node:
 + *              initialize its embedded ResponseCombiner, result slot/type, and
 + *              external-parameter data, and set up an expression context only
 + *              when the execution-node expression must actually be evaluated.
 + */
 +RemoteQueryState *
 +ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags)
 +{
 +      RemoteQueryState   *remotestate;
 +      ResponseCombiner   *combiner;
 +
 +      remotestate = makeNode(RemoteQueryState);
 +      combiner = (ResponseCombiner *) remotestate;
 +      InitResponseCombiner(combiner, 0, node->combine_type);
 +      combiner->ss.ps.plan = (Plan *) node;
 +      combiner->ss.ps.state = estate;
 +
 -                                                                         true, resultslot, NULL))
++      combiner->ss.ps.qual = NULL;
 +      /* qual is explicitly cleared: this simplified executor applies no quals */
 +
 +      combiner->request_type = REQUEST_TYPE_QUERY;
 +
 +      ExecInitResultTupleSlot(estate, &combiner->ss.ps);
 +      ExecAssignResultTypeFromTL((PlanState *) remotestate);
 +
 +      /*
 +       * If there are parameters supplied, get them into a form to be sent to the
 +       * Datanodes with bind message. We should not have had done this before.
 +       */
 +      SetDataRowForExtParams(estate->es_param_list_info, remotestate);
 +
 +      /* We need expression context to evaluate */
 +      if (node->exec_nodes && node->exec_nodes->en_expr)
 +      {
 +              Expr *expr = node->exec_nodes->en_expr;
 +
 +              if (IsA(expr, Var) && ((Var *) expr)->vartype == TIDOID)
 +              {
 +                      /* Special case if expression does not need to be evaluated */
 +              }
 +              else
 +              {
 +                      /* prepare expression evaluation */
 +                      ExecAssignExprContext(estate, &combiner->ss.ps);
 +              }
 +      }
 +
 +      return remotestate;
 +}
 +
 +
 +/*
 + * Execute step of PGXC plan.
 + * The step specifies a command to be executed on specified nodes.
 + * On first invocation connections to the data nodes are initialized and
 + * command is executed. Further, as well as within subsequent invocations,
 + * responses are received until step is completed or there is a tuple to emit.
 + * If there is a tuple it is returned, otherwise NULL is returned. The NULL
 + * result from the function indicates a completed step.
 + * The function returns at most one tuple per invocation.
 + */
 +TupleTableSlot *
 +ExecRemoteQuery(RemoteQueryState *node)
 +{
 +      ResponseCombiner *combiner = (ResponseCombiner *) node;
 +      RemoteQuery    *step = (RemoteQuery *) combiner->ss.ps.plan;
 +      TupleTableSlot *resultslot = combiner->ss.ps.ps_ResultTupleSlot;
 +
 +      /* First invocation: acquire connections and ship the query */
 +      if (!node->query_Done)
 +      {
 +              GlobalTransactionId gxid = InvalidGlobalTransactionId;
 +              Snapshot                snapshot = GetActiveSnapshot();
 +              PGXCNodeHandle **connections = NULL;
 +              PGXCNodeHandle *primaryconnection = NULL;
 +              int                             i;
 +              int                             regular_conn_count = 0;
 +              int                             total_conn_count = 0;
 +              bool                    need_tran_block;
 +              PGXCNodeAllHandles *pgxc_connections;
 +
 +              /*
 +               * Get connections for Datanodes only, utilities and DDLs
 +               * are launched in ExecRemoteUtility
 +               */
 +              pgxc_connections = get_exec_connections(node, step->exec_nodes,
 +                                                                                              step->exec_type,
 +                                                                                              true);
 +
 +              if (step->exec_type == EXEC_ON_DATANODES)
 +              {
 +                      connections = pgxc_connections->datanode_handles;
 +                      total_conn_count = regular_conn_count = pgxc_connections->dn_conn_count;
 +              }
 +              else if (step->exec_type == EXEC_ON_COORDS)
 +              {
 +                      connections = pgxc_connections->coord_handles;
 +                      total_conn_count = regular_conn_count = pgxc_connections->co_conn_count;
 +              }
 +
 +              primaryconnection = pgxc_connections->primary_handle;
 +
 +              /*
 +               * Primary connection is counted separately but is included in total_conn_count if used.
 +               */
 +              if (primaryconnection)
 +                      regular_conn_count--;
 +
 +              /*
 +               * We save only regular connections, at the time we exit the function
 +               * we finish with the primary connection and deal only with regular
 +               * connections on subsequent invocations
 +               */
 +              combiner->node_count = regular_conn_count;
 +
 +              /*
 +               * Start transaction on data nodes if we are in explicit transaction
 +               * or going to use extended query protocol or write to multiple nodes
 +               */
 +              if (step->force_autocommit)
 +                      need_tran_block = false;
 +              else
 +                      need_tran_block = step->cursor ||
 +                                      (!step->read_only && total_conn_count > 1) ||
 +                                      (TransactionBlockStatusCode() == 'T');
 +
 +              stat_statement();
 +              stat_transaction(total_conn_count);
 +
 +              gxid = GetCurrentTransactionIdIfAny();
 +              /* See if we have a primary node, execute on it first before the others */
 +              if (primaryconnection)
 +              {
 +                      if (pgxc_node_begin(1, &primaryconnection, gxid, need_tran_block,
 +                                                              step->read_only, PGXC_NODE_DATANODE))
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Could not begin transaction on data node.")));
 +
 +                      /* If explicit transaction is needed gxid is already sent */
 +                      if (!pgxc_start_command_on_connection(primaryconnection, node, snapshot))
 +                      {
 +                              pgxc_node_remote_abort();
 +                              pfree_pgxc_all_handles(pgxc_connections);
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to send command to data nodes")));
 +                      }
 +                      Assert(combiner->combine_type == COMBINE_TYPE_SAME);
 +
 +                      pgxc_node_receive(1, &primaryconnection, NULL);
 +                      /* Make sure the command is completed on the primary node */
 +                      while (true)
 +                      {
 +                              int res = handle_response(primaryconnection, combiner);
 +                              if (res == RESPONSE_READY)
 +                                      break;
 +                              else if (res == RESPONSE_EOF)
 +                                      pgxc_node_receive(1, &primaryconnection, NULL);
 +                              else if (res == RESPONSE_COMPLETE || res == RESPONSE_ERROR)
 +                                  /* Get ReadyForQuery */
 +                                      continue;
 +                              else if (res == RESPONSE_ASSIGN_GXID)
 +                                      continue;
 +                              else
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Unexpected response from data node")));
 +                      }
 +                      if (combiner->errorMessage)
 +                              pgxc_node_report_error(combiner);
 +              }
 +
 +              /* Now start the command on each regular connection */
 +              for (i = 0; i < regular_conn_count; i++)
 +              {
 +                      if (pgxc_node_begin(1, &connections[i], gxid, need_tran_block,
 +                                                              step->read_only, PGXC_NODE_DATANODE))
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Could not begin transaction on data node.")));
 +
 +                      /* If explicit transaction is needed gxid is already sent */
 +                      if (!pgxc_start_command_on_connection(connections[i], node, snapshot))
 +                      {
 +                              pgxc_node_remote_abort();
 +                              pfree_pgxc_all_handles(pgxc_connections);
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to send command to data nodes")));
 +                      }
 +                      connections[i]->combiner = combiner;
 +              }
 +
 +              if (step->cursor)
 +              {
 +                      combiner->cursor = step->cursor;
 +                      combiner->cursor_count = regular_conn_count;
 +                      combiner->cursor_connections = (PGXCNodeHandle **) palloc(regular_conn_count * sizeof(PGXCNodeHandle *));
 +                      memcpy(combiner->cursor_connections, connections, regular_conn_count * sizeof(PGXCNodeHandle *));
 +              }
 +
 +              combiner->connections = connections;
 +              combiner->conn_count = regular_conn_count;
 +              combiner->current_conn = 0;
 +
 +              /* Resume from the saved cursor connection set, if one exists */
 +              if (combiner->cursor_count)
 +              {
 +                      combiner->conn_count = combiner->cursor_count;
 +                      memcpy(connections, combiner->cursor_connections,
 +                                 combiner->cursor_count * sizeof(PGXCNodeHandle *));
 +                      combiner->connections = connections;
 +              }
 +
 +              node->query_Done = true;
 +
 +              if (step->sort)
 +              {
 +                      SimpleSort *sort = step->sort;
 +
 +                      /*
 +                       * First message is already in the buffer
 +                       * Further fetch will be under tuplesort control
 +                       * If query does not produce rows tuplesort will not
 +                       * be initialized
 +                       */
 +                      combiner->tuplesortstate = tuplesort_begin_merge(
 +                                                                 resultslot->tts_tupleDescriptor,
 +                                                                 sort->numCols,
 +                                                                 sort->sortColIdx,
 +                                                                 sort->sortOperators,
 +                                                                 sort->sortCollations,
 +                                                                 sort->nullsFirst,
 +                                                                 combiner,
 +                                                                 work_mem);
 +              }
 +      }
 +
 +      /* Emit at most one tuple: from the merge sort if active, else directly */
 +      if (combiner->tuplesortstate)
 +      {
 +              if (tuplesort_gettupleslot((Tuplesortstate *) combiner->tuplesortstate,
 -       combiner->ss.ps.qual = NIL;
++                                                                        true, true, resultslot, NULL))
 +                      return resultslot;
 +              else
 +                      ExecClearTuple(resultslot);
 +      }
 +      else
 +      {
 +              TupleTableSlot *slot = FetchTuple(combiner);
 +              if (!TupIsNull(slot))
 +                      return slot;
 +      }
 +
 +      if (combiner->errorMessage)
 +              pgxc_node_report_error(combiner);
 +
 +      return NULL;
 +}
 +
 +
 +/*
 + * Clean up and discard any data on the data node connections that might not
 + * have been handled yet, including data still pending on the remote connection.
 + */
 +static void
 +pgxc_connections_cleanup(ResponseCombiner *combiner)
 +{
 +      /* clean up the buffer */
 +      list_free_deep(combiner->rowBuffer);
 +      combiner->rowBuffer = NIL;
 +
 +      /*
 +       * Read in and discard remaining data from the connections, if any
 +       */
 +      combiner->current_conn = 0;
 +      while (combiner->conn_count > 0)
 +      {
 +              int res;
 +              PGXCNodeHandle *conn = combiner->connections[combiner->current_conn];
 +
 +              /*
 +               * A NULL entry is possible if we are doing merge sort.
 +               * We can follow the usual procedure and move connections around,
 +               * since we are cleaning up and do not care which connection is at
 +               * which position
 +               */
 +              if (conn == NULL)
 +              {
 +                      REMOVE_CURR_CONN(combiner);
 +                      continue;
 +              }
 +
 +              /* throw away current message that may be in the buffer */
 +              if (combiner->currentRow)
 +              {
 +                      pfree(combiner->currentRow);
 +                      combiner->currentRow = NULL;
 +              }
 +
 +              /* no data is expected */
 +              if (conn->state == DN_CONNECTION_STATE_IDLE ||
 +                              conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
 +              {
 +                      REMOVE_CURR_CONN(combiner);
 +                      continue;
 +              }
 +
 +              /*
 +               * The connection is owned by a different combiner, so none of our
 +               * data is pending on it; nothing to read in.
 +               */
 +              if (conn->combiner && conn->combiner != combiner)
 +              {
 +                      REMOVE_CURR_CONN(combiner);
 +                      continue;
 +              }
 +
 +              res = handle_response(conn, combiner);
 +              if (res == RESPONSE_EOF)
 +              {
 +                      /* Wait a bounded time for more data rather than blocking forever */
 +                      struct timeval timeout;
 +                      timeout.tv_sec = END_QUERY_TIMEOUT / 1000;
 +                      timeout.tv_usec = (END_QUERY_TIMEOUT % 1000) * 1000;
 +
 +                      if (pgxc_node_receive(1, &conn, &timeout))
 +                              elog(LOG, "Failed to read response from data nodes when ending query");
 +              }
 +      }
 +
 +      /*
 +       * Release tuplesort resources
 +       */
 +      if (combiner->tuplesortstate)
 +      {
 +              /*
 +               * Free these before tuplesort_end, because these arrays may appear
 +               * in the tuplesort's memory context, tuplesort_end deletes this
 +               * context and may invalidate the memory.
 +               * We still want to free them here, because these may be in different
 +               * context.
 +               */
 +              if (combiner->tapenodes)
 +              {
 +                      pfree(combiner->tapenodes);
 +                      combiner->tapenodes = NULL;
 +              }
 +              if (combiner->tapemarks)
 +              {
 +                      pfree(combiner->tapemarks);
 +                      combiner->tapemarks = NULL;
 +              }
 +              /*
 +               * tuplesort_end invalidates a minimal tuple left in the slot because
 +               * it deletes the TupleSort memory context, causing a seg fault later
 +               * when releasing the tuple table
 +               */
 +              ExecClearTuple(combiner->ss.ps.ps_ResultTupleSlot);
 +              tuplesort_end((Tuplesortstate *) combiner->tuplesortstate);
 +              combiner->tuplesortstate = NULL;
 +      }
 +}
 +
 +
 +/*
 + * ExecEndRemoteQuery
 + *              End the remote query: drain and discard any pending data on the
 + *              remote connections, free the external-parameter buffer (the plan
 + *              may be reused), and release the combiner and node state.
 + */
 +void
 +ExecEndRemoteQuery(RemoteQueryState *node)
 +{
 +      ResponseCombiner *combiner = (ResponseCombiner *) node;
 +
 +      /*
 +       * Clean up remote connections
 +       */
 +      pgxc_connections_cleanup(combiner);
 +
 +      /*
 +       * Clean up parameters if they were set, since plan may be reused
 +       */
 +      if (node->paramval_data)
 +      {
 +              pfree(node->paramval_data);
 +              node->paramval_data = NULL;
 +              node->paramval_len = 0;
 +      }
 +
 +      CloseCombiner(combiner);
 +      pfree(node);
 +}
 +
 +
 +/**********************************************
 + *
 + * Routines to support RemoteSubplan plan node
 + *
 + **********************************************/
 +
 +
 +/*
 + * The routine walks recursively over the plan tree and changes cursor names of
 + * RemoteSubplan nodes to make them different from those launched from other
 + * datanodes. The routine changes the tree in place, so the caller should
 + * take a writable copy of the plan tree.
 + *
 + * NOTE(review): what this function visibly does is store 'unique' into each
 + * RemoteSubplan node; presumably the unique id is folded into the shared
 + * queue / cursor name elsewhere — confirm against the consumers of 'unique'.
 + */
 +void
 +RemoteSubplanMakeUnique(Node *plan, int unique)
 +{
 +      if (plan == NULL)
 +              return;
 +
 +      /* A List is not a Plan: recurse into each member and stop */
 +      if (IsA(plan, List))
 +      {
 +              ListCell *lc;
 +              foreach(lc, (List *) plan)
 +              {
 +                      RemoteSubplanMakeUnique(lfirst(lc), unique);
 +              }
 +              return;
 +      }
 +
 +      /*
 +       * Transform SharedQueue name
 +       */
 +      if (IsA(plan, RemoteSubplan))
 +      {
 +              ((RemoteSubplan *)plan)->unique = unique;
 +      }
 +      /* Otherwise it is a Plan descendant */
 +      RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->lefttree, unique);
 +      RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->righttree, unique);
 +      /* Transform special cases: node types with extra child-plan fields */
 +      switch (nodeTag(plan))
 +      {
 +              case T_Append:
 +                      RemoteSubplanMakeUnique((Node *) ((Append *) plan)->appendplans,
 +                                                                      unique);
 +                      break;
 +              case T_MergeAppend:
 +                      RemoteSubplanMakeUnique((Node *) ((MergeAppend *) plan)->mergeplans,
 +                                                                      unique);
 +                      break;
 +              case T_BitmapAnd:
 +                      RemoteSubplanMakeUnique((Node *) ((BitmapAnd *) plan)->bitmapplans,
 +                                                                      unique);
 +                      break;
 +              case T_BitmapOr:
 +                      RemoteSubplanMakeUnique((Node *) ((BitmapOr *) plan)->bitmapplans,
 +                                                                      unique);
 +                      break;
 +              case T_SubqueryScan:
 +                      RemoteSubplanMakeUnique((Node *) ((SubqueryScan *) plan)->subplan,
 +                                                                      unique);
 +                      break;
 +              default:
 +                      break;
 +      }
 +}
 +
 +/*
 + * Working state shared by determine_param_types() and its walker: tracks
 + * which PARAM_EXEC parameters still need a data type resolved.
 + */
 +struct find_params_context
 +{
 +      RemoteParam *rparams;           /* remote parameter descriptors to fill in */
 +      Bitmapset *defineParams;        /* ids of params whose type is still unknown */
 +};
 +
 +/*
 + * determine_param_types_walker
 + *              Expression tree walker for determine_param_types(). When it finds
 + *              a PARAM_EXEC Param whose id is still in context->defineParams, it
 + *              copies the Param's type into the matching RemoteParam entry and
 + *              removes the id from the set. Returns true (aborting the walk)
 + *              once all requested param types have been resolved.
 + */
 +static bool
 +determine_param_types_walker(Node *node, struct find_params_context *context)
 +{
 +      if (node == NULL)
 +              return false;
 +
 +      if (IsA(node, Param))
 +      {
 +              Param *param = (Param *) node;
 +              int paramno = param->paramid;
 +
 +              if (param->paramkind == PARAM_EXEC &&
 +                              bms_is_member(paramno, context->defineParams))
 +              {
 +                      /*
 +                       * Linear scan for the matching descriptor. NOTE(review): this
 +                       * assumes rparams always contains an entry for paramno;
 +                       * otherwise the scan runs past the array — confirm callers
 +                       * build defineParams strictly from rparams.
 +                       */
 +                      RemoteParam *cur = context->rparams;
 +                      while (cur->paramkind != PARAM_EXEC || cur->paramid != paramno)
 +                              cur++;
 +                      cur->paramtype = param->paramtype;
 +                      context->defineParams = bms_del_member(context->defineParams,
 +                                                                                                 paramno);
 +                      return bms_is_empty(context->defineParams);
 +              }
 +      }
 +      return expression_tree_walker(node, determine_param_types_walker,
 +                                                                (void *) context);
 +
 +}
 +
 +/*
 + * Scan expressions in the plan tree to find Param nodes and get data types
 + * from them
 + */
 +static bool
 +determine_param_types(Plan *plan,  struct find_params_context *context)
 +{
 +      Bitmapset *intersect;
 +
 +      if (plan == NULL)
 +              return false;
 +
 +      intersect = bms_intersect(plan->allParam, context->defineParams);
 +      if (bms_is_empty(intersect))
 +      {
 +              /* the subplan does not depend on params we are interested in */
 +              bms_free(intersect);
 +              return false;
 +      }
 +      bms_free(intersect);
 +
 +      /* scan target list */
 +      if (expression_tree_walker((Node *) plan->targetlist,
 +                                                         determine_param_types_walker,
 +                                                         (void *) context))
 +              return true;
 +      /* scan qual */
 +      if (expression_tree_walker((Node *) plan->qual,
 +                                                         determine_param_types_walker,
 +                                                         (void *) context))
 +              return true;
 +
 +      /* Check additional node-type-specific fields */
 +      switch (nodeTag(plan))
 +      {
 +              case T_Result:
 +                      if (expression_tree_walker((Node *) ((Result *) plan)->resconstantqual,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      break;
 +
 +              case T_SeqScan:
 +              case T_SampleScan:
 +              case T_CteScan:
 +                      break;
 +
 +              case T_IndexScan:
 +                      if (expression_tree_walker((Node *) ((IndexScan *) plan)->indexqual,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      break;
 +
 +              case T_IndexOnlyScan:
 +                      if (expression_tree_walker((Node *) ((IndexOnlyScan *) plan)->indexqual,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      break;
 +
 +              case T_BitmapIndexScan:
 +                      if (expression_tree_walker((Node *) ((BitmapIndexScan *) plan)->indexqual,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      break;
 +
 +              case T_BitmapHeapScan:
 +                      if (expression_tree_walker((Node *) ((BitmapHeapScan *) plan)->bitmapqualorig,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      break;
 +
 +              case T_TidScan:
 +                      if (expression_tree_walker((Node *) ((TidScan *) plan)->tidquals,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      break;
 +
 +              case T_SubqueryScan:
 +                      if (determine_param_types(((SubqueryScan *) plan)->subplan, context))
 +                              return true;
 +                      break;
 +
 +              case T_FunctionScan:
 +                      if (expression_tree_walker((Node *) ((FunctionScan *) plan)->functions,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      break;
 +
 +              case T_ValuesScan:
 +                      if (expression_tree_walker((Node *) ((ValuesScan *) plan)->values_lists,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      break;
 +
 +              case T_ModifyTable:
 +                      {
 +                              ListCell   *l;
 +
 +                              foreach(l, ((ModifyTable *) plan)->plans)
 +                              {
 +                                      if (determine_param_types((Plan *) lfirst(l), context))
 +                                              return true;
 +                              }
 +                      }
 +                      break;
 +
 +              case T_RemoteSubplan:
 +                      break;
 +
 +              case T_Append:
 +                      {
 +                              ListCell   *l;
 +
 +                              foreach(l, ((Append *) plan)->appendplans)
 +                              {
 +                                      if (determine_param_types((Plan *) lfirst(l), context))
 +                                              return true;
 +                              }
 +                      }
 +                      break;
 +
 +              case T_MergeAppend:
 +                      {
 +                              ListCell   *l;
 +
 +                              foreach(l, ((MergeAppend *) plan)->mergeplans)
 +                              {
 +                                      if (determine_param_types((Plan *) lfirst(l), context))
 +                                              return true;
 +                              }
 +                      }
 +                      break;
 +
 +              case T_BitmapAnd:
 +                      {
 +                              ListCell   *l;
 +
 +                              foreach(l, ((BitmapAnd *) plan)->bitmapplans)
 +                              {
 +                                      if (determine_param_types((Plan *) lfirst(l), context))
 +                                              return true;
 +                              }
 +                      }
 +                      break;
 +
 +              case T_BitmapOr:
 +                      {
 +                              ListCell   *l;
 +
 +                              foreach(l, ((BitmapOr *) plan)->bitmapplans)
 +                              {
 +                                      if (determine_param_types((Plan *) lfirst(l), context))
 +                                              return true;
 +                              }
 +                      }
 +                      break;
 +
 +              case T_NestLoop:
 +                      if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      break;
 +
 +              case T_MergeJoin:
 +                      if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      if (expression_tree_walker((Node *) ((MergeJoin *) plan)->mergeclauses,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      break;
 +
 +              case T_HashJoin:
 +                      if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      if (expression_tree_walker((Node *) ((HashJoin *) plan)->hashclauses,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      break;
 +
 +              case T_Limit:
 +                      if (expression_tree_walker((Node *) ((Limit *) plan)->limitOffset,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      if (expression_tree_walker((Node *) ((Limit *) plan)->limitCount,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                              return true;
 +                      break;
 +
 +              case T_RecursiveUnion:
 +                      break;
 +
 +              case T_LockRows:
 +                      break;
 +
 +              case T_WindowAgg:
 +                      if (expression_tree_walker((Node *) ((WindowAgg *) plan)->startOffset,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                      if (expression_tree_walker((Node *) ((WindowAgg *) plan)->endOffset,
 +                                                                         determine_param_types_walker,
 +                                                                         (void *) context))
 +                      break;
 +
 +              case T_Hash:
 +              case T_Agg:
 +              case T_Material:
 +              case T_Sort:
 +              case T_Unique:
 +              case T_SetOp:
 +              case T_Group:
 +                      break;
 +
 +              default:
 +                      elog(ERROR, "unrecognized node type: %d",
 +                               (int) nodeTag(plan));
 +      }
 +
 +
 +      /* recurse into subplans */
 +      return determine_param_types(plan->lefttree, context) ||
 +                      determine_param_types(plan->righttree, context);
 +}
 +
 +
 +/*
 + * ExecInitRemoteSubplan
 + *             Build and initialize the run-time state (RemoteSubplanState) for a
 + *             RemoteSubplan plan node.  Chooses the combine type for row counts,
 + *             decides whether the subplan executes locally or on remote data
 + *             nodes, and, when it is to be shipped, serializes the plan tree and
 + *             its parameter metadata into remotestate->subplanstr.
 + */
 +RemoteSubplanState *
 +ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags)
 +{
 +      RemoteStmt                      rstmt;
 +      RemoteSubplanState *remotestate;
 +      ResponseCombiner   *combiner;
 +      CombineType                     combineType;
 +      struct rusage           start_r;
 +      struct timeval          start_t;
 +
 +      if (log_remotesubplan_stats)
 +              ResetUsageCommon(&start_r, &start_t);
 +
 +      remotestate = makeNode(RemoteSubplanState);
 +      combiner = (ResponseCombiner *) remotestate;
 +      /*
 +       * We do not need to combine row counts if we will receive intermediate
 +       * results or if we won't return row count.
 +       */
 +      if (IS_PGXC_DATANODE || estate->es_plannedstmt->commandType == CMD_SELECT)
 +      {
 +              combineType = COMBINE_TYPE_NONE;
 +              remotestate->execOnAll = node->execOnAll;
 +      }
 +      else
 +      {
 +              if (node->execOnAll)
 +                      combineType = COMBINE_TYPE_SUM;
 +              else
 +                      combineType = COMBINE_TYPE_SAME;
 +              /*
 +               * If we are updating a replicated table we should run the plan on
 +               * all nodes.  A single node is chosen only for reading.
 +               */
 +              remotestate->execOnAll = true;
 +      }
 +      remotestate->execNodes = list_copy(node->nodeList);
 +      InitResponseCombiner(combiner, 0, combineType);
 +      combiner->ss.ps.plan = (Plan *) node;
 +      combiner->ss.ps.state = estate;
 +
-                                                                  true, resultslot, NULL))
++      combiner->ss.ps.qual = NULL;
 +
 +      combiner->request_type = REQUEST_TYPE_QUERY;
 +
 +      ExecInitResultTupleSlot(estate, &combiner->ss.ps);
 +      ExecAssignResultTypeFromTL((PlanState *) remotestate);
 +
 +      /*
 +       * We optimize execution if we are going to send down the query to the
 +       * next level.
 +       */
 +      remotestate->local_exec = false;
 +      if (IS_PGXC_DATANODE)
 +      {
 +              if (remotestate->execNodes == NIL)
 +              {
 +                      /*
 +                       * Special case, if subplan is not distributed, like Result, or
 +                       * query against catalog tables only.
 +                       * We are only interested in filtering out the subplan results and
 +                       * get only those we are interested in.
 +                       * XXX we may want to prevent multiple executions in this case
 +                       * either, to achieve this we will set single execNode on planning
 +                       * time and this case would never happen, this code branch could
 +                       * be removed.
 +                       */
 +                      remotestate->local_exec = true;
 +              }
 +              else if (!remotestate->execOnAll)
 +              {
 +                      /*
 +                       * XXX We should change planner and remove this flag.
 +                       * We want only one node is producing the replicated result set,
 +                       * and planner should choose that node - it is too hard to determine
 +                       * right node at execution time, because it should be guaranteed
 +                       * that all consumers make the same decision.
 +                       * For now always execute replicated plan on local node to save
 +                       * resources.
 +                       */
 +
 +                      /*
 +                       * Make sure local node is in execution list
 +                       */
 +                      if (list_member_int(remotestate->execNodes, PGXCNodeId-1))
 +                      {
 +                              list_free(remotestate->execNodes);
 +                              remotestate->execNodes = NIL;
 +                              remotestate->local_exec = true;
 +                      }
 +                      else
 +                      {
 +                              /*
 +                               * To support, we need to connect to some producer, so
 +                               * each producer should be prepared to serve rows for random
 +                               * number of consumers. It is hard, because new consumer may
 +                               * connect after producing is started, on the other hand,
 +                               * absence of expected consumer is a problem too.
 +                               */
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 +                                               errmsg("Getting replicated results from remote node is not supported")));
 +                      }
 +              }
 +      }
 +
 +      /*
 +       * If we are going to execute subplan locally or doing explain initialize
 +       * the subplan. Otherwise have remote node doing that.
 +       */
 +      if (remotestate->local_exec || (eflags & EXEC_FLAG_EXPLAIN_ONLY))
 +      {
 +              outerPlanState(remotestate) = ExecInitNode(outerPlan(node), estate,
 +                                                                                                 eflags);
 +              if (node->distributionNodes)
 +              {
 +                      Oid             distributionType = InvalidOid;
 +                      TupleDesc       typeInfo;
 +
 +                      /* Derive the distribution key's type from the result row type */
 +                      typeInfo = combiner->ss.ps.ps_ResultTupleSlot->tts_tupleDescriptor;
 +                      if (node->distributionKey != InvalidAttrNumber)
 +                      {
 +                              Form_pg_attribute attr;
 +                              attr = typeInfo->attrs[node->distributionKey - 1];
 +                              distributionType = attr->atttypid;
 +                      }
 +                      /* Set up locator */
 +                      remotestate->locator = createLocator(node->distributionType,
 +                                                                                               RELATION_ACCESS_INSERT,
 +                                                                                               distributionType,
 +                                                                                               LOCATOR_LIST_LIST,
 +                                                                                               0,
 +                                                                                               (void *) node->distributionNodes,
 +                                                                                               (void **) &remotestate->dest_nodes,
 +                                                                                               false);
 +              }
 +              else
 +                      remotestate->locator = NULL;
 +      }
 +
 +      /*
 +       * Encode subplan if it will be sent to remote nodes
 +       */
 +      if (remotestate->execNodes && !(eflags & EXEC_FLAG_EXPLAIN_ONLY))
 +      {
 +              ParamListInfo ext_params;
 +              /* Encode plan if we are going to execute it on other nodes */
 +              rstmt.type = T_RemoteStmt;
 +              if (node->distributionType == LOCATOR_TYPE_NONE && IS_PGXC_DATANODE)
 +              {
 +                      /*
 +                       * There are cases when planner can not determine distribution of a
 +                       * subplan, in particular it does not determine distribution of
 +                       * subquery nodes. Such subplans executed from current location
 +                       * (node) and combine all results, like from coordinator nodes.
 +                       * However, if there are multiple locations where distributed
 +                       * executor is running this node, and there are more of
 +                       * RemoteSubplan plan nodes in the subtree there will be a problem -
 +                       * Instances of the inner RemoteSubplan nodes will be using the same
 +                       * SharedQueue, causing error. To avoid this problem we should
 +                       * traverse the subtree and change SharedQueue name to make it
 +                       * unique.
 +                       */
 +                      RemoteSubplanMakeUnique((Node *) outerPlan(node), PGXCNodeId);
 +              }
 +              rstmt.planTree = outerPlan(node);
 +              /*
 +               * If a datanode launches further execution of a command it should
 +               * tell it is a SELECT, otherwise secondary data nodes won't return
 +               * tuples expecting there will be nothing to return.
 +               */
 +              if (IsA(outerPlan(node), ModifyTable))
 +              {
 +                      rstmt.commandType = estate->es_plannedstmt->commandType;
 +                      rstmt.hasReturning = estate->es_plannedstmt->hasReturning;
 +                      rstmt.resultRelations = estate->es_plannedstmt->resultRelations;
 +              }
 +              else
 +              {
 +                      rstmt.commandType = CMD_SELECT;
 +                      rstmt.hasReturning = false;
 +                      rstmt.resultRelations = NIL;
 +              }
 +              rstmt.rtable = estate->es_range_table;
 +              rstmt.subplans = estate->es_plannedstmt->subplans;
 +              rstmt.nParamExec = estate->es_plannedstmt->nParamExec;
 +              ext_params = estate->es_param_list_info;
 +              /* Upper bound: external params plus all params referenced by the plan */
 +              rstmt.nParamRemote = (ext_params ? ext_params->numParams : 0) +
 +                              bms_num_members(node->scan.plan.allParam);
 +              if (rstmt.nParamRemote > 0)
 +              {
 +                      Bitmapset *tmpset;
 +                      int i;
 +                      int paramno;
 +
 +                      /* Allocate enough space */
 +                      rstmt.remoteparams = (RemoteParam *) palloc(rstmt.nParamRemote *
 +                                                                                                              sizeof(RemoteParam));
 +                      paramno = 0;
 +                      if (ext_params)
 +                      {
 +                              for (i = 0; i < ext_params->numParams; i++)
 +                              {
 +                                      ParamExternData *param = &ext_params->params[i];
 +                                      /*
 +                                       * If parameter type is not yet defined but can be defined
 +                                       * do that
 +                                       */
 +                                      if (!OidIsValid(param->ptype) && ext_params->paramFetch)
 +                                              (*ext_params->paramFetch) (ext_params, i + 1);
 +
 +                                      /*
 +                                       * If the parameter type is still not defined, assume that
 +                                       * it is unused. But we put a default INT4OID type for such
 +                                       * unused parameters to keep the parameter pushdown code
 +                                       * happy.
 +                                       *
 +                                       * These unused parameters are never accessed during
 +                                       * execution and we will just send a null value for these
 +                                       * "dummy" parameters. But including them here ensures that
 +                                       * we send down the parameters in the correct order and at
 +                                       * the position that the datanode needs
 +                                       */
 +                                      if (OidIsValid(param->ptype))
 +                                      {
 +                                              rstmt.remoteparams[paramno].paramused = 1;
 +                                              rstmt.remoteparams[paramno].paramtype = param->ptype;
 +                                      }
 +                                      else
 +                                      {
 +                                              rstmt.remoteparams[paramno].paramused = 0;
 +                                              rstmt.remoteparams[paramno].paramtype = INT4OID;
 +                                      }
 +
 +                                      rstmt.remoteparams[paramno].paramkind = PARAM_EXTERN;
 +                                      rstmt.remoteparams[paramno].paramid = i + 1;
 +                                      paramno++;
 +                              }
 +                              /* store actual number of parameters */
 +                              rstmt.nParamRemote = paramno;
 +                      }
 +
 +                      if (!bms_is_empty(node->scan.plan.allParam))
 +                      {
 +                              Bitmapset *defineParams = NULL;
 +                              tmpset = bms_copy(node->scan.plan.allParam);
 +                              while ((i = bms_first_member(tmpset)) >= 0)
 +                              {
 +                                      ParamExecData *prmdata;
 +
 +                                      prmdata = &(estate->es_param_exec_vals[i]);
 +                                      rstmt.remoteparams[paramno].paramkind = PARAM_EXEC;
 +                                      rstmt.remoteparams[paramno].paramid = i;
 +                                      rstmt.remoteparams[paramno].paramtype = prmdata->ptype;
 +                                      rstmt.remoteparams[paramno].paramused = 1;
 +                                      /* Will scan plan tree to find out data type of the param */
 +                                      if (prmdata->ptype == InvalidOid)
 +                                              defineParams = bms_add_member(defineParams, i);
 +                                      paramno++;
 +                              }
 +                              /* store actual number of parameters */
 +                              rstmt.nParamRemote = paramno;
 +                              bms_free(tmpset);
 +                              if (!bms_is_empty(defineParams))
 +                              {
 +                                      struct find_params_context context;
 +                                      bool all_found;
 +
 +                                      context.rparams = rstmt.remoteparams;
 +                                      context.defineParams = defineParams;
 +
 +                                      all_found = determine_param_types(node->scan.plan.lefttree,
 +                                                                                                        &context);
 +                                      /*
 +                                       * Remove not defined params from the list of remote params.
 +                                       * If they are not referenced no need to send them down
 +                                       */
 +                                      if (!all_found)
 +                                      {
 +                                              for (i = 0; i < rstmt.nParamRemote; i++)
 +                                              {
 +                                                      if (rstmt.remoteparams[i].paramkind == PARAM_EXEC &&
 +                                                                      bms_is_member(rstmt.remoteparams[i].paramid,
 +                                                                                                context.defineParams))
 +                                                      {
 +                                                              /* Copy last parameter inplace */
 +                                                              rstmt.nParamRemote--;
 +                                                              if (i < rstmt.nParamRemote)
 +                                                                      rstmt.remoteparams[i] =
 +                                                                              rstmt.remoteparams[rstmt.nParamRemote];
 +                                                              /* keep current in the same position */
 +                                                              i--;
 +                                                      }
 +                                              }
 +                                      }
 +                                      bms_free(context.defineParams);
 +                              }
 +                      }
 +                      remotestate->nParamRemote = rstmt.nParamRemote;
 +                      remotestate->remoteparams = rstmt.remoteparams;
 +              }
 +              else
 +                      rstmt.remoteparams = NULL;
 +              rstmt.rowMarks = estate->es_plannedstmt->rowMarks;
 +              rstmt.distributionKey = node->distributionKey;
 +              rstmt.distributionType = node->distributionType;
 +              rstmt.distributionNodes = node->distributionNodes;
 +              rstmt.distributionRestrict = node->distributionRestrict;
 +
 +              /* Serialize the statement in portable (node-independent) format */
 +              set_portable_output(true);
 +              remotestate->subplanstr = nodeToString(&rstmt);
 +              set_portable_output(false);
 +
 +              /*
 +               * Connect to remote nodes and send down subplan
 +               */
 +              if (!(eflags & EXEC_FLAG_SUBPLAN))
 +                      ExecFinishInitRemoteSubplan(remotestate);
 +      }
 +      remotestate->bound = false;
 +      /*
 +       * It does not make sense to merge sort if there is only one tuple source.
 +       * By the contract it is already sorted
 +       */
 +      if (node->sort && remotestate->execOnAll &&
 +                      list_length(remotestate->execNodes) > 1)
 +              combiner->merge_sort = true;
 +
 +      if (log_remotesubplan_stats)
 +              ShowUsageCommon("ExecInitRemoteSubplan", &start_r, &start_t);
 +
 +      return remotestate;
 +}
 +
 +
 +/*
 + * ExecFinishInitRemoteSubplan
 + *             Second phase of RemoteSubplan initialization: acquire connections to
 + *             the involved data nodes and ship the serialized subplan so it is
 + *             stored on the remote side as a named plan.  Returns without doing
 + *             anything if the node is already connected, executes locally (or
 + *             explain-only), or no results are expected at this node.
 + */
 +void
 +ExecFinishInitRemoteSubplan(RemoteSubplanState *node)
 +{
 +      ResponseCombiner   *combiner = (ResponseCombiner *) node;
 +      RemoteSubplan      *plan = (RemoteSubplan *) combiner->ss.ps.plan;
 +      EState                     *estate = combiner->ss.ps.state;
 +      Oid                        *paramtypes = NULL;
 +      GlobalTransactionId gxid = InvalidGlobalTransactionId;
 +      Snapshot                        snapshot;
 +      TimestampTz                     timestamp;
 +      int                             i;
 +      bool                            is_read_only;
 +      char                            cursor[NAMEDATALEN];
 +
 +      /*
 +       * Name is required to store plan as a statement
 +       */
 +      Assert(plan->cursor);
 +
 +      /* Disambiguate the cursor name when the plan was made unique */
 +      if (plan->unique)
 +              snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique);
 +      else
 +              strncpy(cursor, plan->cursor, NAMEDATALEN);
 +
 +      /* If it is already fully initialized nothing to do */
 +      if (combiner->connections)
 +              return;
 +
 +      /* local only or explain only execution */
 +      if (node->subplanstr == NULL)
 +              return;
 +
 +      /*
 +       * Check if any results are planned to be received here.
 +       * Otherwise it does not make sense to send out the subplan.
 +       */
 +      if (IS_PGXC_DATANODE && plan->distributionRestrict &&
 +                      !list_member_int(plan->distributionRestrict, PGXCNodeId - 1))
 +              return;
 +
 +      /*
 +       * Acquire connections and send down subplan where it will be stored
 +       * as a prepared statement.
 +       * That does not require transaction id or snapshot, so does not send them
 +       * here, postpone till bind.
 +       */
 +      if (node->execOnAll)
 +      {
 +              PGXCNodeAllHandles *pgxc_connections;
 +              pgxc_connections = get_handles(node->execNodes, NIL, false, true);
 +              combiner->conn_count = pgxc_connections->dn_conn_count;
 +              combiner->connections = pgxc_connections->datanode_handles;
 +              combiner->current_conn = 0;
 +              pfree(pgxc_connections);
 +      }
 +      else
 +      {
 +              /* Replicated result: a single connection to any suitable node */
 +              combiner->connections = (PGXCNodeHandle **) palloc(sizeof(PGXCNodeHandle *));
 +              combiner->connections[0] = get_any_handle(node->execNodes);
 +              combiner->conn_count = 1;
 +              combiner->current_conn = 0;
 +      }
 +
 +      gxid = GetCurrentTransactionIdIfAny();
 +
 +      /* extract parameter data types */
 +      if (node->nParamRemote > 0)
 +      {
 +              paramtypes = (Oid *) palloc(node->nParamRemote * sizeof(Oid));
 +              for (i = 0; i < node->nParamRemote; i++)
 +                      paramtypes[i] = node->remoteparams[i].paramtype;
 +      }
 +      /* send down subplan */
 +      snapshot = GetActiveSnapshot();
 +      timestamp = GetCurrentGTMStartTimestamp();
 +      /*
 +       * Datanode should not send down statements that may modify
 +       * the database. Postgres assumes that all sessions under the same
 +       * postmaster have different xids. That may cause a locking problem.
 +       * Shared locks acquired for reading still work fine.
 +       */
 +      is_read_only = IS_PGXC_DATANODE ||
 +                      !IsA(outerPlan(plan), ModifyTable);
 +
 +      for (i = 0; i < combiner->conn_count; i++)
 +      {
 +              PGXCNodeHandle *connection = combiner->connections[i];
 +
 +              if (pgxc_node_begin(1, &connection, gxid, true,
 +                                                      is_read_only, PGXC_NODE_DATANODE))
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Could not begin transaction on data node.")));
 +
 +              if (pgxc_node_send_timestamp(connection, timestamp))
 +              {
 +                      combiner->conn_count = 0;
 +                      pfree(combiner->connections);
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to send command to data nodes")));
 +              }
 +              if (snapshot && pgxc_node_send_snapshot(connection, snapshot))
 +              {
 +                      combiner->conn_count = 0;
 +                      pfree(combiner->connections);
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to send snapshot to data nodes")));
 +              }
 +              if (pgxc_node_send_cmd_id(connection, estate->es_snapshot->curcid) < 0 )
 +              {
 +                      combiner->conn_count = 0;
 +                      pfree(combiner->connections);
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to send command ID to data nodes")));
 +              }
 +              pgxc_node_send_plan(connection, cursor, "Remote Subplan",
 +                                                      node->subplanstr, node->nParamRemote, paramtypes);
 +              if (pgxc_node_flush(connection))
 +              {
 +                      combiner->conn_count = 0;
 +                      pfree(combiner->connections);
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to send subplan to data nodes")));
 +              }
 +      }
 +}
 +
 +
 +/*
 + * append_param_data
 + *             Append one parameter value to buf in text format: a 4-byte
 + *             network-order length followed by the value's textual output, or a
 + *             length of -1 with no payload for a NULL value.
 + *
 + *             pused == 0 marks a parameter that was never resolved at plan time;
 + *             its type is forced to INT4OID.  NOTE(review): callers appear to be
 + *             expected to pass isnull = true for such parameters -- this function
 + *             does not enforce it; confirm against callers.
 + */
 +static void
 +append_param_data(StringInfo buf, Oid ptype, int pused, Datum value, bool isnull)
 +{
 +      uint32 n32;
 +
 +      /* Assume unused parameters to have null values */
 +      if (!pused)
 +              ptype = INT4OID;
 +
 +      if (isnull)
 +      {
 +              /* -1 length signals NULL; no value bytes follow */
 +              n32 = htonl(-1);
 +              appendBinaryStringInfo(buf, (char *) &n32, 4);
 +      }
 +      else
 +      {
 +              Oid             typOutput;
 +              bool    typIsVarlena;
 +              Datum   pval;
 +              char   *pstring;
 +              int             len;
 +
 +              /* Get info needed to output the value */
 +              getTypeOutputInfo(ptype, &typOutput, &typIsVarlena);
 +
 +              /*
 +               * If we have a toasted datum, forcibly detoast it here to avoid
 +               * memory leakage inside the type's output routine.
 +               */
 +              if (typIsVarlena)
 +                      pval = PointerGetDatum(PG_DETOAST_DATUM(value));
 +              else
 +                      pval = value;
 +
 +              /* Convert Datum to string */
 +              pstring = OidOutputFunctionCall(typOutput, pval);
 +
 +              /* copy data to the buffer */
 +              len = strlen(pstring);
 +              n32 = htonl(len);
 +              appendBinaryStringInfo(buf, (char *) &n32, 4);
 +              appendBinaryStringInfo(buf, pstring, len);
 +      }
 +}
 +
 +
 +/*
 + * encode_parameters
 + *             Serialize nparams parameter values into a single buffer: a 2-byte
 + *             network-order parameter count followed by each value as written by
 + *             append_param_data().  Stores the buffer in *result and returns its
 + *             length in bytes.
 + *
 + *             PARAM_EXEC parameters that have a pending execPlan are evaluated
 + *             here via ExecSetParamPlan().
 + *
 + *             NOTE(review): *result is palloc'd while still in the node's
 + *             per-tuple memory context, which is reset at the start of the next
 + *             call -- confirm callers consume the buffer before then.
 + */
 +static int encode_parameters(int nparams, RemoteParam *remoteparams,
 +                                                       PlanState *planstate, char** result)
 +{
 +      EState             *estate = planstate->state;
 +      StringInfoData  buf;
 +      uint16                  n16;
 +      int                     i;
 +      ExprContext        *econtext;
 +      MemoryContext   oldcontext;
 +
 +      /* Make sure we have an expression context to evaluate params in */
 +      if (planstate->ps_ExprContext == NULL)
 +              ExecAssignExprContext(estate, planstate);
 +
 +      econtext = planstate->ps_ExprContext;
 +      oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
 +      MemoryContextReset(econtext->ecxt_per_tuple_memory);
 +
 +      initStringInfo(&buf);
 +
 +      /* Number of parameter values */
 +      n16 = htons(nparams);
 +      appendBinaryStringInfo(&buf, (char *) &n16, 2);
 +
 +      /* Parameter values */
 +      for (i = 0; i < nparams; i++)
 +      {
 +              RemoteParam *rparam = &remoteparams[i];
 +              int ptype = rparam->paramtype;
 +              int pused = rparam->paramused;
 +              if (rparam->paramkind == PARAM_EXTERN)
 +              {
 +                      ParamExternData *param;
 +                      param = &(estate->es_param_list_info->params[rparam->paramid - 1]);
 +                      append_param_data(&buf, ptype, pused, param->value, param->isnull);
 +              }
 +              else
 +              {
 +                      ParamExecData *param;
 +                      param = &(estate->es_param_exec_vals[rparam->paramid]);
 +                      if (param->execPlan)
 +                      {
 +                              /* Parameter not evaluated yet, so go do it */
 +                              ExecSetParamPlan((SubPlanState *) param->execPlan,
 +                                                               planstate->ps_ExprContext);
 +                              /* ExecSetParamPlan should have processed this param... */
 +                              Assert(param->execPlan == NULL);
 +                      }
 +                      /* a parameter the plan never produced is sent as NULL */
 +                      if (!param->done)
 +                              param->isnull = true;
 +                      append_param_data(&buf, ptype, pused, param->value, param->isnull);
 +
 +              }
 +      }
 +
 +      /* Take data from the buffer */
 +      *result = palloc(buf.len);
 +      memcpy(*result, buf.data, buf.len);
 +      MemoryContextSwitchTo(oldcontext);
 +      return buf.len;
 +}
 +
 +
 +/*
 + * ExecRemoteSubplan
 + *      Fetch the next tuple from a remote subplan.
 + *
 + * On the first call (node->bound == false) this binds and executes the
 + * prepared statement on the remote connections, sending down parameter
 + * values, command id and snapshot as needed.  In "primary mode" (replicated
 + * ModifyTable from a coordinator) execution happens in two phases: first on
 + * the primary data node alone, then — via the primary_mode_phase_two label —
 + * on the remaining nodes.  Subsequent calls just pull tuples, either through
 + * the merge-sorter or directly via FetchTuple().
 + *
 + * Returns the next tuple slot, or NULL when the result set is exhausted.
 + */
 +TupleTableSlot *
 +ExecRemoteSubplan(RemoteSubplanState *node)
 +{
 +      ResponseCombiner *combiner = (ResponseCombiner *) node;
 +      RemoteSubplan  *plan = (RemoteSubplan *) combiner->ss.ps.plan;
 +      EState             *estate = combiner->ss.ps.state;
 +      TupleTableSlot *resultslot = combiner->ss.ps.ps_ResultTupleSlot;
 +      struct rusage   start_r;
 +      struct timeval          start_t;
 +
 +      /* 
 +       * combiner->conn_count may legitimately be 0 after node initialization
 +       * if the planner's distributionRestrict determined that this node will
 +       * not receive any result.  We must distinguish that case from the other
 +       * ways conn_count can be 0: local execution was chosen, data are
 +       * buffered at the coordinator, or data are exhausted and the node was
 +       * reset.  In the last two cases connections are saved in
 +       * cursor_connections, so we can check their presence.
 +       */
 +      if (!node->local_exec && combiner->conn_count == 0 && 
 +                      combiner->cursor_count == 0)
 +              return NULL;
 +
 +      if (log_remotesubplan_stats)
 +              ResetUsageCommon(&start_r, &start_t);
 +
 +primary_mode_phase_two:
 +      if (!node->bound)
 +      {
 +              int fetch = 0;
 +              int paramlen = 0;
 +              char *paramdata = NULL;
 +              /*
 +               * Conditions when we want to execute query on the primary node first:
 +               * Coordinator running replicated ModifyTable on multiple nodes
 +               */
 +              bool primary_mode = combiner->probing_primary ||
 +                              (IS_PGXC_COORDINATOR &&
 +                               combiner->combine_type == COMBINE_TYPE_SAME &&
 +                               OidIsValid(primary_data_node) &&
 +                               combiner->conn_count > 1);
 +              char cursor[NAMEDATALEN];
 +
 +              if (plan->cursor)
 +              {
 +                      fetch = PGXLRemoteFetchSize;
 +                      if (plan->unique)
 +                              snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique);
 +                      else
 +                              /*
 +                               * NOTE(review): strncpy does not NUL-terminate if
 +                               * plan->cursor is NAMEDATALEN bytes or longer; confirm
 +                               * cursor names are always shorter, or use strlcpy.
 +                               */
 +                              strncpy(cursor, plan->cursor, NAMEDATALEN);
 +              }
 +              else
 +                      cursor[0] = '\0';
 +
 +              /*
 +               * Send down all available parameters, if any is used by the plan
 +               */
 +              if (estate->es_param_list_info ||
 +                              !bms_is_empty(plan->scan.plan.allParam))
 +                      paramlen = encode_parameters(node->nParamRemote,
 +                                                                               node->remoteparams,
 +                                                                               &combiner->ss.ps,
 +                                                                               &paramdata);
 +
 +              /*
 +               * The subplan being rescanned, need to restore connections and
 +               * re-bind the portal
 +               */
 +              if (combiner->cursor)
 +              {
 +                      int i;
 +
 +                      /*
 +                       * On second phase of primary mode connections are properly set,
 +                       * so do not copy.
 +                       */
 +                      if (!combiner->probing_primary)
 +                      {
 +                              combiner->conn_count = combiner->cursor_count;
 +                              memcpy(combiner->connections, combiner->cursor_connections,
 +                                                      combiner->cursor_count * sizeof(PGXCNodeHandle *));
 +                      }
 +
 +                      for (i = 0; i < combiner->conn_count; i++)
 +                      {
 +                              PGXCNodeHandle *conn = combiner->connections[i];
 +
 +                              CHECK_OWNERSHIP(conn, combiner);
 +
 +                              /* close previous cursor only on phase 1 */
 +                              if (!primary_mode || !combiner->probing_primary)
 +                                      pgxc_node_send_close(conn, false, combiner->cursor);
 +
 +                              /*
 +                               * If we now should probe primary, skip execution on non-primary
 +                               * nodes
 +                               */
 +                              if (primary_mode && !combiner->probing_primary &&
 +                                              conn->nodeoid != primary_data_node)
 +                                      continue;
 +
 +                              /* rebind */
 +                              pgxc_node_send_bind(conn, combiner->cursor, combiner->cursor,
 +                                                                      paramlen, paramdata);
 +                              /* execute */
 +                              pgxc_node_send_execute(conn, combiner->cursor, fetch);
 +                              /* submit */
 +                              if (pgxc_node_send_flush(conn))
 +                              {
 +                                      combiner->conn_count = 0;
 +                                      pfree(combiner->connections);
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Failed to send command to data nodes")));
 +                              }
 +
 +                              /*
 +                               * There could be only one primary node, but can not leave the
 +                               * loop now, because we need to close cursors.
 +                               */
 +                              if (primary_mode && !combiner->probing_primary)
 +                              {
 +                                      combiner->current_conn = i;
 +                              }
 +                      }
 +              }
 +              else if (node->execNodes)
 +              {
 +                      CommandId               cid;
 +                      int                     i;
 +
 +                      /*
 +                       * There are prepared statement, connections should be already here
 +                       */
 +                      Assert(combiner->conn_count > 0);
 +
 +                      combiner->extended_query = true;
 +                      cid = estate->es_snapshot->curcid;
 +
 +                      for (i = 0; i < combiner->conn_count; i++)
 +                      {
 +                              PGXCNodeHandle *conn = combiner->connections[i];
 +
 +                              CHECK_OWNERSHIP(conn, combiner);
 +
 +                              /*
 +                               * If we now should probe primary, skip execution on non-primary
 +                               * nodes
 +                               */
 +                              if (primary_mode && !combiner->probing_primary &&
 +                                              conn->nodeoid != primary_data_node)
 +                                      continue;
 +
 +                              /*
 +                               * Update Command Id. Other command may be executed after we
 +                               * prepare and advanced Command Id. We should use one that
 +                               * was active at the moment when command started.
 +                               */
 +                              if (pgxc_node_send_cmd_id(conn, cid))
 +                              {
 +                                      combiner->conn_count = 0;
 +                                      pfree(combiner->connections);
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Failed to send command ID to data nodes")));
 +                              }
 +
 +                              /*
 +                               * Resend the snapshot as well since the connection may have
 +                               * been buffered and use by other commands, with different
 +                               * snapshot. Set the snapshot back to what it was
 +                               */
 +                              if (pgxc_node_send_snapshot(conn, estate->es_snapshot))
 +                              {
 +                                      combiner->conn_count = 0;
 +                                      pfree(combiner->connections);
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Failed to send snapshot to data nodes")));
 +                              }
 +
 +                              /* bind */
 +                              pgxc_node_send_bind(conn, cursor, cursor, paramlen, paramdata);
 +                              /* execute */
 +                              pgxc_node_send_execute(conn, cursor, fetch);
 +                              /* submit */
 +                              if (pgxc_node_send_flush(conn))
 +                              {
 +                                      combiner->conn_count = 0;
 +                                      pfree(combiner->connections);
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                                       errmsg("Failed to send command to data nodes")));
 +                              }
 +
 +                              /*
 +                               * There could be only one primary node, so if we executed
 +                               * subquery on the phase one of primary mode we can leave the
 +                               * loop now.
 +                               */
 +                              if (primary_mode && !combiner->probing_primary)
 +                              {
 +                                      combiner->current_conn = i;
 +                                      break;
 +                              }
 +                      }
 +
 +                      /*
 +                       * On second phase of primary mode connections are backed up
 +                       * already, so do not copy.
 +                       */
 +                      if (primary_mode)
 +                      {
 +                              if (combiner->probing_primary)
 +                              {
 +                                      combiner->cursor = pstrdup(cursor);
 +                              }
 +                              else
 +                              {
 +                                      combiner->cursor = pstrdup(cursor);
 +                                      combiner->cursor_count = combiner->conn_count;
 +                                      combiner->cursor_connections = (PGXCNodeHandle **) palloc(
 +                                                              combiner->conn_count * sizeof(PGXCNodeHandle *));
 +                                      memcpy(combiner->cursor_connections, combiner->connections,
 +                                                              combiner->conn_count * sizeof(PGXCNodeHandle *));
 +                              }
 +                      }
 +                      else
 +                      {
 +                              combiner->cursor = pstrdup(cursor);
 +                              combiner->cursor_count = combiner->conn_count;
 +                              combiner->cursor_connections = (PGXCNodeHandle **) palloc(
 +                                                      combiner->conn_count * sizeof(PGXCNodeHandle *));
 +                              memcpy(combiner->cursor_connections, combiner->connections,
 +                                                      combiner->conn_count * sizeof(PGXCNodeHandle *));
 +                      }
 +              }
 +
 +              if (combiner->merge_sort)
 +              {
 +                      /*
 +                       * Requests are already made and sorter can fetch tuples to populate
 +                       * sort buffer.
 +                       */
 +                      combiner->tuplesortstate = tuplesort_begin_merge(
 +                                                                         resultslot->tts_tupleDescriptor,
 +                                                                         plan->sort->numCols,
 +                                                                         plan->sort->sortColIdx,
 +                                                                         plan->sort->sortOperators,
 +                                                                         plan->sort->sortCollations,
 +                                                                         plan->sort->nullsFirst,
 +                                                                         combiner,
 +                                                                         work_mem);
 +              }
 +              /*
 +               * Track the two-phase primary-mode state machine: after phase one
 +               * we set probing_primary; after phase two the node is fully bound.
 +               */
 +              if (primary_mode)
 +              {
 +                      if (combiner->probing_primary)
 +                      {
 +                              combiner->probing_primary = false;
 +                              node->bound = true;
 +                      }
 +                      else
 +                              combiner->probing_primary = true;
 +              }
 +              else
 +                      node->bound = true;
 +      }
 +
 +      if (combiner->tuplesortstate)
 +      {
 +              if (tuplesort_gettupleslot((Tuplesortstate *) combiner->tuplesortstate,
++                                                                 true, true, resultslot, NULL))
 +              {
 +                      if (log_remotesubplan_stats)
 +                              ShowUsageCommon("ExecRemoteSubplan", &start_r, &start_t);
 +                      return resultslot;
 +              }
 +      }
 +      else
 +      {
 +              TupleTableSlot *slot = FetchTuple(combiner);
 +              if (!TupIsNull(slot))
 +              {
 +                      if (log_remotesubplan_stats)
 +                              ShowUsageCommon("ExecRemoteSubplan", &start_r, &start_t);
 +                      return slot;
 +              }
 +              else if (combiner->probing_primary)
 +                      /* phase1 is successfully completed, run on other nodes */
 +                      goto primary_mode_phase_two;
 +      }
 +      /* No more tuples: surface any error collected from the data nodes */
 +      if (combiner->errorMessage)
 +              pgxc_node_report_error(combiner);
 +
 +      if (log_remotesubplan_stats)
 +              ShowUsageCommon("ExecRemoteSubplan", &start_r, &start_t);
 +
 +      return NULL;
 +}
 +
 +
 +void
 +ExecReScanRemoteSubplan(RemoteSubplanState *node)
 +{
 +      ResponseCombiner *combiner = (ResponseCombiner *)node;
 +
 +      /*
 +       * If we haven't queried remote nodes yet, just return. If outerplan'
 +       * chgParam is not NULL then it will be re-scanned by ExecProcNode,
 +       * else - no reason to re-scan it at all.
 +       */
 +      if (!node->bound)
 +              return;
 +
 +      /*
 +       * If we execute locally rescan local copy of the plan
 +       */
 +      if (outerPlanState(node))
 +              ExecReScan(outerPlanState(node));
 +
 +      /*
 +       * Consume any possible pending input
 +       */
 +      pgxc_connections_cleanup(combiner);
 +
 +      /* misc cleanup */
 +      combiner->command_complete_count = 0;
 +      combiner->description_count = 0;
 +
 +      /*
 +       * Force query is re-bound with new parameters
 +       */
 +      node->bound = false;
 +}
 +
 +
 +/*
 + * ExecEndRemoteSubplan
 + *      Shut down a remote subplan node: end the local plan copy, drain
 + *      pending input, update coordinator statistics, close remote portals
 + *      and statements, and wait for every connection to return to the
 + *      ReadyForQuery state before freeing the node.
 + */
 +void
 +ExecEndRemoteSubplan(RemoteSubplanState *node)
 +{
 +      ResponseCombiner *combiner = (ResponseCombiner *)node;
 +      RemoteSubplan    *plan = (RemoteSubplan *) combiner->ss.ps.plan;
 +      int i;
 +      struct rusage   start_r;
 +      struct timeval          start_t;
 +
 +      if (log_remotesubplan_stats)
 +              ResetUsageCommon(&start_r, &start_t);
 +
 +      if (outerPlanState(node))
 +              ExecEndNode(outerPlanState(node));
 +      if (node->locator)
 +              freeLocator(node->locator);
 +
 +      /*
 +       * Consume any possible pending input
 +       */
 +      if (node->bound)
 +              pgxc_connections_cleanup(combiner);
 +
 +      /*
 +       * Update coordinator statistics
 +       */
 +      if (IS_PGXC_COORDINATOR)
 +      {
 +              EState *estate = combiner->ss.ps.state;
 +
 +              if (estate->es_num_result_relations > 0 && estate->es_processed > 0)
 +              {
 +                      switch (estate->es_plannedstmt->commandType)
 +                      {
 +                              case CMD_INSERT:
 +                                      /* One statement can insert into only one relation */
 +                                      pgstat_count_remote_insert(
 +                                                              estate->es_result_relations[0].ri_RelationDesc,
 +                                                              estate->es_processed);
 +                                      break;
 +                              case CMD_UPDATE:
 +                              case CMD_DELETE:
 +                                      {
 +                                              /*
 +                                               * We can not determine here how many rows were updated
 +                                               * or deleted in each table, so assume the same number
 +                                               * of affected rows in each table.
 +                                               * If the resulting number of rows is 0 because of
 +                                               * rounding, increment each counter by at least 1.
 +                                               */
 +                                              int             i;
 +                                              int     n;
 +                                              bool    update;
 +
 +                                              update = (estate->es_plannedstmt->commandType == CMD_UPDATE);
 +                                              n = estate->es_processed / estate->es_num_result_relations;
 +                                              if (n == 0)
 +                                                      n = 1;
 +                                              for (i = 0; i < estate->es_num_result_relations; i++)
 +                                              {
 +                                                      Relation r;
 +                                                      r = estate->es_result_relations[i].ri_RelationDesc;
 +                                                      if (update)
 +                                                              pgstat_count_remote_update(r, n);
 +                                                      else
 +                                                              pgstat_count_remote_delete(r, n);
 +                                              }
 +                                      }
 +                                      break;
 +                              default:
 +                                      /* nothing to count */
 +                                      break;
 +                      }
 +              }
 +      }
 +
 +      /*
 +       * Close portals. While cursors_connections exist there are open portals
 +       */
 +      if (combiner->cursor)
 +      {
 +              /* Restore connections where there are active statements */
 +              combiner->conn_count = combiner->cursor_count;
 +              memcpy(combiner->connections, combiner->cursor_connections,
 +                                      combiner->cursor_count * sizeof(PGXCNodeHandle *));
 +              for (i = 0; i < combiner->cursor_count; i++)
 +              {
 +                      PGXCNodeHandle *conn;
 +
 +                      conn = combiner->cursor_connections[i];
 +
 +                      CHECK_OWNERSHIP(conn, combiner);
 +
 +                      if (pgxc_node_send_close(conn, false, combiner->cursor) != 0)
 +                              ereport(ERROR,
 +                                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                                               errmsg("Failed to close data node cursor")));
 +              }
 +              /* The cursor stuff is not needed */
 +              combiner->cursor = NULL;
 +              combiner->cursor_count = 0;
 +              pfree(combiner->cursor_connections);
 +              combiner->cursor_connections = NULL;
 +      }
 +
 +      /* Close statements, even if they never were bound */
 +      for (i = 0; i < combiner->conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn;
 +              char                    cursor[NAMEDATALEN];
 +
 +              if (plan->cursor)
 +              {
 +                      if (plan->unique)
 +                              snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique);
 +                      else
 +                              /*
 +                               * NOTE(review): strncpy does not NUL-terminate if
 +                               * plan->cursor is NAMEDATALEN bytes or longer; confirm
 +                               * cursor names are always shorter, or use strlcpy.
 +                               */
 +                              strncpy(cursor, plan->cursor, NAMEDATALEN);
 +              }
 +              else
 +                      cursor[0] = '\0';
 +
 +              conn = combiner->connections[i];
 +
 +              CHECK_OWNERSHIP(conn, combiner);
 +
 +              if (pgxc_node_send_close(conn, true, cursor) != 0)
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to close data node statement")));
 +              /* Send SYNC and wait for ReadyForQuery */
 +              if (pgxc_node_send_sync(conn) != 0)
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to synchronize data node")));
 +              /*
 +               * Formally connection is not in QUERY state, we set the state to read
 +               * CloseDone and ReadyForQuery responses. Upon receiving ReadyForQuery
 +               * state will be changed back to IDLE and conn->coordinator will be
 +               * cleared.
 +               */
 +              PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_CLOSE);
 +      }
 +
 +      /* Pump responses until every connection has acknowledged the close */
 +      while (combiner->conn_count > 0)
 +      {
 +              if (pgxc_node_receive(combiner->conn_count,
 +                                                        combiner->connections, NULL))
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_INTERNAL_ERROR),
 +                                       errmsg("Failed to close remote subplan")));
 +              i = 0;
 +              while (i < combiner->conn_count)
 +              {
 +                      int res = handle_response(combiner->connections[i], combiner);
 +                      if (res == RESPONSE_EOF)
 +                      {
 +                              i++;
 +                      }
 +                      else if (res == RESPONSE_READY)
 +                      {
 +                              /* Done, connection is ready for query; compact the array */
 +                              if (--combiner->conn_count > i)
 +                                      combiner->connections[i] =
 +                                                      combiner->connections[combiner->conn_count];
 +                      }
 +                      else if (res == RESPONSE_DATAROW)
 +                      {
 +                              /*
 +                               * If we are finishing slowly running remote subplan while it
 +                               * is still working (because of Limit, for example) it may
 +                               * produce one or more tuples between connection cleanup and
 +                               * handling Close command. One tuple does not cause any problem,
 +                               * but if it will not be read the next tuple will trigger
 +                               * assertion failure. So if we got a tuple, just read and
 +                               * discard it here.
 +                               */
 +                              pfree(combiner->currentRow);
 +                              combiner->currentRow = NULL;
 +                      }
 +                      /* Ignore other possible responses */
 +              }
 +      }
 +
 +      ValidateAndCloseCombiner(combiner);
 +      pfree(node);
 +
 +      if (log_remotesubplan_stats)
 +              ShowUsageCommon("ExecEndRemoteSubplan", &start_r, &start_t);
 +}
 +
 +/*
 + * pgxc_node_report_error
 + * Throw error from Datanode if any.
 + */
 +static void
 +pgxc_node_report_error(ResponseCombiner *combiner)
 +{
 +      /* If no combiner, nothing to do */
 +      if (!combiner)
 +              return;
 +      if (combiner->errorMessage)
 +      {
 +              char *code = combiner->errorCode;
 +              if ((combiner->errorDetail == NULL) && (combiner->errorHint == NULL))
 +                      ereport(ERROR,
 +                                      (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
 +                                      errmsg("%s", combiner->errorMessage)));
 +              else if ((combiner->errorDetail != NULL) && (combiner->errorHint != NULL))
 +                      ereport(ERROR,
 +                                      (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
 +                                      errmsg("%s", combiner->errorMessage),
 +                                      errdetail("%s", combiner->errorDetail),
 +                                      errhint("%s", combiner->errorHint)));
 +              else if (combiner->errorDetail != NULL)
 +                      ereport(ERROR,
 +                                      (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
 +                                      errmsg("%s", combiner->errorMessage),
 +                                      errdetail("%s", combiner->errorDetail)));
 +              else
 +                      ereport(ERROR,
 +                                      (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
 +                                      errmsg("%s", combiner->errorMessage),
 +                                      errhint("%s", combiner->errorHint)));
 +      }
 +}
 +
 +
 +/*
 + * get_success_nodes:
 + * Currently called to print a user-friendly message about
 + * which nodes the query failed.
 + * Gets all the nodes where no 'E' (error) messages were received; i.e. where the
 + * query ran successfully.
 + */
 +static ExecNodes *
 +get_success_nodes(int node_count, PGXCNodeHandle **handles, char node_type, StringInfo failednodes)
 +{
 +      ExecNodes *success_nodes = NULL;
 +      int i;
 +
 +      for (i = 0; i < node_count; i++)
 +      {
 +              PGXCNodeHandle *handle = handles[i];
 +              int nodenum = PGXCNodeGetNodeId(handle->nodeoid, &node_type);
 +
 +              if (!handle->error)
 +              {
 +                      if (!success_nodes)
 +                              success_nodes = makeNode(ExecNodes);
 +                      success_nodes->nodeList = lappend_int(success_nodes->nodeList, nodenum);
 +              }
 +              else
 +              {
 +                      if (failednodes->len == 0)
 +                              appendStringInfo(failednodes, "Error message received from nodes:");
 +                      appendStringInfo(failednodes, " %s#%d",
 +                              (node_type == PGXC_NODE_COORDINATOR ? "coordinator" : "datanode"),
 +                              nodenum + 1);
 +              }
 +      }
 +      return success_nodes;
 +}
 +
 +/*
 + * pgxc_all_success_nodes: Uses get_success_nodes() to collect the
 + * user-friendly message from coordinator as well as datanode.
 + */
 +void
 +pgxc_all_success_nodes(ExecNodes **d_nodes, ExecNodes **c_nodes, char **failednodes_msg)
 +{
 +      PGXCNodeAllHandles *connections = get_exec_connections(NULL, NULL, EXEC_ON_ALL_NODES, true);
 +      StringInfoData failednodes;
 +      initStringInfo(&failednodes);
 +
 +      *d_nodes = get_success_nodes(connections->dn_conn_count,
 +                                   connections->datanode_handles,
 +                                                               PGXC_NODE_DATANODE,
 +                                                               &failednodes);
 +
 +      *c_nodes = get_success_nodes(connections->co_conn_count,
 +                                   connections->coord_handles,
 +                                                               PGXC_NODE_COORDINATOR,
 +                                                               &failednodes);
 +
 +      if (failednodes.len == 0)
 +              *failednodes_msg = NULL;
 +      else
 +              *failednodes_msg = failednodes.data;
 +
 +      pfree_pgxc_all_handles(connections);
 +}
 +
 +
 +/*
 + * set_dbcleanup_callback:
 + * Register a callback function which does some non-critical cleanup tasks
 + * on xact success or abort, such as tablespace/database directory cleanup.
 + */
 +void set_dbcleanup_callback(xact_callback function, void *paraminfo, int paraminfo_size)
 +{
 +      void *fparams;
 +
 +      fparams = MemoryContextAlloc(TopMemoryContext, paraminfo_size);
 +      memcpy(fparams, paraminfo, paraminfo_size);
 +
 +      dbcleanup_info.function = function;
 +      dbcleanup_info.fparams = fparams;
 +}
 +
 +/*
 + * AtEOXact_DBCleanup: To be called at post-commit or pre-abort.
 + * Calls the cleanup function registered during this transaction, if any.
 + */
 +void AtEOXact_DBCleanup(bool isCommit)
 +{
 +      if (dbcleanup_info.function)
 +              (*dbcleanup_info.function)(isCommit, dbcleanup_info.fparams);
 +
 +      /*
 +       * Just reset the callbackinfo. We anyway don't want this to be called again,
 +       * until explicitly set.
 +       */
 +      dbcleanup_info.function = NULL;
 +      if (dbcleanup_info.fparams)
 +      {
 +              pfree(dbcleanup_info.fparams);
 +              dbcleanup_info.fparams = NULL;
 +      }
 +}
 +
 +char *
 +GetImplicit2PCGID(const char *implicit2PC_head, bool localWrite)
 +{
 +      int dnCount = 0, coordCount = 0;
 +      int dnNodeIds[MaxDataNodes];
 +      int coordNodeIds[MaxCoords];
 +      MemoryContext oldContext = CurrentMemoryContext;
 +      StringInfoData str;
 +      int i;
 +
 +      oldContext = MemoryContextSwitchTo(TopTransactionContext);
 +      initStringInfo(&str);
 +      /*
 +       * Check how many coordinators and datanodes are involved in this
 +       * transaction
 +       */
 +      pgxc_node_remote_count(&dnCount, dnNodeIds, &coordCount, coordNodeIds);
 +      appendStringInfo(&str, "%s%u:%s:%c:%d:%d",
 +                      implicit2PC_head,
 +                      GetTopTransactionId(),
 +                      PGXCNodeName,
 +                      localWrite ? 'T' : 'F',
 +                      dnCount,
 +                      coordCount + (localWrite ? 1 : 0));
 +
 +      for (i = 0; i < dnCount; i++)
 +              appendStringInfo(&str, ":%d", dnNodeIds[i]);
 +      for (i = 0; i < coordCount; i++)
 +              appendStringInfo(&str, ":%d", coordNodeIds[i]);
 +
 +      if (localWrite)
 +              appendStringInfo(&str, ":%d", PGXCNodeIdentifier);
 +
 +      MemoryContextSwitchTo(oldContext);
 +
 +      return str.data;
 +}
index eafd9cbbe0b927ccd6f53266a94ff2aef61af020,0000000000000000000000000000000000000000..809da4f1d2fa77147e948234868290a990f1a224
mode 100644,000000..100644
--- /dev/null
@@@ -1,3228 -1,0 +1,3229 @@@
-       PG_RETURN_NAME(PGXCNodeName);
 +/*-------------------------------------------------------------------------
 + *
 + * pgxcnode.c
 + *
 + *      Functions for the Coordinator communicating with the PGXC nodes:
 + *      Datanodes and Coordinators
 + *
 + *
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
 + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
 + *
 + * IDENTIFICATION
 + *      $$
 + *
 + *
 + *-------------------------------------------------------------------------
 + */
 +
 +#include "postgres.h"
 +#include <poll.h>
 +
 +#ifdef __sun
 +#include <sys/filio.h>
 +#endif
 +
 +#include <sys/time.h>
 +#include <sys/types.h>
 +#include <sys/ioctl.h>
 +#include <stdlib.h>
 +#include <string.h>
 +#include <unistd.h>
 +#include <errno.h>
 +#include "access/gtm.h"
 +#include "access/transam.h"
 +#include "access/xact.h"
 +#include "access/htup_details.h"
 +#include "catalog/pg_type.h"
 +#include "commands/prepare.h"
 +#include "gtm/gtm_c.h"
 +#include "nodes/nodes.h"
 +#include "pgxc/pgxcnode.h"
 +#include "pgxc/execRemote.h"
 +#include "catalog/pgxc_node.h"
 +#include "catalog/pg_collation.h"
 +#include "pgxc/locator.h"
 +#include "pgxc/nodemgr.h"
 +#include "pgxc/pgxc.h"
 +#include "pgxc/poolmgr.h"
 +#include "tcop/dest.h"
++#include "storage/lwlock.h"
 +#include "utils/builtins.h"
 +#include "utils/elog.h"
 +#include "utils/memutils.h"
 +#include "utils/fmgroids.h"
 +#include "utils/snapmgr.h"
 +#include "utils/syscache.h"
 +#include "utils/lsyscache.h"
 +#include "utils/formatting.h"
 +#include "utils/tqual.h"
 +#include "../interfaces/libpq/libpq-fe.h"
 +#ifdef XCP
 +#include "miscadmin.h"
 +#include "storage/ipc.h"
 +#include "pgxc/pause.h"
 +#include "utils/snapmgr.h"
 +#endif
 +
 +#define CMD_ID_MSG_LEN 8
 +
 +/* Number of connections held */
 +static int    datanode_count = 0;
 +static int    coord_count = 0;
 +
 +/*
 + * Datanode handles, allocated in TopMemoryContext
 + * when the multinode executor is initialized.
 + * Used by the Coordinator to talk to Datanodes within a transaction.
 + */
 +static PGXCNodeHandle *dn_handles = NULL;
 +
 +/*
 + * Coordinator handles, allocated in TopMemoryContext
 + * when the multinode executor is initialized.
 + * Used by the Coordinator to talk to other Coordinators within a transaction.
 + */
 +static PGXCNodeHandle *co_handles = NULL;
 +
 +/* Current size of dn_handles and co_handles */
 +int                   NumDataNodes;
 +int           NumCoords;
 +
 +
 +#ifdef XCP
 +volatile bool HandlesInvalidatePending = false;
 +volatile bool HandlesRefreshPending = false;
 +
 +/*
 + * Session and transaction parameters need to be set on newly connected
 + * remote nodes.
 + */
 +static List *session_param_list = NIL;
 +static List   *local_param_list = NIL;
 +static StringInfo     session_params;
 +static StringInfo     local_params;
 +
 +typedef struct
 +{
 +      NameData name;
 +      NameData value;
 +      int              flags;
 +} ParamEntry;
 +
 +
 +static bool DoInvalidateRemoteHandles(void);
 +static bool DoRefreshRemoteHandles(void);
 +#endif
 +
 +#ifdef XCP
 +static void pgxc_node_init(PGXCNodeHandle *handle, int sock,
 +              bool global_session, int pid);
 +#else
 +static void pgxc_node_init(PGXCNodeHandle *handle, int sock);
 +#endif
 +static void pgxc_node_free(PGXCNodeHandle *handle);
 +static void pgxc_node_all_free(void);
 +
 +static int    get_int(PGXCNodeHandle * conn, size_t len, int *out);
 +static int    get_char(PGXCNodeHandle * conn, char *out);
 +
 +
 +/*
 + * Initialize PGXCNodeHandle struct
 + */
 +static void
 +init_pgxc_handle(PGXCNodeHandle *pgxc_handle)
 +{
 +      /*
 +       * A valid socket descriptor is a small non-negative integer;
 +       * NO_SOCKET indicates the handle is not initialized yet
 +       */
 +      pgxc_handle->sock = NO_SOCKET;
 +
 +      /* Initialise buffers */
 +      pgxc_handle->error = NULL;
 +      pgxc_handle->outSize = 16 * 1024;
 +      pgxc_handle->outBuffer = (char *) palloc(pgxc_handle->outSize);
 +      pgxc_handle->inSize = 16 * 1024;
 +
 +      pgxc_handle->inBuffer = (char *) palloc(pgxc_handle->inSize);
 +      pgxc_handle->combiner = NULL;
 +      pgxc_handle->inStart = 0;
 +      pgxc_handle->inEnd = 0;
 +      pgxc_handle->inCursor = 0;
 +      pgxc_handle->outEnd = 0;
 +      pgxc_handle->needSync = false;
 +
 +      if (pgxc_handle->outBuffer == NULL || pgxc_handle->inBuffer == NULL)
 +      {
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_OUT_OF_MEMORY),
 +                               errmsg("out of memory")));
 +      }
 +}
 +
 +
 +/*
 + * Allocate and initialize memory to store Datanode and Coordinator handles.
 + */
 +void
 +InitMultinodeExecutor(bool is_force)
 +{
 +      int                             count;
 +      Oid                             *coOids, *dnOids;
 +#ifdef XCP
 +      MemoryContext   oldcontext;
 +#endif
 +
 +
 +      /* Free all the existing information first */
 +      if (is_force)
 +              pgxc_node_all_free();
 +
 +      /* This function could get called multiple times because of sigjmp */
 +      if (dn_handles != NULL &&
 +              co_handles != NULL)
 +              return;
 +
 +      /* Update node table in the shared memory */
 +      PgxcNodeListAndCount();
 +
 +      /* Get classified list of node Oids */
 +      PgxcNodeGetOids(&coOids, &dnOids, &NumCoords, &NumDataNodes, true);
 +
 +#ifdef XCP
 +      /*
 +       * Coordinator and datanode handles should be available during all the
 +       * session lifetime
 +       */
 +      oldcontext = MemoryContextSwitchTo(TopMemoryContext);
 +#endif
 +
 +      /* Do proper initialization of handles */
 +      if (NumDataNodes > 0)
 +              dn_handles = (PGXCNodeHandle *)
 +                      palloc(NumDataNodes * sizeof(PGXCNodeHandle));
 +      if (NumCoords > 0)
 +              co_handles = (PGXCNodeHandle *)
 +                      palloc(NumCoords * sizeof(PGXCNodeHandle));
 +
 +      if ((!dn_handles && NumDataNodes > 0) ||
 +              (!co_handles && NumCoords > 0))
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_OUT_OF_MEMORY),
 +                               errmsg("out of memory for node handles")));
 +
 +      /* Initialize new empty slots */
 +      for (count = 0; count < NumDataNodes; count++)
 +      {
 +              init_pgxc_handle(&dn_handles[count]);
 +              dn_handles[count].nodeoid = dnOids[count];
 +              dn_handles[count].nodeid = get_pgxc_node_id(dnOids[count]);
 +              strncpy(dn_handles[count].nodename, get_pgxc_nodename(dnOids[count]),
 +                              NAMEDATALEN);
 +              strncpy(dn_handles[count].nodehost, get_pgxc_nodehost(dnOids[count]),
 +                              NAMEDATALEN);
 +              dn_handles[count].nodeport = get_pgxc_nodeport(dnOids[count]);
 +      }
 +      for (count = 0; count < NumCoords; count++)
 +      {
 +              init_pgxc_handle(&co_handles[count]);
 +              co_handles[count].nodeoid = coOids[count];
 +              co_handles[count].nodeid = get_pgxc_node_id(coOids[count]);
 +              strncpy(co_handles[count].nodename, get_pgxc_nodename(coOids[count]),
 +                              NAMEDATALEN);
 +              strncpy(co_handles[count].nodehost, get_pgxc_nodehost(coOids[count]),
 +                              NAMEDATALEN);
 +              co_handles[count].nodeport = get_pgxc_nodeport(coOids[count]);
 +      }
 +
 +      datanode_count = 0;
 +      coord_count = 0;
 +      PGXCNodeId = 0;
 +
 +      MemoryContextSwitchTo(oldcontext);
 +
 +      if (IS_PGXC_COORDINATOR)
 +      {
 +              for (count = 0; count < NumCoords; count++)
 +              {
 +                      if (pg_strcasecmp(PGXCNodeName,
 +                                         get_pgxc_nodename(co_handles[count].nodeoid)) == 0)
 +                              PGXCNodeId = count + 1;
 +              }
 +      }
 +      else /* DataNode */
 +      {
 +              for (count = 0; count < NumDataNodes; count++)
 +              {
 +                      if (pg_strcasecmp(PGXCNodeName,
 +                                         get_pgxc_nodename(dn_handles[count].nodeoid)) == 0)
 +                              PGXCNodeId = count + 1;
 +              }
 +      }
 +}
 +
 +/*
 + * Builds up a connection string
 + */
 +char *
 +PGXCNodeConnStr(char *host, int port, char *dbname,
 +                              char *user, char *pgoptions, char *remote_type, char *parent_node)
 +{
 +      char       *out,
 +                              connstr[1024];
 +      int                     num;
 +
 +      /*
 +       * Build up connection string
 +       * remote type can be Coordinator, Datanode or application.
 +       */
 +      num = snprintf(connstr, sizeof(connstr),
 +                                 "host=%s port=%d dbname=%s user=%s application_name='pgxc:%s' sslmode=disable options='-c remotetype=%s -c parentnode=%s %s'",
 +                                 host, port, dbname, user, parent_node, remote_type, parent_node,
 +                                 pgoptions);
 +
 +      /* Check for overflow */
 +      if (num > 0 && num < sizeof(connstr))
 +      {
 +              /* Output result */
 +              out = (char *) palloc(num + 1);
 +              strcpy(out, connstr);
 +              return out;
 +      }
 +
 +      /* return NULL if we have problem */
 +      return NULL;
 +}
 +
 +
 +/*
 + * Connect to a Datanode using a connection string
 + */
 +NODE_CONNECTION *
 +PGXCNodeConnect(char *connstr)
 +{
 +      PGconn     *conn;
 +
 +      /* Delegate call to the pglib */
 +      conn = PQconnectdb(connstr);
 +      return (NODE_CONNECTION *) conn;
 +}
 +
 +int PGXCNodePing(const char *connstr)
 +{
 +      if (connstr[0])
 +      {
 +              PGPing status = PQping(connstr);
 +              if (status == PQPING_OK)
 +                      return 0;
 +              else
 +                      return 1;
 +      }
 +      else
 +              return -1;
 +}
 +
 +/*
 + * Close specified connection
 + */
 +void
 +PGXCNodeClose(NODE_CONNECTION *conn)
 +{
 +      /* Delegate call to the pglib */
 +      PQfinish((PGconn *) conn);
 +}
 +
 +/*
 + * Checks if connection active
 + */
 +int
 +PGXCNodeConnected(NODE_CONNECTION *conn)
 +{
 +      /* Delegate call to the pglib */
 +      PGconn     *pgconn = (PGconn *) conn;
 +
 +      /*
 +       * Simple check; a more comprehensive one would also
 +       * check if it is ready for query
 +       */
 +      return pgconn && PQstatus(pgconn) == CONNECTION_OK;
 +}
 +
 +
 +
 +/* Close the socket handle (this process' copy) and free occupied memory
 + *
 + * Note that we do not free the handle and its members. This will be
 + * taken care of when the transaction ends, when TopTransactionContext
 + * is destroyed in xact.c.
 + */
 +static void
 +pgxc_node_free(PGXCNodeHandle *handle)
 +{
 +      if (handle->sock != NO_SOCKET)
 +              close(handle->sock);
 +      handle->sock = NO_SOCKET;
 +}
 +
 +/*
 + * Free all the node handles cached
 + */
 +static void
 +pgxc_node_all_free(void)
 +{
 +      int i, j;
 +
 +      for (i = 0; i < 2; i++)
 +      {
 +              int num_nodes = 0;
 +              PGXCNodeHandle *array_handles;
 +
 +              switch (i)
 +              {
 +                      case 0:
 +                              num_nodes = NumCoords;
 +                              array_handles = co_handles;
 +                              break;
 +                      case 1:
 +                              num_nodes = NumDataNodes;
 +                              array_handles = dn_handles;
 +                              break;
 +                      default:
 +                              Assert(0);
 +              }
 +
 +              for (j = 0; j < num_nodes; j++)
 +              {
 +                      PGXCNodeHandle *handle = &array_handles[j];
 +                      pgxc_node_free(handle);
 +              }
 +              if (array_handles)
 +                      pfree(array_handles);
 +      }
 +
 +      co_handles = NULL;
 +      dn_handles = NULL;
 +      HandlesInvalidatePending = false;
 +      HandlesRefreshPending = false;
 +}
 +
 +/*
 + * Create and initialise internal structure to communicate to
 + * Datanode via supplied socket descriptor.
 + * Structure stores state info and I/O buffers
 + */
 +static void
 +pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid)
 +{
 +      char *init_str;
 +
 +      handle->sock = sock;
 +      handle->backend_pid = pid;
 +      handle->transaction_status = 'I';
 +      PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_IDLE);
 +      handle->read_only = true;
 +      handle->ck_resp_rollback = false;
 +      handle->combiner = NULL;
 +#ifdef DN_CONNECTION_DEBUG
 +      handle->have_row_desc = false;
 +#endif
 +      handle->error = NULL;
 +      handle->outEnd = 0;
 +      handle->inStart = 0;
 +      handle->inEnd = 0;
 +      handle->inCursor = 0;
 +      handle->needSync = false;
 +      /*
 +       * We got a new connection; set the session parameters on the remote
 +       * node if any are defined. The transaction parameters are sent after BEGIN
 +       */
 +      if (global_session)
 +      {
 +              init_str = PGXCNodeGetSessionParamStr();
 +              if (init_str)
 +              {
 +                      pgxc_node_set_query(handle, init_str);
 +              }
 +      }
 +}
 +
 +
 +/*
 + * Wait while at least one of specified connections has data available and read
 + * the data into the buffer
 + */
 +bool
 +pgxc_node_receive(const int conn_count,
 +                                PGXCNodeHandle ** connections, struct timeval * timeout)
 +{
 +#define ERROR_OCCURED         true
 +#define NO_ERROR_OCCURED      false
 +      int             i,
 +                      sockets_to_poll,
 +                      poll_val;
 +      bool    is_msg_buffered;
 +      long    timeout_ms;
 +      struct  pollfd pool_fd[conn_count];
 +
 +      /* sockets to be polled index */
 +      sockets_to_poll = 0;
 +
 +      is_msg_buffered = false;
 +      for (i = 0; i < conn_count; i++)
 +      {
 +              /* If connection has a buffered message */
 +              if (HAS_MESSAGE_BUFFERED(connections[i]))
 +              {
 +                      is_msg_buffered = true;
 +                      break;
 +              }
 +      }
 +
 +      for (i = 0; i < conn_count; i++)
 +      {
 +              /* If connection finished sending do not wait input from it */
 +              if (connections[i]->state == DN_CONNECTION_STATE_IDLE || HAS_MESSAGE_BUFFERED(connections[i]))
 +              {
 +                      pool_fd[i].fd = -1;
 +                      pool_fd[i].events = 0;
 +                      continue;
 +              }
 +
 +              /* prepare select params */
 +              if (connections[i]->sock > 0)
 +              {
 +                      pool_fd[i].fd = connections[i]->sock;
 +                      pool_fd[i].events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND;
 +                      sockets_to_poll++;
 +              }
 +              else
 +              {
 +                      /* flag as bad, it will be removed from the list */
 +                      PGXCNodeSetConnectionState(connections[i],
 +                                      DN_CONNECTION_STATE_ERROR_FATAL);
 +                      pool_fd[i].fd = -1;
 +                      pool_fd[i].events = 0;
 +              }
 +      }
 +
 +      /*
 +       * Return if we do not have connections to receive input
 +       */
 +      if (sockets_to_poll == 0)
 +      {
 +              if (is_msg_buffered)
 +                      return NO_ERROR_OCCURED;
 +              return ERROR_OCCURED;
 +      }
 +
 +      /* do conversion from the select behaviour */
 +      if ( timeout == NULL )
 +              timeout_ms = -1;
 +      else
 +              timeout_ms = (timeout->tv_sec * (uint64_t) 1000) + (timeout->tv_usec / 1000);
 +
 +retry:
 +      CHECK_FOR_INTERRUPTS();
 +      poll_val  = poll(pool_fd, conn_count, timeout_ms);
 +      if (poll_val < 0)
 +      {
 +              /* error - retry if EINTR */
 +              if (errno == EINTR  || errno == EAGAIN)
 +                      goto retry;
 +
 +              elog(WARNING, "poll() error: %d", errno);
 +              if (errno)
 +                      return ERROR_OCCURED;
 +              return NO_ERROR_OCCURED;
 +      }
 +
 +      if (poll_val == 0)
 +      {
 +              /* Handle timeout */
 +              elog(DEBUG1, "timeout %ld while waiting for any response from %d connections", timeout_ms,conn_count);
 +              for (i = 0; i < conn_count; i++)
 +                      PGXCNodeSetConnectionState(connections[i],
 +                                      DN_CONNECTION_STATE_ERROR_FATAL);
 +              return NO_ERROR_OCCURED;
 +      }
 +
 +      /* read data */
 +      for (i = 0; i < conn_count; i++)
 +      {
 +              PGXCNodeHandle *conn = connections[i];
 +
 +              if( pool_fd[i].fd == -1 )
 +                      continue;
 +
 +              if ( pool_fd[i].fd == conn->sock )
 +              {
 +                      if( pool_fd[i].revents & POLLIN )
 +                      {
 +                              int     read_status = pgxc_node_read_data(conn, true);
 +                              if ( read_status == EOF || read_status < 0 )
 +                              {
 +                                      /* Can not read - no more actions, just discard connection */
 +                                      PGXCNodeSetConnectionState(conn,
 +                                                      DN_CONNECTION_STATE_ERROR_FATAL);
 +                                      add_error_message(conn, "unexpected EOF on datanode connection.");
 +                                      elog(WARNING, "unexpected EOF on datanode oid connection: %d", conn->nodeoid);
 +
 +                                      /*
 +                                       * before returning, also update the shared health
 +                                       * status field to indicate that this node could be
 +                                       * possibly unavailable.
 +                                       *
 +                                       * Note that this error could be due to a stale handle
 +                                       * and it's possible that another backend might have
 +                                       * already updated the health status OR the node
 +                                       * might have already come back since the last disruption
 +                                       */
 +                                      PoolPingNodeRecheck(conn->nodeoid);
 +
 +                                      /* Should we read from the other connections before returning? */
 +                                      return ERROR_OCCURED;
 +                              }
 +
 +                      }
 +                      else if (
 +                                      (pool_fd[i].revents & POLLERR) ||
 +                                      (pool_fd[i].revents & POLLHUP) ||
 +                                      (pool_fd[i].revents & POLLNVAL)
 +                                      )
 +                      {
 +                              PGXCNodeSetConnectionState(connections[i],
 +                                              DN_CONNECTION_STATE_ERROR_FATAL);
 +                              add_error_message(conn, "unexpected network error on datanode connection");
 +                              elog(WARNING, "unexpected EOF on datanode oid connection: %d with event %d", conn->nodeoid,pool_fd[i].revents);
 +                              /* Should we check/read from the other connections before returning? */
 +                              return ERROR_OCCURED;
 +                      }
 +              }
 +      }
 +      return NO_ERROR_OCCURED;
 +}
 +
 +/*
 + * Is there any data, sent by the PGXC node connection, enqueued
 + * in the TCP input buffer waiting to be read
 + */
 +
 +int
 +pgxc_node_is_data_enqueued(PGXCNodeHandle *conn)
 +{
 +      int ret;
 +      int enqueued;
 +
 +      if (conn->sock < 0)
 +              return 0;
 +      ret = ioctl(conn->sock, FIONREAD, &enqueued);
 +      if (ret != 0)
 +              return 0;
 +
 +      return enqueued;
 +}
 +
 +/*
 + * Read up incoming messages from the PGXC node connection
 + */
 +int
 +pgxc_node_read_data(PGXCNodeHandle *conn, bool close_if_error)
 +{
 +      int                     someread = 0;
 +      int                     nread;
 +
 +      if (conn->sock < 0)
 +      {
 +              if (close_if_error)
 +                      add_error_message(conn, "bad socket");
 +              return EOF;
 +      }
 +
 +      /* Left-justify any data in the buffer to make room */
 +      if (conn->inStart < conn->inEnd)
 +      {
 +              if (conn->inStart > 0)
 +              {
 +                      memmove(conn->inBuffer, conn->inBuffer + conn->inStart,
 +                                      conn->inEnd - conn->inStart);
 +                      conn->inEnd -= conn->inStart;
 +                      conn->inCursor -= conn->inStart;
 +                      conn->inStart = 0;
 +              }
 +      }
 +      else
 +      {
 +              /* buffer is logically empty, reset it */
 +              conn->inStart = conn->inCursor = conn->inEnd = 0;
 +      }
 +
 +      /*
 +       * If the buffer is fairly full, enlarge it. We need to be able to enlarge
 +       * the buffer in case a single message exceeds the initial buffer size. We
 +       * enlarge before filling the buffer entirely so as to avoid asking the
 +       * kernel for a partial packet. The magic constant here should be large
 +       * enough for a TCP packet or Unix pipe bufferload.  8K is the usual pipe
 +       * buffer size, so...
 +       */
 +      if (conn->inSize - conn->inEnd < 8192)
 +      {
 +              if (ensure_in_buffer_capacity(conn->inEnd + (size_t) 8192, conn) != 0)
 +              {
 +                      /*
 +                       * We don't insist that the enlarge worked, but we need some room
 +                       */
 +                      if (conn->inSize - conn->inEnd < 100)
 +                      {
 +                              if (close_if_error)
 +                                      add_error_message(conn, "can not allocate buffer");
 +                              return -1;
 +                      }
 +              }
 +      }
 +
 +retry:
 +      nread = recv(conn->sock, conn->inBuffer + conn->inEnd,
 +                               conn->inSize - conn->inEnd, 0);
 +
 +      if (nread < 0)
 +      {
 +              if (errno == EINTR)
 +                      goto retry;
 +              /* Some systems return EAGAIN/EWOULDBLOCK for no data */
 +#ifdef EAGAIN
 +              if (errno == EAGAIN)
 +                      return someread;
 +#endif
 +#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
 +              if (errno == EWOULDBLOCK)
 +                      return someread;
 +#endif
 +              /* We might get ECONNRESET here if using TCP and backend died */
 +#ifdef ECONNRESET
 +              if (errno == ECONNRESET)
 +              {
 +                      /*
 +                       * OK, we are getting a zero read even though select() says ready. This
 +                       * means the connection has been closed.  Cope.
 +                       */
 +                      if (close_if_error)
 +                      {
 +                              add_error_message(conn,
 +                                                              "Datanode closed the connection unexpectedly\n"
 +                                      "\tThis probably means the Datanode terminated abnormally\n"
 +                                                              "\tbefore or while processing the request.\n");
 +                              PGXCNodeSetConnectionState(conn,
 +                                              DN_CONNECTION_STATE_ERROR_FATAL);       /* No more connection to
 +                                                                                                                      * backend */
 +                              closesocket(conn->sock);
 +                              conn->sock = NO_SOCKET;
 +                      }
 +                      return -1;
 +              }
 +#endif
 +              if (close_if_error)
 +                      add_error_message(conn, "could not receive data from server");
 +              return -1;
 +
 +      }
 +
 +      if (nread > 0)
 +      {
 +              conn->inEnd += nread;
 +
 +              /*
 +               * Hack to deal with the fact that some kernels will only give us back
 +               * 1 packet per recv() call, even if we asked for more and there is
 +               * more available.      If it looks like we are reading a long message,
 +               * loop back to recv() again immediately, until we run out of data or
 +               * buffer space.  Without this, the block-and-restart behavior of
 +               * libpq's higher levels leads to O(N^2) performance on long messages.
 +               *
 +               * Since we left-justified the data above, conn->inEnd gives the
 +               * amount of data already read in the current message.  We consider
 +               * the message "long" once we have acquired 32k ...
 +               */
 +              if (conn->inEnd > 32768 &&
 +                      (conn->inSize - conn->inEnd) >= 8192)
 +              {
 +                      someread = 1;
 +                      goto retry;
 +              }
 +              return 1;
 +      }
 +
 +      if (nread == 0)
 +      {
 +              if (close_if_error)
 +                      elog(DEBUG1, "nread returned 0");
 +              return EOF;
 +      }
 +
 +      if (someread)
 +              return 1;                               /* got a zero read after successful tries */
 +
 +      return 0;
 +}
 +
 +
 +/*
 + * Get one character from the connection buffer and advance cursor
 + */
 +static int
 +get_char(PGXCNodeHandle * conn, char *out)
 +{
 +      if (conn->inCursor < conn->inEnd)
 +      {
 +              *out = conn->inBuffer[conn->inCursor++];
 +              return 0;
 +      }
 +      return EOF;
 +}
 +
 +/*
 + * Read an integer from the connection buffer and advance cursor
 + */
 +static int
 +get_int(PGXCNodeHandle *conn, size_t len, int *out)
 +{
 +      unsigned short tmp2;
 +      unsigned int tmp4;
 +
 +      if (conn->inCursor + len > conn->inEnd)
 +              return EOF;
 +
 +      switch (len)
 +      {
 +              case 2:
 +                      memcpy(&tmp2, conn->inBuffer + conn->inCursor, 2);
 +                      conn->inCursor += 2;
 +                      *out = (int) ntohs(tmp2);
 +                      break;
 +              case 4:
 +                      memcpy(&tmp4, conn->inBuffer + conn->inCursor, 4);
 +                      conn->inCursor += 4;
 +                      *out = (int) ntohl(tmp4);
 +                      break;
 +              default:
 +                      add_error_message(conn, "not supported int size");
 +                      return EOF;
 +      }
 +
 +      return 0;
 +}
 +
 +
 +/*
 + * get_message
 + * If connection has enough data read entire message from the connection buffer
 + * and returns message type. Message data and data length are returned as
 + * var parameters.
 + * If buffer does not have enough data leaves cursor unchanged, changes
 + * connection status to DN_CONNECTION_STATE_QUERY indicating it needs to
 + * receive more and returns \0
 + * conn - connection to read from
 + * len - returned length of the data where msg is pointing to
 + * msg - returns pointer to memory in the incoming buffer. The buffer probably
 + * will be overwritten upon next receive, so if caller wants to refer it later
 + * it should make a copy.
 + */
 +char
 +get_message(PGXCNodeHandle *conn, int *len, char **msg)
 +{
 +      char            msgtype;
 +
 +      if (get_char(conn, &msgtype) || get_int(conn, 4, len))
 +      {
 +              /* Successful get_char would move cursor, restore position */
 +              conn->inCursor = conn->inStart;
 +              return '\0';
 +      }
 +
 +      *len -= 4;
 +
 +      if (conn->inCursor + *len > conn->inEnd)
 +      {
 +              /*
 +               * Not enough data in the buffer, we should read more.
 +               * Reading function will discard already consumed data in the buffer
 +               * till conn->inBegin. Then we want the message that is now partly in
 +               * the buffer to be read in completely, to avoid extra read/handle
 +               * cycles. The space needed is 1 byte for message type, 4 bytes for
 +               * message length and message itself which size is currently in *len.
 +               * The buffer may already be large enough, in this case the function
 +               * ensure_in_buffer_capacity() will immediately return
 +               */
 +              ensure_in_buffer_capacity(5 + (size_t) *len, conn);
 +              conn->inCursor = conn->inStart;
 +              return '\0';
 +      }
 +
 +      *msg = conn->inBuffer + conn->inCursor;
 +      conn->inCursor += *len;
 +      conn->inStart = conn->inCursor;
 +      return msgtype;
 +}
 +
 +
 +/*
 + * Release all Datanode and Coordinator connections
 + * back to pool and release occupied memory
 + */
 +void
 +release_handles(void)
 +{
 +      bool            destroy = false;
 +      int                     i;
 +
 +      if (HandlesInvalidatePending)
 +      {
 +              DoInvalidateRemoteHandles();
 +              return;
 +      }
 +
 +      /* don't free connection if holding a cluster lock */
 +      if (cluster_ex_lock_held)
 +              return;
 +
 +      if (datanode_count == 0 && coord_count == 0)
 +              return;
 +
 +      /* Do not release connections if we have prepared statements on nodes */
 +      if (HaveActiveDatanodeStatements())
 +              return;
 +
 +      /* Free Datanodes handles */
 +      for (i = 0; i < NumDataNodes; i++)
 +      {
 +              PGXCNodeHandle *handle = &dn_handles[i];
 +
 +              if (handle->sock != NO_SOCKET)
 +              {
 +                      /*
 +                       * Connections at this point should be completely inactive,
 +                       * otherwise abandon them. We cannot allow a connection that
 +                       * is not cleaned up to be returned to the pool.
 +                       */
 +                      if (handle->state != DN_CONNECTION_STATE_IDLE ||
 +                                      handle->transaction_status != 'I')
 +                      {
 +                              destroy = true;
 +                              elog(DEBUG1, "Connection to Datanode %d has unexpected state %d and will be dropped",
 +                                       handle->nodeoid, handle->state);
 +                      }
 +                      pgxc_node_free(handle);
 +              }
 +      }
 +
 +      if (IS_PGXC_COORDINATOR)
 +      {
 +              /* Collect Coordinator handles */
 +              for (i = 0; i < NumCoords; i++)
 +              {
 +                      PGXCNodeHandle *handle = &co_handles[i];
 +
 +                      if (handle->sock != NO_SOCKET)
 +                      {
 +                              /*
 +                               * Connections at this point should be completely inactive,
 +                               * otherwise abandon them. We cannot allow a connection that
 +                               * is not cleaned up to be returned to the pool.
 +                               */
 +                              if (handle->state != DN_CONNECTION_STATE_IDLE ||
 +                                              handle->transaction_status != 'I')
 +                              {
 +                                      destroy = true;
 +                                      elog(DEBUG1, "Connection to Coordinator %d has unexpected state %d and will be dropped",
 +                                                      handle->nodeoid, handle->state);
 +                              }
 +                              pgxc_node_free(handle);
 +                      }
 +              }
 +      }
 +
 +      /* And finally release all the connections on pooler */
 +      PoolManagerReleaseConnections(destroy);
 +
 +      datanode_count = 0;
 +      coord_count = 0;
 +}
 +
 +/*
 + * Ensure that the supplied buffer has enough capacity and if not, it's
 + * extended to an appropriate size.
 + *
 + * currbuf is the currently used buffer of currsize. bytes_needed is the
 + * minimum size required. We shall return the new buffer, if allocated
 + * successfully and set newsize_p to contain the size of the repalloced buffer.
 + * If allocation fails, NULL is returned.
 + *
 + * The function checks for requests beyond MaxAllocSize and throw an error.
 + */
 +static char *
 +ensure_buffer_capacity(char *currbuf, size_t currsize, size_t bytes_needed, size_t *newsize_p)
 +{
 +      char       *newbuf;
 +      Size            newsize = (Size) currsize;
 +
 +      if (((Size) bytes_needed) >= MaxAllocSize)
 +              ereport(ERROR,
 +                              (ENOSPC,
 +                               errmsg("out of memory"),
 +                               errdetail("Cannot enlarge buffer containing %ld bytes by %ld more bytes.",
 +                                                 currsize, bytes_needed)));
 +
 +      if (bytes_needed <= newsize)
 +      {
 +              *newsize_p = currsize;
 +              return currbuf;
 +      }
 +
 +      /*
 +       * The current size of the buffer should never be zero (init_pgxc_handle
 +       * guarantees that.
 +       */
 +      Assert(newsize > 0);
 +
 +      /*
 +       * Double the buffer size until we have enough space to hold bytes_needed
 +       */
 +      while (bytes_needed > newsize)
 +              newsize = 2 * newsize;
 +
 +      /*
 +       * Clamp to MaxAllocSize in case we went past it.  Note we are assuming
 +       * here that MaxAllocSize <= INT_MAX/2, else the above loop could
 +       * overflow.  We will still have newsize >= bytes_needed.
 +       */
 +      if (newsize > (int) MaxAllocSize)
 +              newsize = (int) MaxAllocSize;
 +
 +      newbuf = repalloc(currbuf, newsize);
 +      if (newbuf)
 +      {
 +              /* repalloc succeeded, set new size and return the buffer */
 +              *newsize_p = newsize;
 +              return newbuf;
 +      }
 +
 +      /*
 +       * If we fail to double the buffer, try to repalloc a buffer of the given
 +       * size, rounded to the next multiple of 8192 and see if that works.
 +       */
 +      newsize = bytes_needed;
 +      newsize = ((bytes_needed / 8192) + 1) * 8192;
 +
 +      newbuf = repalloc(currbuf, newsize);
 +      if (newbuf)
 +      {
 +              /* repalloc succeeded, set new size and return the buffer */
 +              *newsize_p = newsize;
 +              return newbuf;
 +      }
 +
 +      /* repalloc failed */
 +      return NULL;
 +}
 +
 +/*
 + * Ensure specified amount of data can fit to the incoming buffer and
 + * increase it if necessary
 + */
 +int
 +ensure_in_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle)
 +{
 +      size_t newsize;
 +      char *newbuf = ensure_buffer_capacity(handle->inBuffer, handle->inSize,
 +                      bytes_needed, &newsize);
 +      if (newbuf)
 +      {
 +              handle->inBuffer = newbuf;
 +              handle->inSize = newsize;
 +              return 0;
 +      }
 +      return EOF;
 +}
 +
 +/*
 + * Ensure specified amount of data can fit to the outgoing buffer and
 + * increase it if necessary
 + */
 +int
 +ensure_out_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle)
 +{
 +      size_t newsize;
 +      char *newbuf = ensure_buffer_capacity(handle->outBuffer, handle->outSize,
 +                      bytes_needed, &newsize);
 +      if (newbuf)
 +      {
 +              handle->outBuffer = newbuf;
 +              handle->outSize = newsize;
 +              return 0;
 +      }
 +      return EOF;
 +}
 +
 +
 +/*
 + * Send specified amount of data from the outgoing buffer over the connection
 + *
 + * Loops until 'len' bytes have been handed to the kernel.  Returns 0 on
 + * success; returns -1 on a fatal error, in which case the outgoing buffer
 + * is discarded (outEnd is reset to 0).  Bytes queued in outBuffer beyond
 + * 'len' are shifted to the front of the buffer before returning.
 + */
 +int
 +send_some(PGXCNodeHandle *handle, int len)
 +{
 +      char       *ptr = handle->outBuffer;
 +      int                     remaining = handle->outEnd;     /* all queued bytes, incl. beyond len */
 +      int                     result = 0;             /* always 0; error paths return -1 directly */
 +
 +      /* while there's still data to send */
 +      while (len > 0)
 +      {
 +              int                     sent;
 +
 +#ifndef WIN32
 +              sent = send(handle->sock, ptr, len, 0);
 +#else
 +              /*
 +               * Windows can fail on large sends, per KB article Q201213. The failure-point
 +               * appears to be different in different versions of Windows, but 64k should
 +               * always be safe.
 +               */
 +              sent = send(handle->sock, ptr, Min(len, 65536), 0);
 +#endif
 +
 +              if (sent < 0)
 +              {
 +                      /*
 +                       * Anything except EAGAIN/EWOULDBLOCK/EINTR is trouble. If it's
 +                       * EPIPE or ECONNRESET, assume we've lost the backend connection
 +                       * permanently.
 +                       */
 +                      switch (errno)
 +                      {
 +#ifdef EAGAIN
 +                              case EAGAIN:
 +                                      /* kernel buffer full: fall through to the poll below */
 +                                      break;
 +#endif
 +#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
 +                              case EWOULDBLOCK:
 +                                      break;
 +#endif
 +                              case EINTR:
 +                                      /* interrupted before anything was sent: retry immediately */
 +                                      continue;
 +
 +                              case EPIPE:
 +#ifdef ECONNRESET
 +                              case ECONNRESET:
 +#endif
 +                                      add_error_message(handle, "server closed the connection unexpectedly\n"
 +                                      "\tThis probably means the server terminated abnormally\n"
 +                                                        "\tbefore or while processing the request.\n");
 +
 +                                      /*
 +                                       * We used to close the socket here, but that's a bad idea
 +                                       * since there might be unread data waiting (typically, a
 +                                       * NOTICE message from the backend telling us it's
 +                                       * committing hara-kiri...).  Leave the socket open until
 +                                       * pqReadData finds no more data can be read.  But abandon
 +                                       * attempt to send data.
 +                                       */
 +                                      handle->outEnd = 0;
 +                                      return -1;
 +
 +                              default:
 +                                      add_error_message(handle, "could not send data to server");
 +                                      /* We don't assume it's a fatal error... */
 +                                      handle->outEnd = 0;
 +                                      return -1;
 +                      }
 +              }
 +              else
 +              {
 +                      /* advance past the bytes the kernel accepted */
 +                      ptr += sent;
 +                      len -= sent;
 +                      remaining -= sent;
 +              }
 +
 +              if (len > 0)
 +              {
 +                      struct pollfd pool_fd;
 +                      int poll_ret;
 +
 +                      /*
 +                       * Wait for the socket to become ready again to receive more data.
 +                       * For some cases, especially while writing large sums of data
 +                       * during COPY protocol and when the remote node is not capable of
 +                       * handling data at the same speed, we might otherwise go in a
 +                       * useless tight loop, consuming all available local resources
 +                       *
 +                       * Use a small timeout of 1s to avoid infinite wait
 +                       */
 +                      pool_fd.fd = handle->sock;
 +                      pool_fd.events = POLLOUT;
 +
 +                      poll_ret = poll(&pool_fd, 1, 1000);
 +                      if (poll_ret < 0)
 +                      {
 +                              if (errno == EAGAIN || errno == EINTR)
 +                                      continue;
 +                              else
 +                              {
 +                                      add_error_message(handle, "poll failed ");
 +                                      handle->outEnd = 0;
 +                                      return -1;
 +                              }
 +                      }
 +                      /* on timeout (poll_ret == 0) we simply retry the send */
 +                      else if (poll_ret == 1)
 +                      {
 +                              /* only POLLHUP is treated as an error here */
 +                              if (pool_fd.revents & POLLHUP)
 +                              {
 +                                      add_error_message(handle, "remote end disconnected");
 +                                      handle->outEnd = 0;
 +                                      return -1;
 +                              }
 +                      }
 +              }
 +      }
 +
 +      /* shift the remaining contents of the buffer */
 +      if (remaining > 0)
 +              memmove(handle->outBuffer, ptr, remaining);
 +      handle->outEnd = remaining;
 +
 +      return result;
 +}
 +
 +/*
 + * Send PARSE message with specified statement down to the Datanode
 + */
 +int
 +pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement,
 +                                              const char *query, short num_params, Oid *param_types)
 +{
 +      /* statement name size (allow NULL) */
 +      int                     stmtLen = statement ? strlen(statement) + 1 : 1;
 +      /* size of query string */
 +      int                     strLen = strlen(query) + 1;
 +      char            **paramTypes = (char **)palloc(sizeof(char *) * num_params);
 +      /* total size of parameter type names */
 +      int             paramTypeLen;
 +      /* message length */
 +      int                     msgLen;
 +      int                     cnt_params;
 +#ifdef USE_ASSERT_CHECKING
 +      size_t          old_outEnd = handle->outEnd;
 +#endif
 +
 +      /* if there are parameters, param_types should exist */
 +      Assert(num_params <= 0 || param_types);
 +      /* 2 bytes for number of parameters, preceding the type names */
 +      paramTypeLen = 2;
 +      /* find names of the types of parameters */
 +      for (cnt_params = 0; cnt_params < num_params; cnt_params++)
 +      {
 +              Oid typeoid;
 +
 +              /* Parameters with no types are simply ignored */
 +              if (OidIsValid(param_types[cnt_params]))
 +                      typeoid = param_types[cnt_params];
 +              else
 +                      typeoid = INT4OID;
 +
 +              paramTypes[cnt_params] = format_type_be(typeoid);
 +              paramTypeLen += strlen(paramTypes[cnt_params]) + 1;
 +      }
 +
 +      /* size + stmtLen + strlen + paramTypeLen */
 +      msgLen = 4 + stmtLen + strLen + paramTypeLen;
 +
 +      /* msgType + msgLen */
 +      if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
 +      {
 +              add_error_message(handle, "out of memory");
 +              return EOF;
 +      }
 +
 +      handle->outBuffer[handle->outEnd++] = 'P';
 +      /* size */
 +      msgLen = htonl(msgLen);
 +      memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
 +      handle->outEnd += 4;
 +      /* statement name */
 +      if (statement)
 +      {
 +              memcpy(handle->outBuffer + handle->outEnd, statement, stmtLen);
 +              handle->outEnd += stmtLen;
 +      }
 +      else
 +              handle->outBuffer[handle->outEnd++] = '\0';
 +      /* query */
 +      memcpy(handle->outBuffer + handle->outEnd, query, strLen);
 +      handle->outEnd += strLen;
 +      /* parameter types */
 +      Assert(sizeof(num_params) == 2);
 +      *((short *)(handle->outBuffer + handle->outEnd)) = htons(num_params);
 +      handle->outEnd += sizeof(num_params);
 +      /*
 +       * instead of parameter ids we should send parameter names (qualified by
 +       * schema name if required). The OIDs of types can be different on
 +       * Datanodes.
 +       */
 +      for (cnt_params = 0; cnt_params < num_params; cnt_params++)
 +      {
 +              memcpy(handle->outBuffer + handle->outEnd, paramTypes[cnt_params],
 +                                      strlen(paramTypes[cnt_params]) + 1);
 +              handle->outEnd += strlen(paramTypes[cnt_params]) + 1;
 +              pfree(paramTypes[cnt_params]);
 +      }
 +      pfree(paramTypes);
 +      Assert(old_outEnd + ntohl(msgLen) + 1 == handle->outEnd);
 +
 +      return 0;
 +}
 +
 +/*
 + * Send PLAN message down to the Data node
 + *
 + * 'p' is an XC-specific message carrying the statement name, the source
 + * query text, the serialized plan string and the parameter type names.
 + * statement, query and planstr must all be non-NULL.  Returns 0 on
 + * success, EOF if the connection is not idle or out of memory; the
 + * message is only buffered, not flushed.
 + */
 +int
 +pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement,
 +                                      const char *query, const char *planstr,
 +                                      short num_params, Oid *param_types)
 +{
 +      int                     stmtLen;
 +      int                     queryLen;
 +      int                     planLen;
 +      int             paramTypeLen;
 +      int                     msgLen;
 +      char      **paramTypes = (char **)palloc(sizeof(char *) * num_params);
 +      int                     i;
 +      short           tmp_num_params;
 +
 +      /* Invalid connection state, return error */
 +      if (handle->state != DN_CONNECTION_STATE_IDLE)
 +              return EOF;
 +
 +      /* statement name size (do not allow NULL) */
 +      stmtLen = strlen(statement) + 1;
 +      /* source query size (do not allow NULL) */
 +      queryLen = strlen(query) + 1;
 +      /* query plan size (do not allow NULL) */
 +      planLen = strlen(planstr) + 1;
 +      /* 2 bytes for number of parameters, preceding the type names */
 +      paramTypeLen = 2;
 +      /* find names of the types of parameters */
 +      for (i = 0; i < num_params; i++)
 +      {
 +              paramTypes[i] = format_type_be(param_types[i]);
 +              paramTypeLen += strlen(paramTypes[i]) + 1;
 +      }
 +      /* size + pnameLen + queryLen + parameters */
 +      msgLen = 4 + queryLen + stmtLen + planLen + paramTypeLen;
 +
 +      /* msgType + msgLen */
 +      if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
 +      {
 +              add_error_message(handle, "out of memory");
 +              return EOF;
 +      }
 +
 +      handle->outBuffer[handle->outEnd++] = 'p';
 +      /* size */
 +      msgLen = htonl(msgLen);
 +      memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
 +      handle->outEnd += 4;
 +      /* statement name */
 +      memcpy(handle->outBuffer + handle->outEnd, statement, stmtLen);
 +      handle->outEnd += stmtLen;
 +      /* source query */
 +      memcpy(handle->outBuffer + handle->outEnd, query, queryLen);
 +      handle->outEnd += queryLen;
 +      /* query plan */
 +      memcpy(handle->outBuffer + handle->outEnd, planstr, planLen);
 +      handle->outEnd += planLen;
 +      /* parameter types (int16 count via memcpy: buffer may be unaligned) */
 +      tmp_num_params = htons(num_params);
 +      memcpy(handle->outBuffer + handle->outEnd, &tmp_num_params, sizeof(tmp_num_params));
 +      handle->outEnd += sizeof(tmp_num_params);
 +      /*
 +       * instead of parameter ids we should send parameter names (qualified by
 +       * schema name if required). The OIDs of types can be different on
 +       * datanodes.
 +       */
 +      for (i = 0; i < num_params; i++)
 +      {
 +              int plen = strlen(paramTypes[i]) + 1;
 +              memcpy(handle->outBuffer + handle->outEnd, paramTypes[i], plen);
 +              handle->outEnd += plen;
 +              pfree(paramTypes[i]);
 +      }
 +      pfree(paramTypes);
 +
 +      /* mark an extended-query sequence as in progress on this connection */
 +      handle->in_extended_query = true;
 +      return 0;
 +}
 +
 +/*
 + * Send BIND message down to the Datanode
 + *
 + * portal and statement may be NULL (the unnamed portal/statement).
 + * params is a pre-formatted block of paramlen bytes -- presumably the
 + * int16 parameter count followed by the parameter values (confirm against
 + * callers); when paramlen is zero, two zero bytes (an empty count) are
 + * sent instead.  Returns 0 on success, EOF if the connection is not idle
 + * or out of memory; the message is only buffered, not flushed.
 + */
 +int
 +pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal,
 +                                      const char *statement, int paramlen, char *params)
 +{
 +      int                     pnameLen;
 +      int                     stmtLen;
 +      int             paramCodeLen;
 +      int             paramValueLen;
 +      int             paramOutLen;
 +      int                     msgLen;
 +
 +      /* Invalid connection state, return error */
 +      if (handle->state != DN_CONNECTION_STATE_IDLE)
 +              return EOF;
 +
 +      /* portal name size (allow NULL) */
 +      pnameLen = portal ? strlen(portal) + 1 : 1;
 +      /* statement name size (allow NULL) */
 +      stmtLen = statement ? strlen(statement) + 1 : 1;
 +      /* size of parameter codes array (always empty for now) */
 +      paramCodeLen = 2;
 +      /* size of parameter values array, 2 if no params */
 +      paramValueLen = paramlen ? paramlen : 2;
 +      /* size of output parameter codes array (always empty for now) */
 +      paramOutLen = 2;
 +      /* size + pnameLen + stmtLen + parameters */
 +      msgLen = 4 + pnameLen + stmtLen + paramCodeLen + paramValueLen + paramOutLen;
 +
 +      /* msgType + msgLen */
 +      if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
 +      {
 +              add_error_message(handle, "out of memory");
 +              return EOF;
 +      }
 +
 +      handle->outBuffer[handle->outEnd++] = 'B';
 +      /* size */
 +      msgLen = htonl(msgLen);
 +      memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
 +      handle->outEnd += 4;
 +      /* portal name */
 +      if (portal)
 +      {
 +              memcpy(handle->outBuffer + handle->outEnd, portal, pnameLen);
 +              handle->outEnd += pnameLen;
 +      }
 +      else
 +              handle->outBuffer[handle->outEnd++] = '\0';
 +      /* statement name */
 +      if (statement)
 +      {
 +              memcpy(handle->outBuffer + handle->outEnd, statement, stmtLen);
 +              handle->outEnd += stmtLen;
 +      }
 +      else
 +              handle->outBuffer[handle->outEnd++] = '\0';
 +      /* parameter codes (none) */
 +      handle->outBuffer[handle->outEnd++] = 0;
 +      handle->outBuffer[handle->outEnd++] = 0;
 +      /* parameter values */
 +      if (paramlen)
 +      {
 +              memcpy(handle->outBuffer + handle->outEnd, params, paramlen);
 +              handle->outEnd += paramlen;
 +      }
 +      else
 +      {
 +              /* no pre-formatted parameters: emit a zero int16 count */
 +              handle->outBuffer[handle->outEnd++] = 0;
 +              handle->outBuffer[handle->outEnd++] = 0;
 +      }
 +      /* output parameter codes (none) */
 +      handle->outBuffer[handle->outEnd++] = 0;
 +      handle->outBuffer[handle->outEnd++] = 0;
 +
 +      /* mark an extended-query sequence as in progress on this connection */
 +      handle->in_extended_query = true;
 +      return 0;
 +}
 +
 +
 +/*
 + * Send DESCRIBE message (portal or statement) down to the Datanode
 + */
 +int
 +pgxc_node_send_describe(PGXCNodeHandle * handle, bool is_statement,
 +                                              const char *name)
 +{
 +      int                     nameLen;
 +      int                     msgLen;
 +
 +      /* Invalid connection state, return error */
 +      if (handle->state != DN_CONNECTION_STATE_IDLE)
 +              return EOF;
 +
 +      /* statement or portal name size (allow NULL) */
 +      nameLen = name ? strlen(name) + 1 : 1;
 +
 +      /* size + statement/portal + name */
 +      msgLen = 4 + 1 + nameLen;
 +
 +      /* msgType + msgLen */
 +      if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
 +      {
 +              add_error_message(handle, "out of memory");
 +              return EOF;
 +      }
 +
 +      handle->outBuffer[handle->outEnd++] = 'D';
 +      /* size */
 +      msgLen = htonl(msgLen);
 +      memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
 +      handle->outEnd += 4;
 +      /* statement/portal flag */
 +      handle->outBuffer[handle->outEnd++] = is_statement ? 'S' : 'P';
 +      /* object name */
 +      if (name)
 +      {
 +              memcpy(handle->outBuffer + handle->outEnd, name, nameLen);
 +              handle->outEnd += nameLen;
 +      }
 +      else
 +              handle->outBuffer[handle->outEnd++] = '\0';
 +
 +      handle->in_extended_query = true;
 +      return 0;
 +}
 +
 +
 +/*
 + * Send CLOSE message (portal or statement) down to the Datanode
 + */
 +int
 +pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement,
 +                                       const char *name)
 +{
 +      /* statement or portal name size (allow NULL) */
 +      int                     nameLen = name ? strlen(name) + 1 : 1;
 +
 +      /* size + statement/portal + name */
 +      int                     msgLen = 4 + 1 + nameLen;
 +
 +      /* msgType + msgLen */
 +      if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
 +      {
 +              add_error_message(handle, "out of memory");
 +              return EOF;
 +      }
 +
 +      handle->outBuffer[handle->outEnd++] = 'C';
 +      /* size */
 +      msgLen = htonl(msgLen);
 +      memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
 +      handle->outEnd += 4;
 +      /* statement/portal flag */
 +      handle->outBuffer[handle->outEnd++] = is_statement ? 'S' : 'P';
 +      /* object name */
 +      if (name)
 +      {
 +              memcpy(handle->outBuffer + handle->outEnd, name, nameLen);
 +              handle->outEnd += nameLen;
 +      }
 +      else
 +              handle->outBuffer[handle->outEnd++] = '\0';
 +
 +      handle->in_extended_query = true;
 +      return 0;
 +}
 +
 +/*
 + * Send EXECUTE message down to the Datanode
 + */
 +int
 +pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, int fetch)
 +{
 +      /* portal name size (allow NULL) */
 +      int                     pnameLen = portal ? strlen(portal) + 1 : 1;
 +
 +      /* size + pnameLen + fetchLen */
 +      int                     msgLen = 4 + pnameLen + 4;
 +
 +      /* msgType + msgLen */
 +      if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
 +      {
 +              add_error_message(handle, "out of memory");
 +              return EOF;
 +      }
 +
 +      handle->outBuffer[handle->outEnd++] = 'E';
 +      /* size */
 +      msgLen = htonl(msgLen);
 +      memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
 +      handle->outEnd += 4;
 +      /* portal name */
 +      if (portal)
 +      {
 +              memcpy(handle->outBuffer + handle->outEnd, portal, pnameLen);
 +              handle->outEnd += pnameLen;
 +      }
 +      else
 +              handle->outBuffer[handle->outEnd++] = '\0';
 +
 +      /* fetch */
 +      fetch = htonl(fetch);
 +      memcpy(handle->outBuffer + handle->outEnd, &fetch, 4);
 +      handle->outEnd += 4;
 +
 +      PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY);
 +
 +      handle->in_extended_query = true;
 +      return 0;
 +}
 +
 +
 +/*
 + * Send FLUSH message down to the Datanode
 + */
 +int
 +pgxc_node_send_flush(PGXCNodeHandle * handle)
 +{
 +      /* size */
 +      int                     msgLen = 4;
 +
 +      /* msgType + msgLen */
 +      if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
 +      {
 +              add_error_message(handle, "out of memory");
 +              return EOF;
 +      }
 +
 +      handle->outBuffer[handle->outEnd++] = 'H';
 +      /* size */
 +      msgLen = htonl(msgLen);
 +      memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
 +      handle->outEnd += 4;
 +
 +      handle->in_extended_query = true;
 +      return pgxc_node_flush(handle);
 +}
 +
 +
 +/*
 + * Send SYNC message down to the Datanode
 + */
 +int
 +pgxc_node_send_sync(PGXCNodeHandle * handle)
 +{
 +      /* size */
 +      int                     msgLen = 4;
 +
 +      /* msgType + msgLen */
 +      if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
 +      {
 +              add_error_message(handle, "out of memory");
 +              return EOF;
 +      }
 +
 +      handle->outBuffer[handle->outEnd++] = 'S';
 +      /* size */
 +      msgLen = htonl(msgLen);
 +      memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
 +      handle->outEnd += 4;
 +
 +      handle->in_extended_query = false;
 +      handle->needSync = false;
 +
 +      return pgxc_node_flush(handle);
 +}
 +
 +
 +/*
 + * Send series of Extended Query protocol messages to the data node
 + */
 +int
 +pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query,
 +                                                        const char *statement, const char *portal,
 +                                                        int num_params, Oid *param_types,
 +                                                        int paramlen, char *params,
 +                                                        bool send_describe, int fetch_size)
 +{
 +      /* NULL query indicates already prepared statement */
 +      if (query)
 +              if (pgxc_node_send_parse(handle, statement, query, num_params, param_types))
 +                      return EOF;
 +      if (pgxc_node_send_bind(handle, portal, statement, paramlen, params))
 +              return EOF;
 +      if (send_describe)
 +              if (pgxc_node_send_describe(handle, false, portal))
 +                      return EOF;
 +      if (fetch_size >= 0)
 +              if (pgxc_node_send_execute(handle, portal, fetch_size))
 +                      return EOF;
 +      if (pgxc_node_send_flush(handle))
 +              return EOF;
 +
 +      return 0;
 +}
 +
 +
 +/*
 + * This method won't return until connection buffer is empty or error occurs
 + * To ensure all data are on the wire before waiting for response
 + */
 +int
 +pgxc_node_flush(PGXCNodeHandle *handle)
 +{
 +      while (handle->outEnd)
 +      {
 +              if (send_some(handle, handle->outEnd) < 0)
 +              {
 +                      add_error_message(handle, "failed to send data to datanode");
 +
 +                      /*
 +                       * before returning, also update the shared health
 +                       * status field to indicate that this node could be
 +                       * possibly unavailable.
 +                       *
 +                       * Note that this error could be due to a stale handle
 +                       * and it's possible that another backend might have
 +                       * already updated the health status OR the node
 +                       * might have already come back since the last disruption
 +                       */
 +                      PoolPingNodeRecheck(handle->nodeoid);
 +                      return EOF;
 +              }
 +      }
 +      return 0;
 +}
 +
 +/*
 + * This method won't return until network buffer is empty or error occurs
 + * To ensure all data in network buffers is read and wasted
 + */
 +void
 +pgxc_node_flush_read(PGXCNodeHandle *handle)
 +{
 +      bool    is_ready;
 +      int     read_result;
 +
 +      if (handle == NULL)
 +              return;
 +
 +      /*
 +       * Before reading input send Sync to make sure
 +       * we will eventually receive ReadyForQuery
 +       */
 +      pgxc_node_send_sync(handle);
 +      while(true)
 +      {
 +              read_result = pgxc_node_read_data(handle, false);
 +              if (read_result < 0)
 +                      break;
 +
 +              is_ready = is_data_node_ready(handle);
 +              if (is_ready == true)
 +                      break;
 +
 +      }
 +}
 +
 +/*
 + * Send specified statement down to the PGXC node
 + *
 + * Uses the simple-query protocol ('Q' message).  When 'rollback' is true
 + * the statement may also be sent on a connection in the
 + * DN_CONNECTION_STATE_ERROR_FATAL state; otherwise the connection must
 + * be idle.  Returns EOF on a bad connection state, out of memory or a
 + * send failure; on success the message is flushed to the wire.
 + */
 +static int
 +pgxc_node_send_query_internal(PGXCNodeHandle * handle, const char *query,
 +              bool rollback)
 +{
 +      int                     strLen;
 +      int                     msgLen;
 +
 +      /*
 +       * Its appropriate to send ROLLBACK commands on a failed connection, but
 +       * for everything else we expect the connection to be in a sane state
 +       */
 +      elog(DEBUG5, "pgxc_node_send_query - handle->state %d, node %s, query %s",
 +                      handle->state, handle->nodename, query);
 +      if ((handle->state != DN_CONNECTION_STATE_IDLE) &&
 +              !(handle->state == DN_CONNECTION_STATE_ERROR_FATAL && rollback))
 +              return EOF;
 +
 +      strLen = strlen(query) + 1;
 +      /* size + strlen */
 +      msgLen = 4 + strLen;
 +
 +      /* msgType + msgLen */
 +      if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
 +      {
 +              add_error_message(handle, "out of memory");
 +              return EOF;
 +      }
 +
 +      handle->outBuffer[handle->outEnd++] = 'Q';
 +      msgLen = htonl(msgLen);
 +      memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
 +      handle->outEnd += 4;
 +      memcpy(handle->outBuffer + handle->outEnd, query, strLen);
 +      handle->outEnd += strLen;
 +
 +      PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY);
 +
 +      /* simple query protocol: not inside an extended-query sequence */
 +      handle->in_extended_query = false;
 +      return pgxc_node_flush(handle);
 +}
 +
 +/*
 + * Send a ROLLBACK-style command using the simple query protocol.  Unlike
 + * pgxc_node_send_query, this is also permitted on a connection in the
 + * DN_CONNECTION_STATE_ERROR_FATAL state.
 + */
 +int
 +pgxc_node_send_rollback(PGXCNodeHandle *handle, const char *query)
 +{
 +      return pgxc_node_send_query_internal(handle, query, true);
 +}
 +
 +/*
 + * Send a query using the simple query protocol.  The connection must be
 + * in the DN_CONNECTION_STATE_IDLE state.
 + */
 +int
 +pgxc_node_send_query(PGXCNodeHandle *handle, const char *query)
 +{
 +      return pgxc_node_send_query_internal(handle, query, false);
 +}
 +
 +
 +/*
 + * Send the GXID down to the PGXC node
 + *
 + * Emits an XC-specific 'g' message: a 4-byte length word followed by the
 + * TransactionId.  The connection must be idle.  Returns 0 on success,
 + * EOF on error.  The message is only buffered, not flushed.
 + */
 +int
 +pgxc_node_send_gxid(PGXCNodeHandle *handle, GlobalTransactionId gxid)
 +{
 +      int                     msglen = 8;
 +
 +      /* Invalid connection state, return error */
 +      if (handle->state != DN_CONNECTION_STATE_IDLE)
 +              return EOF;
 +
 +      /* msgType + msgLen */
 +      if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
 +      {
 +              add_error_message(handle, "out of memory");
 +              return EOF;
 +      }
 +
 +      handle->outBuffer[handle->outEnd++] = 'g';
 +      msglen = htonl(msglen);
 +      memcpy(handle->outBuffer + handle->outEnd, &msglen, 4);
 +      handle->outEnd += 4;
 +      /*
 +       * NOTE(review): the gxid is copied in host byte order while the
 +       * length word above uses htonl -- presumably the receiving node is
 +       * assumed to share endianness; confirm against the reading side.
 +       */
 +      memcpy(handle->outBuffer + handle->outEnd, &gxid, sizeof
 +                      (TransactionId));
 +      handle->outEnd += sizeof (TransactionId);
 +
 +      return 0;
 +}
 +
 +/*
 + * Send the Command ID down to the PGXC node
 + *
 + * Emits an XC-specific 'M' message carrying the CommandId as a 4-byte
 + * network-order integer.  A silent no-op (returns 0) when command-ID
 + * sending is disabled; returns EOF if the connection is not idle or out
 + * of memory.  The message is only buffered, not flushed.
 + */
 +int
 +pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid)
 +{
 +      /* NOTE(review): assumed to be 8 (length word + cid); defined elsewhere */
 +      int                     msglen = CMD_ID_MSG_LEN;
 +      int                     i32;
 +
 +      /* No need to send command ID if its sending flag is not enabled */
 +      if (!IsSendCommandId())
 +              return 0;
 +
 +      /* Invalid connection state, return error */
 +      if (handle->state != DN_CONNECTION_STATE_IDLE)
 +              return EOF;
 +
 +      /* msgType + msgLen */
 +      if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
 +      {
 +              add_error_message(handle, "out of memory");
 +              return EOF;
 +      }
 +
 +      handle->outBuffer[handle->outEnd++] = 'M';
 +      msglen = htonl(msglen);
 +      memcpy(handle->outBuffer + handle->outEnd, &msglen, 4);
 +      handle->outEnd += 4;
 +      i32 = htonl(cid);
 +      memcpy(handle->outBuffer + handle->outEnd, &i32, 4);
 +      handle->outEnd += 4;
 +
 +      return 0;
 +}
 +
/*
 * pgxc_node_send_snapshot
 *		Send an MVCC snapshot down to the PGXC node.
 *
 * Wire format: 's' | int32 length | xmin | xmax | RecentGlobalXmin |
 * int32 xcnt | xcnt TransactionIds (the running-xid array).
 * Returns 0 on success, EOF on an invalid connection state or
 * out-of-memory.
 */
int
pgxc_node_send_snapshot(PGXCNodeHandle *handle, Snapshot snapshot)
{
	int			msglen;
	int			nval;
	int			i;

	/* Invalid connection state, return error */
	if (handle->state != DN_CONNECTION_STATE_IDLE)
		return EOF;

	/*
	 * Calculate message length: 4 (length word) + 4 * 3 (xmin, xmax,
	 * RecentGlobalXmin) + 4 (xcnt), plus 4 bytes per in-progress xid.
	 */
	msglen = 20;
	if (snapshot->xcnt > 0)
		msglen += snapshot->xcnt * 4;

	/* msgType + msgLen */
	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
	{
		add_error_message(handle, "out of memory");
		return EOF;
	}

	handle->outBuffer[handle->outEnd++] = 's';
	msglen = htonl(msglen);
	memcpy(handle->outBuffer + handle->outEnd, &msglen, 4);
	handle->outEnd += 4;

	/*
	 * NOTE(review): xmin/xmax/xip are copied in host byte order while the
	 * length word and xcnt go through htonl().  Presumably the receiving
	 * node runs on same-endian hardware — confirm the peer's decoder.
	 */
	memcpy(handle->outBuffer + handle->outEnd, &snapshot->xmin, sizeof (TransactionId));
	handle->outEnd += sizeof (TransactionId);

	memcpy(handle->outBuffer + handle->outEnd, &snapshot->xmax, sizeof (TransactionId));
	handle->outEnd += sizeof (TransactionId);

	/* Ship this backend's global-xmin horizon alongside the snapshot */
	memcpy(handle->outBuffer + handle->outEnd, &RecentGlobalXmin, sizeof (TransactionId));
	handle->outEnd += sizeof (TransactionId);

	/* Number of in-progress xids that follow, network order */
	nval = htonl(snapshot->xcnt);
	memcpy(handle->outBuffer + handle->outEnd, &nval, 4);
	handle->outEnd += 4;

	/* Copy the in-progress xid array element by element */
	for (i = 0; i < snapshot->xcnt; i++)
	{
		memcpy(handle->outBuffer + handle->outEnd, &snapshot->xip[i], sizeof
				(TransactionId));
		handle->outEnd += sizeof (TransactionId);
	}

	return 0;
}
 +
 +/*
 + * Send the timestamp down to the PGXC node
 + */
 +int
 +pgxc_node_send_timestamp(PGXCNodeHandle *handle, TimestampTz timestamp)
 +{
 +      int             msglen = 12; /* 4 bytes for msglen and 8 bytes for timestamp (int64) */
 +      uint32  n32;
 +      int64   i = (int64) timestamp;
 +
 +      /* Invalid connection state, return error */
 +      if (handle->state != DN_CONNECTION_STATE_IDLE)
 +              return EOF;
 +
 +      /* msgType + msgLen */
 +      if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
 +      {
 +              add_error_message(handle, "out of memory");
 +              return EOF;
 +      }
 +      handle->outBuffer[handle->outEnd++] = 't';
 +      msglen = htonl(msglen);
 +      memcpy(handle->outBuffer + handle->outEnd, &msglen, 4);
 +      handle->outEnd += 4;
 +
 +      /* High order half first */
 +#ifdef INT64_IS_BUSTED
 +      /* don't try a right shift of 32 on a 32-bit word */
 +      n32 = (i < 0) ? -1 : 0;
 +#else
 +      n32 = (uint32) (i >> 32);
 +#endif
 +      n32 = htonl(n32);
 +      memcpy(handle->outBuffer + handle->outEnd, &n32, 4);
 +      handle->outEnd += 4;
 +
 +      /* Now the low order half */
 +      n32 = (uint32) i;
 +      n32 = htonl(n32);
 +      memcpy(handle->outBuffer + handle->outEnd, &n32, 4);
 +      handle->outEnd += 4;
 +
 +      return 0;
 +}
 +
 +
 +/*
 + * Add another message to the list of errors to be returned back to the client
 + * at the convenient time
 + */
 +void
 +add_error_message(PGXCNodeHandle *handle, const char *message)
 +{
 +      elog(LOG, "Remote node \"%s\", running with pid %d returned an error: %s",
 +                      handle->nodename, handle->backend_pid, message);
 +      handle->transaction_status = 'E';
 +      if (handle->error)
 +      {
 +              /* PGXCTODO append */
 +      }
 +      else
 +              handle->error = pstrdup(message);
 +}
 +
/* Round-robin cursor into the datanode handle array; persists across calls */
static int load_balancer = 0;
/*
 * Get one of the specified nodes to query replicated data source.
 * If session already owns one or more  of the requested connection,
 * the function returns existing one to avoid contacting pooler.
 * Performs basic load balancing.
 *
 * Two passes over the datanode slots, both starting at load_balancer:
 * first prefer an already-connected handle from the requested list;
 * otherwise acquire a single pooled connection for the first requested
 * node that has a free slot.  Errors out (never returns NULL in
 * practice) if the pool cannot supply a connection.
 */
PGXCNodeHandle *
get_any_handle(List *datanodelist)
{
	ListCell   *lc1;
	int			i, node;

	/* sanity check */
	Assert(list_length(datanodelist) > 0);

	/* Apply any pending cluster-configuration invalidation first */
	if (HandlesInvalidatePending)
		if (DoInvalidateRemoteHandles())
			ereport(ERROR,
					(errcode(ERRCODE_QUERY_CANCELED),
					 errmsg("canceling transaction due to cluster configuration reset by administrator command")));

	if (HandlesRefreshPending)
		if (DoRefreshRemoteHandles())
			ereport(ERROR,
					(errcode(ERRCODE_QUERY_CANCELED),
					 errmsg("canceling transaction due to cluster configuration reset by administrator command")));

	/* loop through local datanode handles */
	for (i = 0, node = load_balancer; i < NumDataNodes; i++, node++)
	{
		/* At the moment node is an index in the array, and we may need to wrap it */
		if (node >= NumDataNodes)
			node -= NumDataNodes;
		/* See if handle is already used */
		if (dn_handles[node].sock != NO_SOCKET)
		{
			foreach(lc1, datanodelist)
			{
				if (lfirst_int(lc1) == node)
				{
					/*
					 * The node is in the list of requested nodes,
					 * set load_balancer for next time and return the handle
					 */
					load_balancer = node + 1;
					return &dn_handles[node];
				}
			}
		}
	}

	/*
	 * None of requested nodes is in use, need to get one from the pool.
	 * Choose one.
	 */
	for (i = 0, node = load_balancer; i < NumDataNodes; i++, node++)
	{
		/* At the moment node is an index in the array, and we may need to wrap it */
		if (node >= NumDataNodes)
			node -= NumDataNodes;
		/* Look only at empty slots, we have already checked existing handles */
		if (dn_handles[node].sock == NO_SOCKET)
		{
			foreach(lc1, datanodelist)
			{
				if (lfirst_int(lc1) == node)
				{
					/* The node is requested */
					List   *allocate = list_make1_int(node);
					int        *pids;
					int    *fds = PoolManagerGetConnections(allocate, NIL,
							&pids);
					PGXCNodeHandle		*node_handle;

					if (!fds)
					{
						Assert(pids != NULL);
						ereport(ERROR,
								(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
								 errmsg("Failed to get pooled connections"),
								 errhint("This may happen because one or more nodes are "
									 "currently unreachable, either because of node or "
									 "network failure.\n Its also possible that the target node "
									 "may have hit the connection limit or the pooler is "
									 "configured with low connections.\n Please check "
									 "if all nodes are running fine and also review "
									 "max_connections and max_pool_size configuration "
									 "parameters")));
					}
					/*
					 * NOTE(review): fds/pids returned by the pooler are not
					 * pfree'd here (get_handles frees fds).  Presumably they
					 * live in a short-lived context — confirm.
					 */
					node_handle = &dn_handles[node];
					pgxc_node_init(node_handle, fds[0], true, pids[0]);
					datanode_count++;

					elog(DEBUG1, "Established a connection with datanode \"%s\","
							"remote backend PID %d, socket fd %d, global session %c",
							node_handle->nodename, (int) pids[0], fds[0], 'T');

					/*
					 * set load_balancer for next time and return the handle
					 */
					load_balancer = node + 1;
					return &dn_handles[node];
				}
			}
		}
	}

	/* We should not get here, one of the cases should be met */
	Assert(false);
	/* Keep compiler quiet */
	return NULL;
}
 +
/*
 * for specified list return array of PGXCNodeHandles
 * acquire from pool if needed.
 * the lenth of returned array is the same as of nodelist
 * For Datanodes, Special case is empty or NIL nodeList, in this case return all the nodes.
 * The returned list should be pfree'd when no longer needed.
 * For Coordinator, do not get a connection if Coordinator list is NIL,
 * Coordinator fds is returned only if transaction uses a DDL
 *
 * Flow: (1) honor pending invalidation/refresh requests, (2) build the
 * datanode and coordinator handle arrays, remembering in dn_allocate /
 * co_allocate which slots still need a pooled connection, (3) fetch all
 * missing connections from the pooler in one call and initialize the
 * corresponding handles.
 */
PGXCNodeAllHandles *
get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool is_global_session)
{
	PGXCNodeAllHandles	*result;
	ListCell		*node_list_item;
	List			*dn_allocate = NIL;	/* datanode indexes needing a pooled connection */
	List			*co_allocate = NIL;	/* coordinator indexes needing a pooled connection */
	PGXCNodeHandle		*node_handle;

	/* index of the result array */
	int			i = 0;

	/* Apply any pending cluster-configuration invalidation first */
	if (HandlesInvalidatePending)
		if (DoInvalidateRemoteHandles())
			ereport(ERROR,
					(errcode(ERRCODE_QUERY_CANCELED),
					 errmsg("canceling transaction due to cluster configuration reset by administrator command")));

	if (HandlesRefreshPending)
		if (DoRefreshRemoteHandles())
			ereport(ERROR,
					(errcode(ERRCODE_QUERY_CANCELED),
					 errmsg("canceling transaction due to cluster configuration reset by administrator command")));

	result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles));
	if (!result)
	{
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	}

	result->primary_handle = NULL;
	result->datanode_handles = NULL;
	result->coord_handles = NULL;
	result->co_conn_count = list_length(coordlist);
	result->dn_conn_count = list_length(datanodelist);

	/*
	 * Get Handles for Datanodes
	 * If node list is empty execute request on current nodes.
	 * It is also possible that the query has to be launched only on Coordinators.
	 */
	if (!is_coord_only_query)
	{
		if (list_length(datanodelist) == 0)
		{
			/*
			 * We do not have to zero the array - on success all items will be set
			 * to correct pointers, on error the array will be freed
			 */
			result->datanode_handles = (PGXCNodeHandle **)
									   palloc(NumDataNodes * sizeof(PGXCNodeHandle *));
			if (!result->datanode_handles)
			{
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("out of memory")));
			}

			/* Empty list means: use every datanode */
			for (i = 0; i < NumDataNodes; i++)
			{
				node_handle = &dn_handles[i];
				result->datanode_handles[i] = node_handle;
				if (node_handle->sock == NO_SOCKET)
					dn_allocate = lappend_int(dn_allocate, i);
			}
		}
		else
		{
			/*
			 * We do not have to zero the array - on success all items will be set
			 * to correct pointers, on error the array will be freed
			 */

			result->datanode_handles = (PGXCNodeHandle **)
				palloc(list_length(datanodelist) * sizeof(PGXCNodeHandle *));
			if (!result->datanode_handles)
			{
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("out of memory")));
			}

			i = 0;
			foreach(node_list_item, datanodelist)
			{
				int	node = lfirst_int(node_list_item);

				/*
				 * NOTE(review): ERRCODE_OUT_OF_MEMORY is a misleading
				 * errcode for an invalid node index.
				 */
				if (node < 0 || node >= NumDataNodes)
				{
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							errmsg("Invalid Datanode number")));
				}

				node_handle = &dn_handles[node];
				result->datanode_handles[i++] = node_handle;
				if (node_handle->sock == NO_SOCKET)
					dn_allocate = lappend_int(dn_allocate, node);
			}
		}
	}

	/*
	 * Get Handles for Coordinators
	 * If node list is empty execute request on current nodes
	 * There are transactions where the Coordinator list is NULL Ex:COPY
	 */

	if (coordlist)
	{
		if (list_length(coordlist) == 0)
		{
			/*
			 * We do not have to zero the array - on success all items will be set
			 * to correct pointers, on error the array will be freed
			 */
			result->coord_handles = (PGXCNodeHandle **)palloc(NumCoords * sizeof(PGXCNodeHandle *));
			if (!result->coord_handles)
			{
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("out of memory")));
			}

			/* Empty (but non-NIL) list means: use every coordinator */
			for (i = 0; i < NumCoords; i++)
			{
				node_handle = &co_handles[i];
				result->coord_handles[i] = node_handle;
				if (node_handle->sock == NO_SOCKET)
					co_allocate = lappend_int(co_allocate, i);
			}
		}
		else
		{
			/*
			 * We do not have to zero the array - on success all items will be set
			 * to correct pointers, on error the array will be freed
			 */
			result->coord_handles = (PGXCNodeHandle **)
									palloc(list_length(coordlist) * sizeof(PGXCNodeHandle *));
			if (!result->coord_handles)
			{
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("out of memory")));
			}

			i = 0;
			/* Some transactions do not need Coordinators, ex: COPY */
			foreach(node_list_item, coordlist)
			{
				int			node = lfirst_int(node_list_item);

				if (node < 0 || node >= NumCoords)
				{
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							errmsg("Invalid coordinator number")));
				}

				node_handle = &co_handles[node];

				result->coord_handles[i++] = node_handle;
				if (node_handle->sock == NO_SOCKET)
					co_allocate = lappend_int(co_allocate, node);
			}
		}
	}

	/*
	 * Pooler can get activated even if list of Coordinator or Datanode is NULL
	 * If both lists are NIL, we don't need to call Pooler.
	 */
	if (dn_allocate || co_allocate)
	{
		int	j = 0;		/* walks fds/pids across both allocate lists */
		int *pids;
		int	*fds = PoolManagerGetConnections(dn_allocate, co_allocate, &pids);

		if (!fds)
		{
			/* Clean up partial state before reporting the failure */
			if (coordlist)
				if (result->coord_handles)
					pfree(result->coord_handles);
			if (datanodelist)
				if (result->datanode_handles)
					pfree(result->datanode_handles);

			pfree(result);
			if (dn_allocate)
				list_free(dn_allocate);
			if (co_allocate)
				list_free(co_allocate);
			ereport(ERROR,
					(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
					 errmsg("Failed to get pooled connections"),
					 errhint("This may happen because one or more nodes are "
						 "currently unreachable, either because of node or "
						 "network failure.\n Its also possible that the target node "
						 "may have hit the connection limit or the pooler is "
						 "configured with low connections.\n Please check "
						 "if all nodes are running fine and also review "
						 "max_connections and max_pool_size configuration "
						 "parameters")));
		}
		/* Initialisation for Datanodes */
		if (dn_allocate)
		{
			foreach(node_list_item, dn_allocate)
			{
				int			node = lfirst_int(node_list_item);
				int			fdsock = fds[j];
				int			be_pid = pids[j++];

				if (node < 0 || node >= NumDataNodes)
				{
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							errmsg("Invalid Datanode number")));
				}

				node_handle = &dn_handles[node];
				pgxc_node_init(node_handle, fdsock, is_global_session, be_pid);
				/*
				 * NOTE(review): node_handle already points at
				 * dn_handles[node], so this struct copy is a no-op.
				 */
				dn_handles[node] = *node_handle;
				datanode_count++;

				elog(DEBUG1, "Established a connection with datanode \"%s\","
						"remote backend PID %d, socket fd %d, global session %c",
						node_handle->nodename, (int) be_pid, fdsock,
						is_global_session ? 'T' : 'F');
			}
		}
		/* Initialisation for Coordinators */
		if (co_allocate)
		{
			foreach(node_list_item, co_allocate)
			{
				int			node = lfirst_int(node_list_item);
				int			be_pid = pids[j];
				int			fdsock = fds[j++];

				if (node < 0 || node >= NumCoords)
				{
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							errmsg("Invalid coordinator number")));
				}

				node_handle = &co_handles[node];
				pgxc_node_init(node_handle, fdsock, is_global_session, be_pid);
				/* NOTE(review): same no-op self-assignment as above */
				co_handles[node] = *node_handle;
				coord_count++;

				elog(DEBUG1, "Established a connection with coordinator \"%s\","
						"remote backend PID %d, socket fd %d, global session %c",
						node_handle->nodename, (int) be_pid, fdsock,
						is_global_session ? 'T' : 'F');
			}
		}

		/* NOTE(review): pids is not pfree'd here, only fds — confirm ownership */
		pfree(fds);

		if (co_allocate)
			list_free(co_allocate);
		if (dn_allocate)
			list_free(dn_allocate);
	}

	return result;
}
 +
 +PGXCNodeAllHandles *
 +get_current_handles(void)
 +{
 +      PGXCNodeAllHandles *result;
 +      PGXCNodeHandle     *node_handle;
 +      int                                     i;
 +
 +      result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles));
 +      if (!result)
 +      {
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_OUT_OF_MEMORY),
 +                               errmsg("out of memory")));
 +      }
 +
 +      result->primary_handle = NULL;
 +      result->co_conn_count = 0;
 +      result->dn_conn_count = 0;
 +
 +      result->datanode_handles = (PGXCNodeHandle **)
 +                                                         palloc(NumDataNodes * sizeof(PGXCNodeHandle *));
 +      if (!result->datanode_handles)
 +      {
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_OUT_OF_MEMORY),
 +                               errmsg("out of memory")));
 +      }
 +
 +      for (i = 0; i < NumDataNodes; i++)
 +      {
 +              node_handle = &dn_handles[i];
 +              if (node_handle->sock != NO_SOCKET)
 +                      result->datanode_handles[result->dn_conn_count++] = node_handle;
 +      }
 +
 +      result->coord_handles = (PGXCNodeHandle **)
 +                                                      palloc(NumCoords * sizeof(PGXCNodeHandle *));
 +      if (!result->coord_handles)
 +      {
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_OUT_OF_MEMORY),
 +                               errmsg("out of memory")));
 +      }
 +
 +      for (i = 0; i < NumCoords; i++)
 +      {
 +              node_handle = &co_handles[i];
 +              if (node_handle->sock != NO_SOCKET)
 +                      result->coord_handles[result->co_conn_count++] = node_handle;
 +      }
 +
 +      return result;
 +}
 +
 +/* Free PGXCNodeAllHandles structure */
 +void
 +pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles)
 +{
 +      if (!pgxc_handles)
 +              return;
 +
 +      if (pgxc_handles->primary_handle)
 +              pfree(pgxc_handles->primary_handle);
 +      if (pgxc_handles->datanode_handles)
 +              pfree(pgxc_handles->datanode_handles);
 +      if (pgxc_handles->coord_handles)
 +              pfree(pgxc_handles->coord_handles);
 +
 +      pfree(pgxc_handles);
 +}
 +
 +/*
 + * PGXCNodeGetNodeId
 + *            Look at the data cached for handles and return node position
 + *            If node type is PGXC_NODE_COORDINATOR look only in coordinator list,
 + *            if node type is PGXC_NODE_DATANODE look only in datanode list,
 + *            if other (assume PGXC_NODE_NODE) search both, in last case return actual
 + *            node type.
 + */
 +int
 +PGXCNodeGetNodeId(Oid nodeoid, char *node_type)
 +{
 +      int i;
 +
 +      /* First check datanodes, they referenced more often */
 +      if (node_type == NULL || *node_type != PGXC_NODE_COORDINATOR)
 +      {
 +              for (i = 0; i < NumDataNodes; i++)
 +              {
 +                      if (dn_handles[i].nodeoid == nodeoid)
 +                      {
 +                              if (node_type)
 +                                      *node_type = PGXC_NODE_DATANODE;
 +                              return i;
 +                      }
 +              }
 +      }
 +      /* Then check coordinators */
 +      if (node_type == NULL || *node_type != PGXC_NODE_DATANODE)
 +      {
 +              for (i = 0; i < NumCoords; i++)
 +              {
 +                      if (co_handles[i].nodeoid == nodeoid)
 +                      {
 +                              if (node_type)
 +                                      *node_type = PGXC_NODE_COORDINATOR;
 +                              return i;
 +                      }
 +              }
 +      }
 +      /* Not found, have caller handling it */
 +      if (node_type)
 +              *node_type = PGXC_NODE_NONE;
 +      return -1;
 +}
 +
 +/*
 + * PGXCNodeGetNodeOid
 + *            Look at the data cached for handles and return node Oid
 + */
 +Oid
 +PGXCNodeGetNodeOid(int nodeid, char node_type)
 +{
 +      PGXCNodeHandle *handles;
 +
 +      switch (node_type)
 +      {
 +              case PGXC_NODE_COORDINATOR:
 +                      handles = co_handles;
 +                      break;
 +              case PGXC_NODE_DATANODE:
 +                      handles = dn_handles;
 +                      break;
 +              default:
 +                      /* Should not happen */
 +                      Assert(0);
 +                      return InvalidOid;
 +      }
 +
 +      return handles[nodeid].nodeoid;
 +}
 +
/*
 * pgxc_node_str
 *		SQL-callable function returning this node's name (PGXCNodeName)
 *		as a text datum.
 */
Datum
pgxc_node_str(PG_FUNCTION_ARGS)
{
	PG_RETURN_TEXT_P(cstring_to_text(PGXCNodeName));
}
 +
 +/*
 + * PGXCNodeGetNodeIdFromName
 + *            Return node position in handles array
 + */
 +int
 +PGXCNodeGetNodeIdFromName(char *node_name, char *node_type)
 +{
 +      char *nm;
 +      Oid nodeoid;
 +
 +      if (node_name == NULL)
 +      {
 +              if (node_type)
 +                      *node_type = PGXC_NODE_NONE;
 +              return -1;
 +      }
 +
 +      nm = str_tolower(node_name, strlen(node_name), DEFAULT_COLLATION_OID);
 +
 +      nodeoid = get_pgxc_nodeoid(nm);
 +      pfree(nm);
 +      if (!OidIsValid(nodeoid))
 +      {
 +              if (node_type)
 +                      *node_type = PGXC_NODE_NONE;
 +              return -1;
 +      }
 +
 +      return PGXCNodeGetNodeId(nodeoid, node_type);
 +}
 +
/*
 * paramlist_delete_param
 *		Remove (and pfree) every ParamEntry with the given name from the
 *		list, returning the possibly-modified list head.
 *
 * Uses the pre-v13 List API delete-while-iterating pattern: prev_item
 * trails cur_item so list_delete_cell can unlink in O(1); after a
 * deletion iteration resumes from prev_item's successor (or the new
 * head when the first cell was removed).
 */
static List *
paramlist_delete_param(List *param_list, const char *name)
{
	ListCell   *cur_item;
	ListCell   *prev_item;

	prev_item = NULL;
	cur_item = list_head(param_list);

	while (cur_item != NULL)
	{
		ParamEntry *entry = (ParamEntry *) lfirst(cur_item);

		if (strcmp(NameStr(entry->name), name) == 0)
		{
			/* cur_item must be removed */
			param_list = list_delete_cell(param_list, cur_item, prev_item);
			pfree(entry);
			/* Resume scanning after the deleted cell */
			if (prev_item)
				cur_item = lnext(prev_item);
			else
				cur_item = list_head(param_list);
		}
		else
		{
			prev_item = cur_item;
			cur_item = lnext(prev_item);
		}
	}

	return param_list;
}
 +
/*
 * Remember new value of a session or transaction parameter, and set same
 * values on newly connected remote nodes.
 *
 * local selects the transaction-scoped list (TopTransactionContext) vs
 * the session-scoped list (TopMemoryContext); the cached SET-command
 * string for that scope is invalidated so it gets rebuilt on next use.
 * A NULL value means the parameter is being reset (entry removed only).
 */
void
PGXCNodeSetParam(bool local, const char *name, const char *value, int flags)
{
	List *param_list;
	MemoryContext oldcontext;

	/* Get the target hash table and invalidate command string */
	if (local)
	{
		param_list = local_param_list;
		if (local_params)
			resetStringInfo(local_params);
		/* Transaction-scoped entries die with the transaction context */
		oldcontext = MemoryContextSwitchTo(TopTransactionContext);
	}
	else
	{
		param_list = session_param_list;
		if (session_params)
			resetStringInfo(session_params);
		/* Session-scoped entries must outlive the transaction */
		oldcontext = MemoryContextSwitchTo(TopMemoryContext);
	}

	/* Replace any existing entry for this name */
	param_list = paramlist_delete_param(param_list, name);
	if (value)
	{
		ParamEntry *entry;
		entry = (ParamEntry *) palloc(sizeof (ParamEntry));
		strlcpy((char *) (&entry->name), name, NAMEDATALEN);
		strlcpy((char *) (&entry->value), value, NAMEDATALEN);
		entry->flags = flags;

		param_list = lappend(param_list, entry);
	}

	/*
	 * Special case for
	 *	RESET SESSION AUTHORIZATION
	 *	SET SESSION AUTHORIZATION TO DEFAULT
	 *
	 * We must also forget any SET ROLE commands since RESET SESSION
	 * AUTHORIZATION also resets current role to session default
	 */
	if ((strcmp(name, "session_authorization") == 0) && (value == NULL))
		param_list = paramlist_delete_param(param_list, "role");

	/* Store the (possibly new) list head back into the right scope */
	if (local)
		local_param_list = param_list;
	else
		session_param_list = param_list;

	MemoryContextSwitchTo(oldcontext);
}
 +
 +
  +/*
  + * PGXCNodeResetParams
  + *      Forget parameter values set for the transaction, and also those set
  + +      for the session unless only_local is true.
  + */
  +void
  +PGXCNodeResetParams(bool only_local)
  +{
  +      if (!only_local && session_param_list)
  +      {
  +              /* need to explicitly pfree session stuff, it is in TopMemoryContext */
  +              list_free_deep(session_param_list);
  +              session_param_list = NIL;
  +              if (session_params)
  +              {
  +                      pfree(session_params->data);
  +                      pfree(session_params);
  +                      session_params = NULL;
  +              }
  +      }
  +      /*
  +       * No need to explicitly destroy local_param_list and local_params;
  +       * they will go away with the transaction memory context.
  +       */
  +      local_param_list = NIL;
  +      local_params = NULL;
  +}
 +
  +/*
  + * get_set_command
  + *      Append to "command" one "SET [LOCAL] <name> TO <value>;" clause for
  + *      every entry of param_list.  "local" selects the SET LOCAL form.
  + */
  +static void
  +get_set_command(List *param_list, StringInfo command, bool local)
  +{
  +      ListCell                   *lc;
  +
  +      if (param_list == NIL)
  +              return;
  +
  +      foreach (lc, param_list)
  +      {
  +              ParamEntry *entry = (ParamEntry *) lfirst(lc);
  +              char *value = NameStr(entry->value);
  +
  +              /*
  +               * Represent an empty value explicitly.  NOTE(review): this literal
  +               * is subsequently passed through quote_guc_value(); confirm it is
  +               * not quoted a second time.
  +               */
  +              if (strlen(value) == 0)
  +                      value = "''";
  +
  +              value = quote_guc_value(value, entry->flags);
  +
  +              appendStringInfo(command, "SET %s %s TO %s;", local ? "LOCAL" : "",
  +                       NameStr(entry->name), value);
  +      }
  +}
 +
 +
  +/*
  + * PGXCNodeGetSessionParamStr
  + *      Return the SET commands needed to initialize a remote session, or
  + *      NULL if there is nothing to send.
  + *      The command string may already be built and valid; if so, return it
  + *      right away.  Otherwise build it up.
  + *      To support the Distributed Session machinery the coordinator should
  + *      generate and send a distributed session identifier to remote nodes.
  + *      Generate it here.
  + */
  +char *
  +PGXCNodeGetSessionParamStr(void)
  +{
  +      /*
  +       * If no session parameters are set and that is a coordinator we need to set
  +       * global_session anyway, even if there were no other parameters.
  +       * We do not want this string to disappear, so create it in the
  +       * TopMemoryContext. However if we add first session parameter we will need
  +       * to free the buffer and recreate it in the same context as the hash table
  +       * to avoid memory leakage.
  +       */
  +      if (session_params == NULL)
  +      {
  +              MemoryContext oldcontext = MemoryContextSwitchTo(TopMemoryContext);
  +              session_params = makeStringInfo();
  +              MemoryContextSwitchTo(oldcontext);
  +      }
  +
  +      /* If the cached paramstr is invalid (empty), build it up */
  +      if (session_params->len == 0)
  +      {
  +              if (IS_PGXC_COORDINATOR)
  +                      appendStringInfo(session_params, "SET global_session TO %s_%d;",
  +                                                       PGXCNodeName, MyProcPid);
  +              get_set_command(session_param_list, session_params, false);
  +              appendStringInfo(session_params, "SET parentPGXCPid TO %d;",
  +                                                       MyProcPid);
  +      }
  +      return session_params->len == 0 ? NULL : session_params->data;
  +}
 +
 +
  +/*
  + * PGXCNodeGetTransactionParamStr
  + *      Return the SET LOCAL commands needed to initialize a transaction on a
  + *      remote session, or NULL if no transaction-local parameters are set.
  + *      The command string may already be built and valid; if so, return it
  + *      right away.  Otherwise build it up.
  + */
  +char *
  +PGXCNodeGetTransactionParamStr(void)
  +{
  +      /* If no local parameters defined there is nothing to return */
  +      if (local_param_list == NIL)
  +              return NULL;
  +
  +      /*
  +       * If the paramstr buffer does not exist yet, create it in the
  +       * transaction context so it disappears at transaction end.
  +       */
  +      if (local_params == NULL)
  +      {
  +              MemoryContext oldcontext = MemoryContextSwitchTo(TopTransactionContext);
  +              local_params = makeStringInfo();
  +              MemoryContextSwitchTo(oldcontext);
  +      }
  +      /*
  +       * If parameter string exists it is valid, it is truncated when parameters
  +       * are modified.
  +       */
  +      if (local_params->len == 0)
  +      {
  +              get_set_command(local_param_list, local_params, true);
  +      }
  +      return local_params->len == 0 ? NULL : local_params->data;
  +}
 +
 +
  +/*
  + * pgxc_node_set_query
  + *      Send down the specified query, then read and discard all responses
  + *      until ReadyForQuery.  On ErrorResponse, or if shutdown is in
  + *      progress, the handle is marked DN_CONNECTION_STATE_ERROR_FATAL.
  + */
  +void
  +pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query)
  +{
  +      /* NOTE(review): the return value of pgxc_node_send_query is ignored here */
  +      pgxc_node_send_query(handle, set_query);
  +      /*
  +       * Now read responses until ReadyForQuery.
  +       * XXX We may need to handle possible errors here.
  +       */
  +      for (;;)
  +      {
  +              char    msgtype;
  +              int     msglen;
  +              char   *msg;
  +              /*
  +               * If we are in the process of shutting down, we
  +               * may be rolling back, and the buffer may contain other messages.
  +               * We want to avoid a procarray exception
  +               * as well as an error stack overflow.
  +               */
  +              if (proc_exit_inprogress)
  +                      PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
  +
  +              /* don't read from the connection if there is a fatal error */
  +              if (handle->state == DN_CONNECTION_STATE_ERROR_FATAL)
  +                      break;
  +
  +              /* No data available, read more */
  +              if (!HAS_MESSAGE_BUFFERED(handle))
  +              {
  +                      pgxc_node_receive(1, &handle, NULL);
  +                      continue;
  +              }
  +              msgtype = get_message(handle, &msglen, &msg);
  +
  +              /*
  +               * Ignore any response except ErrorResponse and ReadyForQuery
  +               */
  +
  +              if (msgtype == 'E')     /* ErrorResponse */
  +              {
  +                      handle->error = pstrdup(msg);
  +                      PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
  +                      break;
  +              }
  +
  +              if (msgtype == 'Z') /* ReadyForQuery */
  +              {
  +                      /* First payload byte is the transaction status indicator */
  +                      handle->transaction_status = msg[0];
  +                      PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_IDLE);
  +                      handle->combiner = NULL;
  +                      break;
  +              }
  +      }
  +}
 +
 +
  +/* Request that all remote handles be invalidated at the next service point */
  +void
  +RequestInvalidateRemoteHandles(void)
  +{
  +      HandlesInvalidatePending = true;
  +}
 +
  +/* Request that remote handles be refreshed at the next service point */
  +void
  +RequestRefreshRemoteHandles(void)
  +{
  +      HandlesRefreshPending = true;
  +}
 +
  +/*
  + * PoolerMessagesPending
  + *      Return true if pooler messages are waiting to be serviced.
  + *      NOTE(review): only HandlesRefreshPending is checked here, not
  + *      HandlesInvalidatePending -- confirm that is intended.
  + */
  +bool
  +PoolerMessagesPending(void)
  +{
  +      if (HandlesRefreshPending)
  +              return true;
  +
  +      return false;
  +}
 +
  +/*
  + * DoInvalidateRemoteHandles
  + *      For all handles, mark them as not in use and discard any pending
  + *      input/output, then rebuild the handle arrays.  Returns true if at
  + *      least one handle had an open socket (i.e. something was dropped).
  + */
  +static bool
  +DoInvalidateRemoteHandles(void)
  +{
  +      int                     i;
  +      PGXCNodeHandle *handle;
  +      bool                    result = false;
  +
  +      HandlesInvalidatePending = false;
  +      HandlesRefreshPending = false;
  +
  +      for (i = 0; i < NumCoords; i++)
  +      {
  +              handle = &co_handles[i];
  +              if (handle->sock != NO_SOCKET)
  +                      result = true;
  +              handle->sock = NO_SOCKET;
  +              handle->inStart = handle->inEnd = handle->inCursor = 0;
  +              handle->outEnd = 0;
  +      }
  +      for (i = 0; i < NumDataNodes; i++)
  +      {
  +              handle = &dn_handles[i];
  +              if (handle->sock != NO_SOCKET)
  +                      result = true;
  +              handle->sock = NO_SOCKET;
  +              handle->inStart = handle->inEnd = handle->inCursor = 0;
  +              handle->outEnd = 0;
  +      }
  +
  +      /* Rebuild the executor's node handle arrays from scratch */
  +      InitMultinodeExecutor(true);
  +
  +      return result;
  +}
 +
  +/*
  + * DoRefreshRemoteHandles
  + *      Diff local backend handles against the node table in shared memory,
  + *      and refresh the handles of ALTERed nodes in place.  Returns false if
  + *      nodes were added or deleted (a full reload is needed), true otherwise.
  + */
  +static bool
  +DoRefreshRemoteHandles(void)
  +{
  +      List                    *altered = NIL, *deleted = NIL, *added = NIL;
  +      Oid                             *coOids, *dnOids;
  +      int                             numCoords, numDNodes, total_nodes;
  +      bool                    res = true;
  +
  +      HandlesRefreshPending = false;
  +
  +      PgxcNodeGetOids(&coOids, &dnOids, &numCoords, &numDNodes, false);
  +
  +      total_nodes = numCoords + numDNodes;
  +      if (total_nodes > 0)
  +      {
  +              int             i;
  +              List   *shmoids = NIL;
  +              Oid        *allOids = (Oid *)palloc(total_nodes * sizeof(Oid));
  +
  +              /* build array with Oids of all nodes (coordinators first) */
  +              memcpy(allOids, coOids, numCoords * sizeof(Oid));
  +              memcpy(allOids + numCoords, dnOids, numDNodes * sizeof(Oid));
  +
  +              LWLockAcquire(NodeTableLock, LW_SHARED);
  +
  +              for (i = 0; i < total_nodes; i++)
  +              {
  +                      NodeDefinition  *nodeDef;
  +                      PGXCNodeHandle  *handle;
  +
  +                      int nid;
  +                      Oid nodeoid;
  +                      char ntype = PGXC_NODE_NONE;
  +
  +                      nodeoid = allOids[i];
  +                      shmoids = lappend_oid(shmoids, nodeoid);
  +
  +                      nodeDef = PgxcNodeGetDefinition(nodeoid);
  +                      /*
  +                       * identify an entry with this nodeoid. If found
  +                       * compare the name/host/port entries. If the name is
  +                       * same and other info is different, it's an ALTER.
  +                       * If the local entry does not exist in the shmem, it's
  +                       * a DELETE. If the entry from shmem does not exist
  +                       * locally, it's an ADDITION
  +                       */
  +                      nid = PGXCNodeGetNodeId(nodeoid, &ntype);
  +
  +                      if (nid == -1)
  +                      {
  +                              /* a new node has been added to the shmem */
  +                              added = lappend_oid(added, nodeoid);
  +                              elog(LOG, "Node added: name (%s) host (%s) port (%d)",
  +                                       NameStr(nodeDef->nodename), NameStr(nodeDef->nodehost),
  +                                       nodeDef->nodeport);
  +                      }
  +                      else
  +                      {
  +                              if (ntype == PGXC_NODE_COORDINATOR)
  +                                      handle = &co_handles[nid];
  +                              else if (ntype == PGXC_NODE_DATANODE)
  +                                      handle = &dn_handles[nid];
  +                              else
  +                                      elog(ERROR, "Node with non-existent node type!");
  +
  +                              /*
  +                               * compare name, host, port to see if this node
  +                               * has been ALTERed
  +                               */
  +                              if (strncmp(handle->nodename, NameStr(nodeDef->nodename), NAMEDATALEN) != 0 ||
  +                                      strncmp(handle->nodehost, NameStr(nodeDef->nodehost), NAMEDATALEN) != 0 ||
  +                                      handle->nodeport != nodeDef->nodeport)
  +                              {
  +                                      elog(LOG, "Node altered: old name (%s) old host (%s) old port (%d)"
  +                                                      " new name (%s) new host (%s) new port (%d)",
  +                                               handle->nodename, handle->nodehost, handle->nodeport,
  +                                               NameStr(nodeDef->nodename), NameStr(nodeDef->nodehost),
  +                                               nodeDef->nodeport);
  +                                      altered = lappend_oid(altered, nodeoid);
  +                              }
  +                              /* else do nothing */
  +                      }
  +                      pfree(nodeDef);
  +              }
  +
  +              /*
  +               * Any entry in backend area but not in shmem means that it has
  +               * been deleted
  +               */
  +              for (i = 0; i < NumCoords; i++)
  +              {
  +                      PGXCNodeHandle  *handle = &co_handles[i];
  +                      Oid nodeoid = handle->nodeoid;
  +
  +                      if (!list_member_oid(shmoids, nodeoid))
  +                      {
  +                              deleted = lappend_oid(deleted, nodeoid);
  +                              elog(LOG, "Node deleted: name (%s) host (%s) port (%d)",
  +                                       handle->nodename, handle->nodehost, handle->nodeport);
  +                      }
  +              }
  +
  +              for (i = 0; i < NumDataNodes; i++)
  +              {
  +                      PGXCNodeHandle  *handle = &dn_handles[i];
  +                      Oid nodeoid = handle->nodeoid;
  +
  +                      if (!list_member_oid(shmoids, nodeoid))
  +                      {
  +                              deleted = lappend_oid(deleted, nodeoid);
  +                              elog(LOG, "Node deleted: name (%s) host (%s) port (%d)",
  +                                       handle->nodename, handle->nodehost, handle->nodeport);
  +                      }
  +              }
  +
  +              LWLockRelease(NodeTableLock);
  +
  +              /* Release palloc'ed memory */
  +              pfree(coOids);
  +              pfree(dnOids);
  +              pfree(allOids);
  +              list_free(shmoids);
  +      }
  +
  +      if (deleted != NIL || added != NIL)
  +      {
  +              elog(LOG, "Nodes added/deleted. Reload needed!");
  +              res = false;
  +      }
  +
  +      /*
  +       * NOTE(review): the assignment below overrides the "res = false" set
  +       * above when nodes were added/deleted but none were altered -- confirm
  +       * that is intended.
  +       */
  +      if (altered == NIL)
  +      {
  +              elog(LOG, "No nodes altered. Returning");
  +              res = true;
  +      }
  +      else
  +              PgxcNodeRefreshBackendHandlesShmem(altered);
  +
  +      list_free(altered);
  +      list_free(added);
  +      list_free(deleted);
  +
  +      return res;
  +}
 +
  +/* Set a new connection state on the handle, logging the transition at DEBUG5 */
  +void
  +PGXCNodeSetConnectionState(PGXCNodeHandle *handle, DNConnectionState new_state)
  +{
  +      elog(DEBUG5, "Changing connection state for node %s, old state %d, "
  +                      "new state %d", handle->nodename, handle->state, new_state);
  +      handle->state = new_state;
  +}
 +
  +/*
  + * PgxcNodeDiffBackendHandles
  + *      Do a "Diff" of backend NODE metadata and the one present in catalog.
  + *
  + * We do this in order to identify if we should do a destructive
  + * cleanup or just invalidation of some specific handles.
  + *
  + * On return *nodes_alter, *nodes_delete and *nodes_add (each optional)
  + * receive lists of node OIDs that were altered, deleted or added.
  + * Returns false if this node itself was altered (the caller must resort
  + * to a full reload), true otherwise.
  + */
  +bool
  +PgxcNodeDiffBackendHandles(List **nodes_alter,
  +                         List **nodes_delete, List **nodes_add)
  +{
  +      Relation rel;
  +      HeapScanDesc scan;
  +      HeapTuple   tuple;
  +      int     i;
  +      List *altered = NIL, *added = NIL, *deleted = NIL;
  +      List *catoids = NIL;
  +      PGXCNodeHandle *handle;
  +      Oid     nodeoid;
  +      bool res = true;
  +
  +      LWLockAcquire(NodeTableLock, LW_SHARED);
  +
  +      rel = heap_open(PgxcNodeRelationId, AccessShareLock);
  +      scan = heap_beginscan(rel, SnapshotSelf, 0, NULL);
  +      while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
  +      {
  +              Form_pgxc_node  nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
  +              int nid;
  +              /* NOTE(review): shadows the function-level "nodeoid" variable */
  +              Oid nodeoid;
  +              char ntype = PGXC_NODE_NONE;
  +
  +              nodeoid = HeapTupleGetOid(tuple);
  +              catoids = lappend_oid(catoids, nodeoid);
  +
  +              /*
  +               * identify an entry with this nodeoid. If found
  +               * compare the name/host/port entries. If the name is
  +               * same and other info is different, it's an ALTER.
  +               * If the local entry does not exist in the catalog, it's
  +               * a DELETE. If the entry from catalog does not exist
  +               * locally, it's an ADDITION
  +               */
  +              nid = PGXCNodeGetNodeId(nodeoid, &ntype);
  +
  +              if (nid == -1)
  +              {
  +                      /* a new node has been added to the catalog */
  +                      added = lappend_oid(added, nodeoid);
  +                      elog(LOG, "Node added: name (%s) host (%s) port (%d)",
  +                               NameStr(nodeForm->node_name), NameStr(nodeForm->node_host),
  +                               nodeForm->node_port);
  +              }
  +              else
  +              {
  +                      if (ntype == PGXC_NODE_COORDINATOR)
  +                              handle = &co_handles[nid];
  +                      else if (ntype == PGXC_NODE_DATANODE)
  +                              handle = &dn_handles[nid];
  +                      else
  +                              elog(ERROR, "Node with non-existent node type!");
  +
  +                      /*
  +                       * compare name, host, port to see if this node
  +                       * has been ALTERed
  +                       */
  +                      if (strncmp(handle->nodename, NameStr(nodeForm->node_name), NAMEDATALEN)
  +                              != 0 ||
  +                              strncmp(handle->nodehost, NameStr(nodeForm->node_host), NAMEDATALEN)
  +                              != 0 ||
  +                              handle->nodeport != nodeForm->node_port)
  +                      {
  +                              elog(LOG, "Node altered: old name (%s) old host (%s) old port (%d)"
  +                                              " new name (%s) new host (%s) new port (%d)",
  +                                       handle->nodename, handle->nodehost, handle->nodeport,
  +                                       NameStr(nodeForm->node_name), NameStr(nodeForm->node_host),
  +                                       nodeForm->node_port);
  +                              /*
  +                               * If this node itself is being altered, then we need to
  +                               * resort to a reload. Check so..
  +                               */
  +                              if (pg_strcasecmp(PGXCNodeName,
  +                                                                NameStr(nodeForm->node_name)) == 0)
  +                              {
  +                                      res = false;
  +                              }
  +                              altered = lappend_oid(altered, nodeoid);
  +                      }
  +                      /* else do nothing */
  +              }
  +      }
  +      heap_endscan(scan);
  +
  +      /*
  +       * Any entry in backend area but not in catalog means that it has
  +       * been deleted
  +       */
  +      for (i = 0; i < NumCoords; i++)
  +      {
  +              handle = &co_handles[i];
  +              nodeoid = handle->nodeoid;
  +              if (!list_member_oid(catoids, nodeoid))
  +              {
  +                      deleted = lappend_oid(deleted, nodeoid);
  +                      elog(LOG, "Node deleted: name (%s) host (%s) port (%d)",
  +                               handle->nodename, handle->nodehost, handle->nodeport);
  +              }
  +      }
  +      for (i = 0; i < NumDataNodes; i++)
  +      {
  +              handle = &dn_handles[i];
  +              nodeoid = handle->nodeoid;
  +              if (!list_member_oid(catoids, nodeoid))
  +              {
  +                      deleted = lappend_oid(deleted, nodeoid);
  +                      elog(LOG, "Node deleted: name (%s) host (%s) port (%d)",
  +                               handle->nodename, handle->nodehost, handle->nodeport);
  +              }
  +      }
  +      heap_close(rel, AccessShareLock);
  +      LWLockRelease(NodeTableLock);
  +
  +      /* Hand lists back to the caller where requested */
  +      if (nodes_alter)
  +              *nodes_alter = altered;
  +      if (nodes_delete)
  +              *nodes_delete = deleted;
  +      if (nodes_add)
  +              *nodes_add = added;
  +
  +      if (catoids)
  +              list_free(catoids);
  +
  +      return res;
  +}
 +
  +/*
  + * PgxcNodeRefreshBackendHandlesShmem
  + *      Refresh the specific backend handles associated with the nodes in the
  + *      "nodes_alter" list below.
  + *
  + * The handles are refreshed using shared memory.
  + */
  +void
  +PgxcNodeRefreshBackendHandlesShmem(List *nodes_alter)
  +{
  +      ListCell *lc;
  +      Oid nodeoid;
  +      int nid;
  +      PGXCNodeHandle *handle = NULL;
  +
  +      foreach(lc, nodes_alter)
  +      {
  +              char ntype = PGXC_NODE_NONE;
  +              NodeDefinition *nodedef;
  +
  +              nodeoid = lfirst_oid(lc);
  +              nid = PGXCNodeGetNodeId(nodeoid, &ntype);
  +
  +              /* elog(ERROR) does not return, so "handle" is always set below */
  +              if (nid == -1)
  +                      elog(ERROR, "Looks like node metadata changed again");
  +              else
  +              {
  +                      if (ntype == PGXC_NODE_COORDINATOR)
  +                              handle = &co_handles[nid];
  +                      else if (ntype == PGXC_NODE_DATANODE)
  +                              handle = &dn_handles[nid];
  +                      else
  +                              elog(ERROR, "Node with non-existent node type!");
  +              }
  +
  +              /*
  +               * Update the local backend handle data with data from catalog
  +               * Free the handle first..
  +               */
  +              pgxc_node_free(handle);
  +              elog(LOG, "Backend (%u), Node (%s) updated locally",
  +                       MyBackendId, handle->nodename);
  +              nodedef = PgxcNodeGetDefinition(nodeoid);
  +              strncpy(handle->nodename, NameStr(nodedef->nodename), NAMEDATALEN);
  +              strncpy(handle->nodehost, NameStr(nodedef->nodehost), NAMEDATALEN);
  +              handle->nodeport = nodedef->nodeport;
  +              pfree(nodedef);
  +      }
  +      return;
  +}
 +
  +/*
  + * HandlePoolerMessages
  + *      Service pending pooler messages; currently only handle refreshes.
  + */
  +void
  +HandlePoolerMessages(void)
  +{
  +      if (HandlesRefreshPending)
  +      {
  +              DoRefreshRemoteHandles();
  +
  +              elog(LOG, "Backend (%u), doing handles refresh",
  +                       MyBackendId);
  +      }
  +      return;
  +}
index 336101419ec9fd4a5ae1a885609482d9491574e9,0000000000000000000000000000000000000000..140907d8728407d66b649459264b99483f4acbc6
mode 100644,000000..100644
--- /dev/null
@@@ -1,3043 -1,0 +1,3046 @@@
 +/*-------------------------------------------------------------------------
 + *
 + * poolmgr.c
 + *
 + *      Connection pool manager handles connections to Datanodes
 + *
 + * The pooler runs as a separate process and is forked off from a
 + * Coordinator postmaster. If the Coordinator needs a connection from a
 + * Datanode, it asks for one from the pooler, which maintains separate
 + * pools for each Datanode. A group of connections can be requested in
 + * a single request, and the pooler returns a list of file descriptors
 + * to use for the connections.
 + *
 + * Note the current implementation does not yet shrink the pool over time
 + * as connections are idle.  Also, it does not queue requests; if a
 + * connection is unavailable, it will simply fail. This should be implemented
 + * one day, although there is a chance for deadlocks. For now, limiting
 + * connections should be done between the application and Coordinator.
 + * Still, this is useful to avoid having to re-establish connections to the
 + * Datanodes all the time for multiple Coordinator backend sessions.
 + *
 + * The term "agent" here refers to a session manager, one for each backend
 + * Coordinator connection to the pooler. It will contain a list of connections
 + * allocated to a session, at most one per Datanode.
 + *
 + *
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
 + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
 + *
 + * IDENTIFICATION
 + *      $$
 + *
 + *-------------------------------------------------------------------------
 + */
 +#include <signal.h>
 +#include <stdlib.h>
 +#include <string.h>
 +#include <sys/types.h>
 +#include <sys/socket.h>
 +#include <poll.h>
++#include <math.h>
 +
 +#include "postgres.h"
 +
 +#include "access/xact.h"
 +#include "catalog/pgxc_node.h"
 +#include "commands/dbcommands.h"
 +#include "libpq/pqsignal.h"
 +#include "miscadmin.h"
 +#include "nodes/nodes.h"
 +#include "utils/builtins.h"
 +#include "utils/guc.h"
 +#include "utils/memutils.h"
 +#include "utils/lsyscache.h"
 +#include "utils/resowner.h"
 +#include "lib/stringinfo.h"
 +#include "libpq/pqformat.h"
 +#include "pgxc/locator.h"
 +#include "pgxc/nodemgr.h"
 +#include "pgxc/pause.h"
 +#include "pgxc/pgxc.h"
 +#include "pgxc/poolmgr.h"
 +#include "pgxc/poolutils.h"
 +#include "postmaster/postmaster.h"            /* For UnixSocketDir */
 +#include "storage/procarray.h"
++#include "utils/varlena.h"
 +
 +#include "../interfaces/libpq/libpq-fe.h"
 +#include "../interfaces/libpq/libpq-int.h"
 +
++
  +/* Configuration options (set via GUCs) */
  +int                   PoolConnKeepAlive = 600;
  +int                   PoolMaintenanceTimeout = 30;
  +int                   MaxPoolSize = 100;
  +int                   PoolerPort = 6667;
  +
  +bool                  PersistentConnections = false;
  +
  +/* Flag to tell if we are Postgres-XC pooler process */
  +static bool am_pgxc_pooler = false;
  +
  +/* Connection information cached for one node */
  +typedef struct
  +{
  +      Oid     nodeoid;
  +      char    *host;
  +      int     port;
  +} PGXCNodeConnectionInfo;
  +
  +/* Handle to the pool manager (Session's side) */
  +typedef struct
  +{
  +      /* communication channel */
  +      PoolPort        port;
  +} PoolHandle;
  +
  +/* The root memory context */
  +static MemoryContext PoolerMemoryContext = NULL;
  +/*
  + * Allocations of core objects: Datanode connections, upper level structures,
  + * connection strings, etc.
  + */
  +static MemoryContext PoolerCoreContext = NULL;
  +/*
  + * Memory to store Agents
  + */
  +static MemoryContext PoolerAgentContext = NULL;
  +
  +/* Pool to all the databases (linked list) */
  +static DatabasePool *databasePools = NULL;
  +
  +/* PoolAgents and the poll array */
  +static int    agentCount = 0;
  +static PoolAgent **poolAgents;
  +
  +static PoolHandle *poolHandle = NULL;
  +
  +/* NOTE(review): boolean value stored in an int variable */
  +static int    is_pool_locked = false;
  +static int    server_fd = -1;
 +
 +static int    node_info_check(PoolAgent *agent);
 +static void agent_init(PoolAgent *agent, const char *database, const char *user_name,
 +                         const char *pgoptions);
 +static void agent_destroy(PoolAgent *agent);
 +static void agent_create(void);
 +static void agent_handle_input(PoolAgent *agent, StringInfo s);
 +static DatabasePool *create_database_pool(const char *database, const char *user_name, const char *pgoptions);
 +static void insert_database_pool(DatabasePool *pool);
 +static int    destroy_database_pool(const char *database, const char *user_name);
 +static void reload_database_pools(PoolAgent *agent);
 +static int refresh_database_pools(PoolAgent *agent);
 +static bool remove_all_agent_references(Oid nodeoid);
 +static DatabasePool *find_database_pool(const char *database, const char *user_name, const char *pgoptions);
 +static DatabasePool *remove_database_pool(const char *database, const char *user_name);
 +static int *agent_acquire_connections(PoolAgent *agent, List *datanodelist,
 +              List *coordlist, int **connectionpids);
 +static int cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist);
 +static PGXCNodePoolSlot *acquire_connection(DatabasePool *dbPool, Oid node);
 +static void agent_release_connections(PoolAgent *agent, bool force_destroy);
 +static void release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
 +                                                         Oid node, bool force_destroy);
 +static void destroy_slot(PGXCNodePoolSlot *slot);
 +static PGXCNodePool *grow_pool(DatabasePool *dbPool, Oid node);
 +static void destroy_node_pool(PGXCNodePool *node_pool);
 +static void PoolerLoop(void);
 +static int clean_connection(List *node_discard,
 +                                                      const char *database,
 +                                                      const char *user_name);
 +static int *abort_pids(int *count,
 +                                         int pid,
 +                                         const char *database,
 +                                         const char *user_name);
 +static char *build_node_conn_str(Oid node, DatabasePool *dbPool);
 +/* Signal handlers */
 +static void pooler_die(SIGNAL_ARGS);
 +static void pooler_quickdie(SIGNAL_ARGS);
 +static void PoolManagerConnect(const char *database, const char *user_name,
 +              const char *pgoptions);
 +static void pooler_sighup(SIGNAL_ARGS);
 +static bool shrink_pool(DatabasePool *pool);
 +static void pools_maintenance(void);
 +static void TryPingUnhealthyNode(Oid nodeoid);
 +
  +/*
  + * Flags set by interrupt handlers for later service in the main loop.
  + * Declared volatile sig_atomic_t so they are safe to set from a signal handler.
  + */
  +static volatile sig_atomic_t got_SIGHUP = false;
  +static volatile sig_atomic_t shutdown_requested = false;
 +
  +/* Mark this process as the Postgres-XC pooler process */
  +void
  +PGXCPoolerProcessIam(void)
  +{
  +      am_pgxc_pooler = true;
  +}
 +
  +/* Return true if this process is the Postgres-XC pooler process */
  +bool
  +IsPGXCPoolerProcess(void)
  +{
  +    return am_pgxc_pooler;
  +}
 +
  +/*
  + * PoolManagerInit
  + *      Initialize internal pooler structures, install signal handlers and
  + *      enter the pooler main loop.  Does not return in normal operation.
  + */
  +int
  +PoolManagerInit()
  +{
  +      elog(DEBUG1, "Pooler process is started: %d", getpid());
  +
  +      /*
  +       * Set up memory contexts for the pooler objects
  +       */
  +      PoolerMemoryContext = AllocSetContextCreate(TopMemoryContext,
  +                                                                                              "PoolerMemoryContext",
  +                                                                                              ALLOCSET_DEFAULT_MINSIZE,
  +                                                                                              ALLOCSET_DEFAULT_INITSIZE,
  +                                                                                              ALLOCSET_DEFAULT_MAXSIZE);
  +      PoolerCoreContext = AllocSetContextCreate(PoolerMemoryContext,
  +                                                                                        "PoolerCoreContext",
  +                                                                                        ALLOCSET_DEFAULT_MINSIZE,
  +                                                                                        ALLOCSET_DEFAULT_INITSIZE,
  +                                                                                        ALLOCSET_DEFAULT_MAXSIZE);
  +      PoolerAgentContext = AllocSetContextCreate(PoolerMemoryContext,
  +                                                                                         "PoolerAgentContext",
  +                                                                                         ALLOCSET_DEFAULT_MINSIZE,
  +                                                                                         ALLOCSET_DEFAULT_INITSIZE,
  +                                                                                         ALLOCSET_DEFAULT_MAXSIZE);
  +
  +      ForgetLockFiles();      
  +
  +      /*
  +       * Properly accept or ignore signals the postmaster might send us
  +       */
  +      pqsignal(SIGINT, pooler_die);
  +      pqsignal(SIGTERM, pooler_die);
  +      pqsignal(SIGQUIT, pooler_quickdie);
  +      pqsignal(SIGHUP, pooler_sighup);
  +      /* TODO other signal handlers */
  +
  +      /* We allow SIGQUIT (quickdie) at all times */
  +      sigdelset(&BlockSig, SIGQUIT);
  +
  +      /*
  +       * Unblock signals (they were blocked when the postmaster forked us)
  +       */
  +      PG_SETMASK(&UnBlockSig);
  +
  +      /* Allocate pooler structures in the Pooler context */
  +      MemoryContextSwitchTo(PoolerMemoryContext);
  +
  +      /*
  +       * NOTE(review): palloc() ereports on OOM and never returns NULL, so
  +       * this check is likely dead code -- confirm before relying on it.
  +       */
  +      poolAgents = (PoolAgent **) palloc(MaxConnections * sizeof(PoolAgent *));
  +      if (poolAgents == NULL)
  +      {
  +              ereport(ERROR,
  +                              (errcode(ERRCODE_OUT_OF_MEMORY),
  +                               errmsg("out of memory while initializing pool agents")));
  +      }
  +
  +      PoolerLoop();
  +      return 0;
  +}
 +
 +
 +/*
 + * Check connection info consistency with system catalogs
 + *
 + * Returns POOL_CHECK_SUCCESS when the agent's cached coordinator/datanode
 + * OID arrays match the shared-memory node table AND every node pool's
 + * stored connection string still matches what the catalogs would produce;
 + * POOL_CHECK_FAILED otherwise.
 + */
 +static int
 +node_info_check(PoolAgent *agent)
 +{
 +      DatabasePool   *dbPool = databasePools;
 +      List               *checked = NIL;
 +      int                     res = POOL_CHECK_SUCCESS;
 +      Oid                        *coOids;
 +      Oid                        *dnOids;
 +      int                             numCo;
 +      int                             numDn;
 +
 +      /*
 +       * First check if agent's node information matches to current content of the
 +       * shared memory table.
 +       */
 +      PgxcNodeGetOids(&coOids, &dnOids, &numCo, &numDn, false);
 +
 +      if (agent->num_coord_connections != numCo ||
 +                      agent->num_dn_connections != numDn ||
 +                      memcmp(agent->coord_conn_oids, coOids, numCo * sizeof(Oid)) ||
 +                      memcmp(agent->dn_conn_oids, dnOids, numDn * sizeof(Oid)))
 +              res = POOL_CHECK_FAILED;
 +
 +      /* Release palloc'ed memory */
 +      pfree(coOids);
 +      pfree(dnOids);
 +
 +      /*
 +       * Iterate over all dbnode pools and check if connection strings
 +       * are matching node definitions.  The "checked" list avoids re-building
 +       * the connection string for a node that appears in several db pools.
 +       */
 +      while (res == POOL_CHECK_SUCCESS && dbPool)
 +      {
 +              HASH_SEQ_STATUS hseq_status;
 +              PGXCNodePool   *nodePool;
 +
 +              hash_seq_init(&hseq_status, dbPool->nodePools);
 +              while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)))
 +              {
 +                      char               *connstr_chk;
 +
 +                      /* No need to check same Datanode twice */
 +                      if (list_member_oid(checked, nodePool->nodeoid))
 +                              continue;
 +                      checked = lappend_oid(checked, nodePool->nodeoid);
 +
 +                      connstr_chk = build_node_conn_str(nodePool->nodeoid, dbPool);
 +                      if (connstr_chk == NULL)
 +                      {
 +                              /* Problem of constructing connection string */
 +                              /* hash_seq_term() is required when abandoning an unfinished scan */
 +                              hash_seq_term(&hseq_status);
 +                              res = POOL_CHECK_FAILED;
 +                              break;
 +                      }
 +                      /* return error if there is difference */
 +                      if (strcmp(connstr_chk, nodePool->connstr))
 +                      {
 +                              pfree(connstr_chk);
 +                              hash_seq_term(&hseq_status);
 +                              res = POOL_CHECK_FAILED;
 +                              break;
 +                      }
 +
 +                      pfree(connstr_chk);
 +              }
 +              dbPool = dbPool->next;
 +      }
 +      list_free(checked);
 +      return res;
 +}
 +
 +/*
 + * Destroy internal structures
 + *
 + * Deletes PoolerMemoryContext, which recursively frees every pooler
 + * structure allocated in it or its child contexts.  Always returns 0;
 + * the int return is kept only for the callers' convention.
 + */
 +int
 +PoolManagerDestroy(void)
 +{
 +      int                     status = 0;
 +
 +      if (PoolerMemoryContext)
 +      {
 +              MemoryContextDelete(PoolerMemoryContext);
 +              PoolerMemoryContext = NULL;
 +      }
 +
 +      return status;
 +}
 +
 +/*
 + * Connect to the pooler process
 + *
 + * Tries each directory from unix_socket_directories in turn until a
 + * connection succeeds, then allocates the global poolHandle.  No-op if
 + * already connected.
 + */
 +static void
 +GetPoolManagerHandle(void)
 +{
 +      PoolHandle *handle;
 +      int                     fdsock = -1;
 +
 +      if (poolHandle)
 +              /* already connected */
 +              return;
 +
 +#ifdef HAVE_UNIX_SOCKETS
 +      if (Unix_socket_directories)
 +      {
 +              char       *rawstring;
 +              List       *elemlist;
 +              ListCell   *l;
 +              int                     success = 0;
 +
 +              /* Need a modifiable copy of Unix_socket_directories */
 +              rawstring = pstrdup(Unix_socket_directories);
 +
 +              /* Parse string into list of directories */
 +              if (!SplitDirectoriesString(rawstring, ',', &elemlist))
 +              {
 +                      /* syntax error in list */
 +                      ereport(FATAL,
 +                                      (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 +                                       errmsg("invalid list syntax in parameter \"%s\"",
 +                                                      "unix_socket_directories")));
 +              }
 +
 +              /* Stop at the first directory that yields a working connection */
 +              foreach(l, elemlist)
 +              {
 +                      char       *socketdir = (char *) lfirst(l);
 +                      int                     saved_errno;
 +
 +                      /* Connect to the pooler */
 +                      fdsock = pool_connect(PoolerPort, socketdir);
 +                      if (fdsock < 0)
 +                      {
 +                              saved_errno = errno;
 +                              ereport(WARNING,
 +                                              (errmsg("could not create Unix-domain socket in directory \"%s\", errno: %d",
 +                                                              socketdir, saved_errno)));
 +                      }
 +                      else
 +                      {
 +                              success++;
 +                              break;
 +                      }
 +              }
 +
 +              /*
 +               * NOTE(review): %m reports the current errno, which may have been
 +               * clobbered by the WARNING above rather than reflecting the actual
 +               * pool_connect() failure -- confirm and consider using saved_errno.
 +               */
 +              if (!success && elemlist != NIL)
 +                      ereport(ERROR,
 +                                      (errmsg("failed to connect to pool manager: %m")));
 +
 +              list_free_deep(elemlist);
 +              pfree(rawstring);
 +      }
 +#endif
 +
 +      /*
 +       * Actual connection errors should be reported by the block above,
 +       * but perhaps we haven't actually executed it - either because
 +       * the Unix_socket_directories is not set, or because there's no
 +       * support for UNIX_SOCKETS. Just bail out in that case.
 +       */
 +      if (fdsock < 0)
 +              ereport(ERROR,
 +                              (errmsg("failed to connect to pool manager: %m")));
 +
 +      /*
 +       * Allocate handle
 +       *
 +       * XXX we may change malloc here to palloc but first ensure
 +       * the CurrentMemoryContext is properly set.
 +       * The handle allocated just before new session is forked off and
 +       * inherited by the session process. It should remain valid for all
 +       * the session lifetime.
 +       */
 +      handle = (PoolHandle *) malloc(sizeof(PoolHandle));
 +      if (!handle)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_OUT_OF_MEMORY),
 +                               errmsg("out of memory")));
 +
 +      /* Fresh handle: empty send/receive buffers */
 +      handle->port.fdsock = fdsock;
 +      handle->port.RecvLength = 0;
 +      handle->port.RecvPointer = 0;
 +      handle->port.SendPointer = 0;
 +
 +      poolHandle = handle;
 +}
 +
 +/*
 + * Create agent
 + */
 +static void
 +agent_create(void)
 +{
 +      MemoryContext oldcontext;
 +      int                     new_fd;
 +      PoolAgent  *agent;
 +
 +      new_fd = accept(server_fd, NULL, NULL);
 +      if (new_fd < 0)
 +      {
 +              int                     saved_errno = errno;
 +
 +              ereport(LOG,
 +                              (errcode(ERRCODE_CONNECTION_FAILURE),
 +                               errmsg("pool manager failed to accept connection: %m")));
 +              errno = saved_errno;
 +              return;
 +      }
 +
 +      oldcontext = MemoryContextSwitchTo(PoolerAgentContext);
 +
 +      /* Allocate agent */
 +      agent = (PoolAgent *) palloc(sizeof(PoolAgent));
 +      if (!agent)
 +      {
 +              close(new_fd);
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_OUT_OF_MEMORY),
 +                               errmsg("out of memory")));
 +              return;
 +      }
 +
 +      agent->port.fdsock = new_fd;
 +      agent->port.RecvLength = 0;
 +      agent->port.RecvPointer = 0;
 +      agent->port.SendPointer = 0;
 +      agent->pool = NULL;
 +      agent->mcxt = AllocSetContextCreate(CurrentMemoryContext,
 +                                                                              "Agent",
 +                                                                              ALLOCSET_DEFAULT_MINSIZE,
 +                                                                              ALLOCSET_DEFAULT_INITSIZE,
 +                                                                              ALLOCSET_DEFAULT_MAXSIZE);
 +      agent->num_dn_connections = 0;
 +      agent->num_coord_connections = 0;
 +      agent->dn_conn_oids = NULL;
 +      agent->coord_conn_oids = NULL;
 +      agent->dn_connections = NULL;
 +      agent->coord_connections = NULL;
 +      agent->pid = 0;
 +
 +      /* Append new agent to the list */
 +      poolAgents[agentCount++] = agent;
 +
 +      MemoryContextSwitchTo(oldcontext);
 +}
 +
 +
 +/*
 + * session_options
 + * Returns the pgoptions string generated using a particular
 + * list of parameters that are required to be propagated to Datanodes.
 + * These parameters then become default values for the pooler sessions.
 + * For e.g., a psql user sets PGDATESTYLE. This value should be set
 + * as the default connection parameter in the pooler session that is
 + * connected to the Datanodes. There are various parameters which need to
 + * be analysed individually to determine whether these should be set on
 + * Datanodes.
 + *
 + * Note: These parameters values are the default values of the particular
 + * Coordinator backend session, and not the new values set by SET command.
 + *
 + */
 +
 +char *session_options(void)
 +{
 +      int                              i;
 +      char                    *pgoptions[] = {"DateStyle", "timezone", "geqo", "intervalstyle", "lc_monetary"};
 +      StringInfoData   options;
 +      List                    *value_list;
 +      ListCell                *l;
 +
 +      initStringInfo(&options);
 +
 +      for (i = 0; i < sizeof(pgoptions)/sizeof(char*); i++)
 +      {
 +              const char              *value;
 +
 +              appendStringInfo(&options, " -c %s=", pgoptions[i]);
 +
 +              value = GetConfigOptionResetString(pgoptions[i]);
 +
 +              /* lc_monetary does not accept lower case values */
 +              if (strcmp(pgoptions[i], "lc_monetary") == 0)
 +              {
 +                      appendStringInfoString(&options, value);
 +                      continue;
 +              }
 +
 +              SplitIdentifierString(strdup(value), ',', &value_list);
 +              foreach(l, value_list)
 +              {
 +                      char *value = (char *) lfirst(l);
 +                      appendStringInfoString(&options, value);
 +                      if (lnext(l))
 +                              appendStringInfoChar(&options, ',');
 +              }
 +      }
 +
 +      return options.data;
 +}
 +
 +
 +/*
 + * Associate session with specified database and respective connection pool
 + * Invoked from Session process
 + */
 +static void
 +PoolManagerConnect(const char *database, const char *user_name,
 +              const char *pgoptions)
 +{
 +      int     n32;
 +      char    msgtype = 'c';
 +      int     unamelen = strlen(user_name);
 +      int     dbnamelen = strlen(database);
 +      int             pgoptionslen = strlen(pgoptions);
 +      char    atchar = ' ';
 +
 +      /* Connect to the pooler process if not yet connected */
 +      GetPoolManagerHandle();
 +      if (poolHandle == NULL)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                               errmsg("failed to connect to the pooler process")));
 +
 +      elog(DEBUG1, "Connecting to PoolManager (user_name %s, database %s, "
 +                      "pgoptions %s", user_name, database, pgoptions);
 +
 +      /*
 +       * Special handling for db_user_namespace=on
 +       * We need to handle per-db users and global users. The per-db users will
 +       * arrive with @dbname and global users just as username. Handle both of
 +       * them appropriately
 +       */
 +      if (strcmp(GetConfigOption("db_user_namespace", false, false), "on") == 0)
 +      {
 +              if (strchr(user_name, '@') != NULL)
 +              {
 +                      Assert(unamelen > dbnamelen + 1);
 +                      unamelen -= (dbnamelen + 1);
 +              }
 +              else
 +              {
 +                      atchar = '@';
 +                      unamelen++;
 +              }
 +      }
 +
 +      /* Message type */
 +      pool_putbytes(&poolHandle->port, &msgtype, 1);
 +
 +      /* Message length */
 +      n32 = htonl(dbnamelen + unamelen + pgoptionslen + 23);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +
 +      /* PID number */
 +      n32 = htonl(MyProcPid);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +
 +      /* Length of Database string */
 +      n32 = htonl(dbnamelen + 1);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +
 +      /* Send database name followed by \0 terminator */
 +      pool_putbytes(&poolHandle->port, database, dbnamelen);
 +      pool_putbytes(&poolHandle->port, "\0", 1);
 +
 +      /* Length of user name string */
 +      n32 = htonl(unamelen + 1);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +
 +      /* Send user name followed by \0 terminator */
 +      /* Send the '@' char if needed. Already accounted for in len */
 +      if (atchar == '@')
 +      {
 +              pool_putbytes(&poolHandle->port, user_name, unamelen - 1);
 +              pool_putbytes(&poolHandle->port, "@", 1);
 +      }
 +      else
 +              pool_putbytes(&poolHandle->port, user_name, unamelen);
 +      pool_putbytes(&poolHandle->port, "\0", 1);
 +
 +      /* Length of pgoptions string */
 +      n32 = htonl(pgoptionslen + 1);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +
 +      /* Send pgoptions followed by \0 terminator */
 +      pool_putbytes(&poolHandle->port, pgoptions, pgoptionslen);
 +      pool_putbytes(&poolHandle->port, "\0", 1);
 +      pool_flush(&poolHandle->port);
 +}
 +
 +/*
 + * Reconnect to pool manager
 + * It simply does a disconnection and a reconnection.
 + *
 + * The new connection is re-registered with the current database, cluster
 + * user and the session's default GUC options (see session_options()).
 + */
 +void
 +PoolManagerReconnect(void)
 +{
 +      elog(DEBUG1, "Reconnecting to PoolManager");
 +
 +      /* Connected, disconnect */
 +      if (poolHandle)
 +              PoolManagerDisconnect();
 +
 +      PoolManagerConnect(get_database_name(MyDatabaseId), GetClusterUserName(),
 +                      session_options());
 +}
 +
 +/*
 + * Lock/unlock pool manager
 + * During locking, the only operations not permitted are abort, connection and
 + * connection obtention.
 + *
 + * Sends the 'o' message: 4-byte length word (8 = itself + payload)
 + * followed by a 4-byte lock flag (1 = lock, 0 = unlock).
 + */
 +void
 +PoolManagerLock(bool is_lock)
 +{
 +      char msgtype = 'o';
 +      int n32;
 +      int msglen = 8;
 +
 +      /* Connect on demand with empty pgoptions */
 +      if (poolHandle == NULL)
 +              PoolManagerConnect(get_database_name(MyDatabaseId),
 +                                                 GetClusterUserName(), "");
 +
 +      elog(DEBUG1, "Locking PoolManager");
 +
 +      /* Message type */
 +      pool_putbytes(&poolHandle->port, &msgtype, 1);
 +
 +      /* Message length */
 +      n32 = htonl(msglen);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +
 +      /* Lock information */
 +      n32 = htonl((int) is_lock);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +      pool_flush(&poolHandle->port);
 +}
 +
 +/*
 + * Init PoolAgent
 + *
 + * Binds the agent to (database, user_name, pgoptions): releases any
 + * connections still held, (re)loads the node OID arrays from the shared
 + * node table, allocates empty per-node slot arrays in the agent's private
 + * memory context, and looks up -- or creates -- the matching database pool.
 + * NOTE(review): PgxcNodeGetOids()/palloc0() can ereport mid-way, leaving
 + * the agent with fresh OID arrays but no pool -- confirm callers treat
 + * that as a fatal agent error.
 + */
 +static void
 +agent_init(PoolAgent *agent, const char *database, const char *user_name,
 +           const char *pgoptions)
 +{
 +      MemoryContext oldcontext;
 +
 +      Assert(agent);
 +      Assert(database);
 +      Assert(user_name);
 +
 +      /* disconnect if we are still connected */
 +      if (agent->pool)
 +              agent_release_connections(agent, false);
 +
 +      oldcontext = MemoryContextSwitchTo(agent->mcxt);
 +
 +      /* Get needed info and allocate memory */
 +      PgxcNodeGetOids(&agent->coord_conn_oids, &agent->dn_conn_oids,
 +                                      &agent->num_coord_connections, &agent->num_dn_connections, false);
 +
 +      agent->coord_connections = (PGXCNodePoolSlot **)
 +                      palloc0(agent->num_coord_connections * sizeof(PGXCNodePoolSlot *));
 +      agent->dn_connections = (PGXCNodePoolSlot **)
 +                      palloc0(agent->num_dn_connections * sizeof(PGXCNodePoolSlot *));
 +      /* find database */
 +      agent->pool = find_database_pool(database, user_name, pgoptions);
 +
 +      /* create if not found */
 +      if (agent->pool == NULL)
 +              agent->pool = create_database_pool(database, user_name, pgoptions);
 +
 +      MemoryContextSwitchTo(oldcontext);
 +
 +      return;
 +}
 +
 +/*
 + * Destroy PoolAgent
 + *
 + * Closes the session socket, force-releases any remaining connections,
 + * deletes the agent's memory context and removes the agent from the
 + * poolAgents array.  Removal is swap-with-last (O(1)), so the array order
 + * is not preserved.
 + */
 +static void
 +agent_destroy(PoolAgent *agent)
 +{
 +      int     i;
 +
 +      Assert(agent);
 +
 +      close(Socket(agent->port));
 +
 +      /* Discard connections if any remaining */
 +      if (agent->pool)
 +      {
 +              /*
 +               * If session is disconnecting while there are active connections
 +               * we can not know if they clean or not, so force destroy them
 +               */
 +              agent_release_connections(agent, true);
 +      }
 +
 +      /* find agent in the list (linear scan; agentCount is small) */
 +      for (i = 0; i < agentCount; i++)
 +      {
 +              if (poolAgents[i] == agent)
 +              {
 +                      /* Free memory. All connection slots are NULL at this point */
 +                      MemoryContextDelete(agent->mcxt);
 +
 +                      pfree(agent);
 +                      /* shrink the list and move last agent into the freed slot */
 +                      if (i < --agentCount)
 +                              poolAgents[i] = poolAgents[agentCount];
 +                      /* only one match is expected so exit */
 +                      break;
 +              }
 +      }
 +}
 +
 +/*
 + * Ping an UNHEALTHY node and if it succeeds, update SHARED node
 + * information
 + *
 + * Looks up the node definition, pings host:port via PGXCNodePing(),
 + * and on success (status == 0) flips the node's health bit to healthy.
 + * Silently returns if the node definition is gone or already healthy.
 + */
 +static void
 +TryPingUnhealthyNode(Oid nodeoid)
 +{
 +      int status;
 +      NodeDefinition *nodeDef;
 +      char connstr[MAXPGPATH * 2 + 256];
 +
 +      nodeDef = PgxcNodeGetDefinition(nodeoid);
 +      if (nodeDef == NULL)
 +      {
 +              /* No such definition, node dropped? */
 +              elog(DEBUG1, "Could not find node (%u) definition,"
 +                       " skipping health check", nodeoid);
 +              return;
 +      }
 +      if (nodeDef->nodeishealthy)
 +      {
 +              /* hmm, can this happen? */
 +              elog(DEBUG1, "node (%u) healthy!"
 +                       " skipping health check", nodeoid);
 +              return;
 +      }
 +
 +      elog(LOG, "node (%s:%u) down! Trying ping",
 +               NameStr(nodeDef->nodename), nodeoid);
 +      sprintf(connstr,
 +                      "host=%s port=%d", NameStr(nodeDef->nodehost),
 +                      nodeDef->nodeport);
 +      status = PGXCNodePing(connstr);
 +      if (status != 0)
 +      {
 +              /* still down; leave the health bit alone */
 +              pfree(nodeDef);
 +              return;
 +      }
 +
 +      elog(DEBUG1, "Node (%s) back online!", NameStr(nodeDef->nodename));
 +      if (!PgxcNodeUpdateHealth(nodeoid, true))
 +              elog(WARNING, "Could not update health status of node (%s)",
 +                       NameStr(nodeDef->nodename));
 +      else
 +              elog(LOG, "Health map updated to reflect HEALTHY node (%s)",
 +                       NameStr(nodeDef->nodename));
 +      pfree(nodeDef);
 +
 +      return;
 +}
 +
 +/*
 + * Check if a node is indeed down and if it is update its UNHEALTHY
 + * status
 + *
 + * Pings the node and updates the shared health map only when the ping
 + * outcome differs from the currently recorded health bit (works in both
 + * directions: healthy->unhealthy and unhealthy->healthy).
 + */
 +void
 +PoolPingNodeRecheck(Oid nodeoid)
 +{
 +      int status;
 +      NodeDefinition *nodeDef;
 +      char connstr[MAXPGPATH * 2 + 256];
 +      bool    healthy;
 +
 +      nodeDef = PgxcNodeGetDefinition(nodeoid);
 +      if (nodeDef == NULL)
 +      {
 +              /* No such definition, node dropped? */
 +              elog(DEBUG1, "Could not find node (%u) definition,"
 +                       " skipping health check", nodeoid);
 +              return;
 +      }
 +
 +      sprintf(connstr,
 +                      "host=%s port=%d", NameStr(nodeDef->nodehost),
 +                      nodeDef->nodeport);
 +      status = PGXCNodePing(connstr);
 +      healthy = (status == 0);
 +
 +      /* if no change in health bit, return */
 +      if (healthy == nodeDef->nodeishealthy)
 +      {
 +              pfree(nodeDef);
 +              return;
 +      }
 +
 +      if (!PgxcNodeUpdateHealth(nodeoid, healthy))
 +              elog(WARNING, "Could not update health status of node (%s)",
 +                       NameStr(nodeDef->nodename));
 +      else
 +              elog(LOG, "Health map updated to reflect (%s) node (%s)",
 +                       healthy ? "HEALTHY" : "UNHEALTHY", NameStr(nodeDef->nodename));
 +      pfree(nodeDef);
 +
 +      return;
 +}
 +
 +/*
 + * Ping UNHEALTHY nodes as part of the maintenance window
 + *
 + * Reads the shared health map for all coordinators and datanodes and
 + * re-pings every node currently marked unhealthy, so nodes that came
 + * back are flipped to healthy (see TryPingUnhealthyNode).
 + */
 +void
 +PoolPingNodes()
 +{
 +      Oid                             coOids[MaxCoords];
 +      Oid                             dnOids[MaxDataNodes];
 +      bool                    coHealthMap[MaxCoords];
 +      bool                    dnHealthMap[MaxDataNodes];
 +      int                             numCo;
 +      int                             numDn;
 +      int                             i;
 +
 +      PgxcNodeGetHealthMap(coOids, dnOids, &numCo, &numDn,
 +                                               coHealthMap, dnHealthMap);
 +
 +      /*
 +       * Find unhealthy datanodes and try to re-ping them
 +       */
 +      for (i = 0; i < numDn; i++)
 +      {
 +              if (!dnHealthMap[i])
 +              {
 +                      Oid      nodeoid = dnOids[i];
 +                      TryPingUnhealthyNode(nodeoid);
 +              }
 +      }
 +      /*
 +       * Find unhealthy coordinators and try to re-ping them
 +       */
 +      for (i = 0; i < numCo; i++)
 +      {
 +              if (!coHealthMap[i])
 +              {
 +                      Oid      nodeoid = coOids[i];
 +                      TryPingUnhealthyNode(nodeoid);
 +              }
 +      }
 +}
 +
 +/*
 + * Release handle to pool manager
 + *
 + * Sends the 'd' (disconnect) message, closes the socket and frees the
 + * handle.  free() matches the malloc() in GetPoolManagerHandle().
 + */
 +void
 +PoolManagerDisconnect(void)
 +{
 +      if (!poolHandle)
 +              return; /* not even connected */
 +
 +      pool_putmessage(&poolHandle->port, 'd', NULL, 0);
 +      pool_flush(&poolHandle->port);
 +
 +      close(Socket(poolHandle->port));
 +      free(poolHandle);
 +      poolHandle = NULL;
 +}
 +
 +
 +/*
 + * Get pooled connections
 + */
 +int *
 +PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
 +{
 +      int                     i;
 +      ListCell   *nodelist_item;
 +      int                *fds;
 +      int                     totlen = list_length(datanodelist) + list_length(coordlist);
 +      int                     nodes[totlen + 2];
 +
 +      if (poolHandle == NULL)
 +              PoolManagerConnect(get_database_name(MyDatabaseId),
 +                                                 GetClusterUserName(), session_options());
 +
 +      /*
 +       * Prepare end send message to pool manager.
 +       * First with Datanode list.
 +       * This list can be NULL for a query that does not need
 +       * Datanode Connections (Sequence DDLs)
 +       */
 +      nodes[0] = htonl(list_length(datanodelist));
 +      i = 1;
 +      if (list_length(datanodelist) != 0)
 +      {
 +              foreach(nodelist_item, datanodelist)
 +              {
 +                      nodes[i++] = htonl(lfirst_int(nodelist_item));
 +              }
 +      }
 +      /* Then with Coordinator list (can be nul) */
 +      nodes[i++] = htonl(list_length(coordlist));
 +      if (list_length(coordlist) != 0)
 +      {
 +              foreach(nodelist_item, coordlist)
 +              {
 +                      nodes[i++] = htonl(lfirst_int(nodelist_item));
 +              }
 +      }
 +
 +      pool_putmessage(&poolHandle->port, 'g', (char *) nodes, sizeof(int) * (totlen + 2));
 +      pool_flush(&poolHandle->port);
 +
 +      /* Receive response */
 +      fds = (int *) palloc(sizeof(int) * totlen);
 +      if (fds == NULL)
 +      {
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_OUT_OF_MEMORY),
 +                               errmsg("out of memory")));
 +      }
 +      if (pool_recvfds(&poolHandle->port, fds, totlen))
 +      {
 +              pfree(fds);
 +              fds = NULL;
 +      }
 +
 +      if (pool_recvpids(&poolHandle->port, pids) != totlen)
 +      {
 +              pfree(*pids);
 +              *pids = NULL;
 +              return NULL;
 +      }
 +
 +      return fds;
 +}
 +
 +/*
 + * Abort active transactions using pooler.
 + * Take a lock forbidding access to Pooler for new transactions.
 + *
 + * Sends the 'a' message (msglen = dblen + userlen + 12: the length word
 + * plus two 4-byte string-length fields) and returns the number of backend
 + * pids received back in *proc_pids.  dbname/username may be NULL, in which
 + * case a zero length is sent and no string follows.
 + */
 +int
 +PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids)
 +{
 +      int             num_proc_ids = 0;
 +      int             n32, msglen;
 +      char            msgtype = 'a';
 +      int             dblen = dbname ? strlen(dbname) + 1 : 0;
 +      int             userlen = username ? strlen(username) + 1 : 0;
 +
 +      /*
 +       * New connection may be established to clean connections to
 +       * specified nodes and databases.
 +       */
 +      if (poolHandle == NULL)
 +              PoolManagerConnect(get_database_name(MyDatabaseId),
 +                                                 GetClusterUserName(), session_options());
 +
 +      /* Message type */
 +      pool_putbytes(&poolHandle->port, &msgtype, 1);
 +
 +      /* Message length */
 +      msglen = dblen + userlen + 12;
 +      n32 = htonl(msglen);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +
 +      /* Length of Database string */
 +      n32 = htonl(dblen);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +
 +      /* Send database name, followed by \0 terminator if necessary */
 +      if (dbname)
 +              pool_putbytes(&poolHandle->port, dbname, dblen);
 +
 +      /* Length of Username string */
 +      n32 = htonl(userlen);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +
 +      /* Send user name, followed by \0 terminator if necessary */
 +      if (username)
 +              pool_putbytes(&poolHandle->port, username, userlen);
 +
 +      pool_flush(&poolHandle->port);
 +
 +      /* Then Get back Pids from Pooler */
 +      num_proc_ids = pool_recvpids(&poolHandle->port, proc_pids);
 +
 +      return num_proc_ids;
 +}
 +
 +
 +/*
 + * Clean up Pooled connections
 + */
 +void
 +PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, char *username)
 +{
 +      int                     totlen = list_length(datanodelist) + list_length(coordlist);
 +      int                     nodes[totlen + 2];
 +      ListCell                *nodelist_item;
 +      int                     i, n32, msglen;
 +      char                    msgtype = 'f';
 +      int                     userlen = username ? strlen(username) + 1 : 0;
 +      int                     dblen = dbname ? strlen(dbname) + 1 : 0;
 +
 +      /*
 +       * New connection may be established to clean connections to
 +       * specified nodes and databases.
 +       */
 +      if (poolHandle == NULL)
 +              PoolManagerConnect(get_database_name(MyDatabaseId),
 +                                                 GetClusterUserName(), session_options());
 +
 +      nodes[0] = htonl(list_length(datanodelist));
 +      i = 1;
 +      if (list_length(datanodelist) != 0)
 +      {
 +              foreach(nodelist_item, datanodelist)
 +              {
 +                      nodes[i++] = htonl(lfirst_int(nodelist_item));
 +              }
 +      }
 +      /* Then with Coordinator list (can be nul) */
 +      nodes[i++] = htonl(list_length(coordlist));
 +      if (list_length(coordlist) != 0)
 +      {
 +              foreach(nodelist_item, coordlist)
 +              {
 +                      nodes[i++] = htonl(lfirst_int(nodelist_item));
 +              }
 +      }
 +
 +      /* Message type */
 +      pool_putbytes(&poolHandle->port, &msgtype, 1);
 +
 +      /* Message length */
 +      msglen = sizeof(int) * (totlen + 2) + dblen + userlen + 12;
 +      n32 = htonl(msglen);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +
 +      /* Send list of nodes */
 +      pool_putbytes(&poolHandle->port, (char *) nodes, sizeof(int) * (totlen + 2));
 +
 +      /* Length of Database string */
 +      n32 = htonl(dblen);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +
 +      /* Send database name, followed by \0 terminator if necessary */
 +      if (dbname)
 +              pool_putbytes(&poolHandle->port, dbname, dblen);
 +
 +      /* Length of Username string */
 +      n32 = htonl(userlen);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +
 +      /* Send user name, followed by \0 terminator if necessary */
 +      if (username)
 +              pool_putbytes(&poolHandle->port, username, userlen);
 +
 +      pool_flush(&poolHandle->port);
 +
 +      /* Receive result message */
 +      if (pool_recvres(&poolHandle->port) != CLEAN_CONNECTION_COMPLETED)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                               errmsg("Clean connections not completed")));
 +}
 +
 +
 +/*
 + * Check connection information consistency cached in pooler with catalog information
 + */
 +bool
 +PoolManagerCheckConnectionInfo(void)
 +{
 +      int res;
 +
 +      /*
 +       * New connection may be established to clean connections to
 +       * specified nodes and databases.
 +       */
 +      if (poolHandle == NULL)
 +              PoolManagerConnect(get_database_name(MyDatabaseId),
 +                                                 GetClusterUserName(), session_options());
 +      PgxcNodeListAndCount();
 +      pool_putmessage(&poolHandle->port, 'q', NULL, 0);
 +      pool_flush(&poolHandle->port);
 +
 +      res = pool_recvres(&poolHandle->port);
 +
 +      if (res == POOL_CHECK_SUCCESS)
 +              return true;
 +
 +      return false;
 +}
 +
 +
 +/*
 + * Reload connection data in pooler and drop all the existing connections of pooler
 + */
 +void
 +PoolManagerReloadConnectionInfo(void)
 +{
 +      Assert(poolHandle);
 +      PgxcNodeListAndCount();
 +      pool_putmessage(&poolHandle->port, 'p', NULL, 0);
 +      pool_flush(&poolHandle->port);
 +}
 +
 +/*
 + * Refresh connection data in pooler and drop connections for those nodes
 + * that have changed. Thus, this operation is less destructive as compared
 + * to PoolManagerReloadConnectionInfo and should typically be called when
 + * NODE ALTER has been performed
 + *
 + * NOTE(review): declared int but returns true/false (1/0); callers appear
 + * to treat it as a boolean -- consider changing the declaration to bool.
 + */
 +int
 +PoolManagerRefreshConnectionInfo(void)
 +{
 +      int res;
 +
 +      Assert(poolHandle);
 +      PgxcNodeListAndCount();
 +      pool_putmessage(&poolHandle->port, 'R', NULL, 0);
 +      pool_flush(&poolHandle->port);
 +
 +      res = pool_recvres(&poolHandle->port);
 +
 +      if (res == POOL_CHECK_SUCCESS)
 +              return true;
 +
 +      return false;
 +}
 +
 +/*
 + * Handle the 'a' (abort transactions) message from a session: read the
 + * optional database and user name, collect the pids of matching backends
 + * (excluding the requesting agent's own pid) and send them back.
 + * database/user_name point into the message buffer; they are only valid
 + * until the next message is read.
 + */
 +static void
 +handle_abort(PoolAgent * agent, StringInfo s)
 +{
 +      int             len;
 +      int        *pids;
 +      const char *database = NULL;
 +      const char *user_name = NULL;
 +
 +      pool_getmessage(&agent->port, s, 0);
 +      len = pq_getmsgint(s, 4);
 +      if (len > 0)
 +              database = pq_getmsgbytes(s, len);
 +
 +      len = pq_getmsgint(s, 4);
 +      if (len > 0)
 +              user_name = pq_getmsgbytes(s, len);
 +
 +      pq_getmsgend(s);
 +
 +      pids = abort_pids(&len, agent->pid, database, user_name);
 +
 +      pool_sendpids(&agent->port, pids, len);
 +      if (pids)
 +              pfree(pids);
 +}
 +
 +/*
 + * handle_connect
 + *      Process a 'c' (CONNECT) message: bind this agent to a backend PID
 + *      and initialize it for the given database/user/pgoptions.
 + */
 +static void
 +handle_connect(PoolAgent * agent, StringInfo s)
 +{
 +      int     len;
 +      const char *database = NULL;
 +      const char *user_name = NULL;
 +      const char *pgoptions = NULL;
 +
 +      pool_getmessage(&agent->port, s, 0);
 +      agent->pid = pq_getmsgint(s, 4);
 +
 +      /*
 +       * Unlike handle_abort(), all three strings are mandatory here.
 +       * NOTE(review): the lengths are not validated and the strings are
 +       * assumed NUL-terminated by the sender -- confirm.
 +       */
 +      len = pq_getmsgint(s, 4);
 +      database = pq_getmsgbytes(s, len);
 +
 +      len = pq_getmsgint(s, 4);
 +      user_name = pq_getmsgbytes(s, len);
 +
 +      len = pq_getmsgint(s, 4);
 +      pgoptions = pq_getmsgbytes(s, len);
 +
 +      /*
 +       * Coordinator pool is not initialized.
 +       * With that it would be impossible to create a Database by default.
 +       */
 +      agent_init(agent, database, user_name, pgoptions);
 +      pq_getmsgend(s);
 +}
 +
 +/*
 + * handle_clean_connection
 + *      Process an 'f' (CLEAN CONNECTION) message: drop pooled connections
 + *      for the listed nodes, optionally restricted to a database/user.
 + */
 +static void
 +handle_clean_connection(PoolAgent * agent, StringInfo s)
 +{
 +      int i, len, res;
 +      int     datanodecount, coordcount;
 +      const char *database = NULL;
 +      const char *user_name = NULL;
 +      List       *nodelist = NIL;
 +
 +      pool_getmessage(&agent->port, s, 0);
 +
 +      /* It is possible to clean up only datanode connections */
 +      datanodecount = pq_getmsgint(s, 4);
 +      for (i = 0; i < datanodecount; i++)
 +      {
 +              /*
 +               * Translate index to Oid.
 +               * NOTE(review): the index comes off the wire and is not checked
 +               * against num_dn_connections before indexing dn_conn_oids; a
 +               * malformed message would read out of bounds -- confirm the
 +               * sender is trusted.
 +               */
 +              int index = pq_getmsgint(s, 4);
 +              Oid node = agent->dn_conn_oids[index];
 +              nodelist = lappend_oid(nodelist, node);
 +      }
 +
 +      /* It is possible to clean up only coordinator connections */
 +      coordcount = pq_getmsgint(s, 4);
 +      for (i = 0; i < coordcount; i++)
 +      {
 +              /* Translate index to Oid (same unchecked-index caveat as above) */
 +              int index = pq_getmsgint(s, 4);
 +              Oid node = agent->coord_conn_oids[index];
 +              nodelist = lappend_oid(nodelist, node);
 +      }
 +
 +      /* Optional database name filter */
 +      len = pq_getmsgint(s, 4);
 +      if (len > 0)
 +              database = pq_getmsgbytes(s, len);
 +
 +      /* Optional user name filter */
 +      len = pq_getmsgint(s, 4);
 +      if (len > 0)
 +              user_name = pq_getmsgbytes(s, len);
 +
 +      pq_getmsgend(s);
 +
 +      /* Clean up connections here */
 +      res = clean_connection(nodelist, database, user_name);
 +
 +      list_free(nodelist);
 +
 +      /* Send success result */
 +      pool_sendres(&agent->port, res);
 +}
 +
 +/*
 + * handle_get_connections
 + *      Process a 'g' (GET CONNECTIONS) message: acquire pooled connections
 + *      for the requested node indexes and send their socket fds plus the
 + *      remote backend PIDs back to the requesting backend.
 + */
 +static void
 +handle_get_connections(PoolAgent * agent, StringInfo s)
 +{
 +      int             i;
 +      int        *fds, *pids = NULL;
 +      int             datanodecount, coordcount;
 +      List   *datanodelist = NIL;
 +      List   *coordlist = NIL;
 +
 +      /*
 +       * Length of message is caused by:
 +       * - Message header = 4bytes
 +       * - List of Datanodes = NumPoolDataNodes * 4bytes (max)
 +       * - List of Coordinators = NumPoolCoords * 4bytes (max)
 +       * - Number of Datanodes sent = 4bytes
 +       * - Number of Coordinators sent = 4bytes
 +       * It is better to send in a same message the list of Co and Dn at the same
 +       * time, this permits to reduce interactions between postmaster and pooler
 +       */
 +      pool_getmessage(&agent->port, s, 4 * agent->num_dn_connections + 4 * agent->num_coord_connections + 12);
 +
 +      datanodecount = pq_getmsgint(s, 4);
 +      for (i = 0; i < datanodecount; i++)
 +              datanodelist = lappend_int(datanodelist, pq_getmsgint(s, 4));
 +
 +      /* It is possible that no Coordinators are involved in the transaction */
 +      coordcount = pq_getmsgint(s, 4);
 +      for (i = 0; i < coordcount; i++)
 +              coordlist = lappend_int(coordlist, pq_getmsgint(s, 4));
 +
 +      pq_getmsgend(s);
 +
 +      Assert(datanodecount >= 0 && coordcount >= 0);
 +
 +      /*
 +       * In case of error agent_acquire_connections will log the error and
 +       * return NULL.
 +       */
 +      fds = agent_acquire_connections(agent, datanodelist, coordlist, &pids);
 +
 +      list_free(datanodelist);
 +      list_free(coordlist);
 +
 +      /*
 +       * On failure fds is NULL and zero descriptors are sent.
 +       * NOTE(review): the PID message below assumes pids is still NULL (or
 +       * valid) whenever fds is NULL; if agent_acquire_connections() ever
 +       * fails after assigning *pids, stale PIDs would be transmitted --
 +       * verify.
 +       */
 +      pool_sendfds(&agent->port, fds, fds ? datanodecount + coordcount : 0);
 +      if (fds)
 +              pfree(fds);
 +
 +      /*
 +       * Also send the PIDs of the remote backend processes serving
 +       * these connections
 +       */
 +      pool_sendpids(&agent->port, pids, pids ? datanodecount + coordcount : 0);
 +      if (pids)
 +              pfree(pids);
 +}
 +
 +/*
 + * handle_query_cancel
 + *      Process an 'h' message: cancel the query in progress on the listed
 + *      Datanode and Coordinator connections of this agent.
 + */
 +static void
 +handle_query_cancel(PoolAgent * agent, StringInfo s)
 +{
 +      int             i;
 +      int             datanodecount, coordcount;
 +      List   *datanodelist = NIL;
 +      List   *coordlist = NIL;
 +
 +      /*
 +       * Length of message is caused by:
 +       * - Message header = 4bytes
 +       * - List of Datanodes = NumPoolDataNodes * 4bytes (max)
 +       * - List of Coordinators = NumPoolCoords * 4bytes (max)
 +       * - Number of Datanodes sent = 4bytes
 +       * - Number of Coordinators sent = 4bytes
 +       */
 +      pool_getmessage(&agent->port, s, 4 * agent->num_dn_connections + 4 * agent->num_coord_connections + 12);
 +
 +      datanodecount = pq_getmsgint(s, 4);
 +      for (i = 0; i < datanodecount; i++)
 +              datanodelist = lappend_int(datanodelist, pq_getmsgint(s, 4));
 +
 +      coordcount = pq_getmsgint(s, 4);
 +      /* It is possible that no Coordinators are involved in the transaction */
 +      for (i = 0; i < coordcount; i++)
 +              coordlist = lappend_int(coordlist, pq_getmsgint(s, 4));
 +
 +      pq_getmsgend(s);
 +
 +      cancel_query_on_connections(agent, datanodelist, coordlist);
 +      list_free(datanodelist);
 +      list_free(coordlist);
 +
 +      /*
 +       * Send success result.
 +       * NOTE(review): the count returned by cancel_query_on_connections()
 +       * is discarded, so QUERY_CANCEL_COMPLETED is reported even when no
 +       * connection was actually cancelled -- confirm this is intended.
 +       */
 +      pool_sendres(&agent->port, QUERY_CANCEL_COMPLETED);
 +}
 +
 +/*
 + * agent_handle_input
 + *      Read and dispatch all pending protocol messages from one agent.
 + *
 + * Loops until the input buffer holds no more complete data; each iteration
 + * reads a one-byte message type and dispatches on it.  On EOF or protocol
 + * violation the agent is destroyed and we return immediately.
 + */
 +static void
 +agent_handle_input(PoolAgent * agent, StringInfo s)
 +{
 +      /* read byte from the buffer (and recv if empty) */
 +      int     qtype = pool_getbyte(&agent->port);
 +
 +      /*
 +       * We can have multiple messages, so handle them all
 +       */
 +      for (;;)
 +      {
 +              /*
 +               * During a pool cleaning, Abort, Connect and Get Connections messages
 +               * are not allowed on pooler side.
 +               * It avoids to have new backends taking connections
 +               * while remaining transactions are aborted during FORCE and then
 +               * Pools are being shrinked.
 +               */
 +              /*
 +               * NOTE(review): this check only emits a WARNING -- the command is
 +               * still dispatched in the switch below.  Confirm whether a locked
 +               * pool should actually reject these messages instead.
 +               */
 +              if (is_pool_locked && (qtype == 'a' || qtype == 'c' || qtype == 'g'))
 +                      elog(WARNING,"Pool operation cannot run during pool lock");
 +
 +              elog(DEBUG1, "Pooler is handling command %c from %d", (char) qtype, agent->pid);
 +
 +              switch (qtype)
 +              {
 +                      case 'a':                       /* ABORT */
 +                              handle_abort(agent, s);
 +                              break;
 +                      case 'c':                       /* CONNECT */
 +                              handle_connect(agent, s);
 +                              break;
 +                      case 'd':                       /* DISCONNECT */
 +                              pool_getmessage(&agent->port, s, 4);
 +                              agent_destroy(agent);
 +                              pq_getmsgend(s);
 +                              break;
 +                      case 'f':                       /* CLEAN CONNECTION */
 +                              handle_clean_connection(agent, s);
 +                              break;
 +                      case 'g':                       /* GET CONNECTIONS */
 +                              handle_get_connections(agent, s);
 +                              break;
 +
 +                      case 'h':                       /* Cancel SQL Command in progress on specified connections */
 +                              handle_query_cancel(agent, s);
 +                              break;
 +                      case 'o':                       /* Lock/unlock pooler */
 +                              pool_getmessage(&agent->port, s, 8);
 +                              is_pool_locked = pq_getmsgint(s, 4);
 +                              pq_getmsgend(s);
 +                              break;
 +                      case 'p':                       /* Reload connection info */
 +                              pool_getmessage(&agent->port, s, 4);
 +                              pq_getmsgend(s);
 +
 +                              /* First update all the pools */
 +                              reload_database_pools(agent);
 +                              break;
 +                      case 'R':                       /* Refresh connection info */
 +                              /*
 +                               * Less destructive than reload: only pools whose
 +                               * node definition changed are dropped; a result
 +                               * code is reported back to the requester.
 +                               */
 +                              pool_getmessage(&agent->port, s, 4);
 +                              pq_getmsgend(s);
 +
 +                              pool_sendres(&agent->port, refresh_database_pools(agent));
 +                              break;
 +                      case 'P':                       /* Ping connection info */
 +                              /*
 +                               * Ping unhealthy nodes in the pools. If any of the
 +                               * nodes come up, update SHARED memory to
 +                               * indicate the same.
 +                               */
 +                              pool_getmessage(&agent->port, s, 4);
 +                              pq_getmsgend(s);
 +
 +                              /* Ping all the pools */
 +                              PoolPingNodes();
 +
 +                              break;
 +                      case 'q':                       /* Check connection info consistency */
 +                              pool_getmessage(&agent->port, s, 4);
 +                              pq_getmsgend(s);
 +
 +                              /* Check cached info consistency */
 +                              pool_sendres(&agent->port, node_info_check(agent));
 +                              break;
 +                      case 'r':                       /* RELEASE CONNECTIONS */
 +                              {
 +                                      bool destroy;
 +
 +                                      pool_getmessage(&agent->port, s, 8);
 +                                      destroy = (bool) pq_getmsgint(s, 4);
 +                                      pq_getmsgend(s);
 +                                      agent_release_connections(agent, destroy);
 +                              }
 +                              break;
 +                      case EOF:                       /* EOF */
 +                              agent_destroy(agent);
 +                              return;
 +                      default:                        /* protocol violation */
 +                              agent_destroy(agent);
 +                              ereport(WARNING,
 +                                      (errmsg("agent protocol violation, received byte %c", qtype)));
 +                              return;
 +              }
 +
 +              /*
 +               * check if there are more data in the buffer (but don't recv
 +               * additional data), to avoid reading from a closed connection
 +               *
 +               * XXX I wonder whether this is correct, because it means we
 +               * won't call agent_destroy() in this case (unlike when handling
 +               * the message in the switch above).
 +               */
 +              if ((qtype = pool_pollbyte(&agent->port)) == EOF)
 +                      break;
 +      }
 +}
 +
 +/*
 + * acquire connection
 + */
 +static int *
 +agent_acquire_connections(PoolAgent *agent, List *datanodelist,
 +              List *coordlist, int **pids)
 +{
 +      int                     i;
 +      int                *result;
 +      ListCell   *nodelist_item;
 +      MemoryContext oldcontext;
 +
 +      Assert(agent);
 +
 +      /* Check if pooler can accept those requests */
 +      if (list_length(datanodelist) > agent->num_dn_connections ||
 +                      list_length(coordlist) > agent->num_coord_connections)
 +      {
 +              elog(LOG, "agent_acquire_connections called with invalid arguments -"
 +                              "list_length(datanodelist) %d, num_dn_connections %d,"
 +                              "list_length(coordlist) %d, num_coord_connections %d",
 +                              list_length(datanodelist), agent->num_dn_connections,
 +                              list_length(coordlist), agent->num_coord_connections);
 +              return NULL;
 +      }
 +
 +      /*
 +       * Allocate memory
 +       * File descriptors of Datanodes and Coordinators are saved in the same array,
 +       * This array will be sent back to the postmaster.
 +       * It has a length equal to the length of the Datanode list
 +       * plus the length of the Coordinator list.
 +       * Datanode fds are saved first, then Coordinator fds are saved.
 +       */
 +      result = (int *) palloc((list_length(datanodelist) + list_length(coordlist)) * sizeof(int));
 +      if (result == NULL)
 +      {
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_OUT_OF_MEMORY),
 +                               errmsg("out of memory")));
 +      }
 +
 +      *pids = (int *) palloc((list_length(datanodelist) + list_length(coordlist)) * sizeof(int));
 +      if (*pids == NULL)
 +      {
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_OUT_OF_MEMORY),
 +                               errmsg("out of memory")));
 +      }
 +
 +      /*
 +       * There are possible memory allocations in the core pooler, we want
 +       * these allocations in the contect of the database pool
 +       */
 +      oldcontext = MemoryContextSwitchTo(agent->pool->mcxt);
 +
 +
 +      /* Initialize result */
 +      i = 0;
 +      /* Save in array fds of Datanodes first */
 +      foreach(nodelist_item, datanodelist)
 +      {
 +              int                     node = lfirst_int(nodelist_item);
 +
 +              /* Acquire from the pool if none */
 +              if (agent->dn_connections[node] == NULL)
 +              {
 +                      PGXCNodePoolSlot *slot = acquire_connection(agent->pool,
 +                                                                                                              agent->dn_conn_oids[node]);
 +
 +                      /* Handle failure */
 +                      if (slot == NULL)
 +                      {
 +                              pfree(result);
 +                              MemoryContextSwitchTo(oldcontext);
 +                              elog(LOG, "Pooler could not open a connection to node %d",
 +                                              agent->dn_conn_oids[node]);
 +                              return NULL;
 +                      }
 +
 +                      /* Store in the descriptor */
 +                      agent->dn_connections[node] = slot;
 +
 +                      /*
 +                       * Update newly-acquired slot with session parameters.
 +                       * Local parameters are fired only once BEGIN has been launched on
 +                       * remote nodes.
 +                       */
 +              }
 +
 +              result[i] = PQsocket((PGconn *) agent->dn_connections[node]->conn);
 +              (*pids)[i++] = ((PGconn *) agent->dn_connections[node]->conn)->be_pid;
 +      }
 +
 +      /* Save then in the array fds for Coordinators */
 +      foreach(nodelist_item, coordlist)
 +      {
 +              int                     node = lfirst_int(nodelist_item);
 +
 +              /* Acquire from the pool if none */
 +              if (agent->coord_connections[node] == NULL)
 +              {
 +                      PGXCNodePoolSlot *slot = acquire_connection(agent->pool, agent->coord_conn_oids[node]);
 +
 +                      /* Handle failure */
 +                      if (slot == NULL)
 +                      {
 +                              pfree(result);
 +                              MemoryContextSwitchTo(oldcontext);
 +                              elog(LOG, "Pooler could not open a connection to node %d",
 +                                              agent->coord_conn_oids[node]);
 +                              return NULL;
 +                      }
 +
 +                      /* Store in the descriptor */
 +                      agent->coord_connections[node] = slot;
 +
 +                      /*
 +                       * Update newly-acquired slot with session parameters.
 +                       * Local parameters are fired only once BEGIN has been launched on
 +                       * remote nodes.
 +                       */
 +              }
 +
 +              result[i] = PQsocket((PGconn *) agent->coord_connections[node]->conn);
 +              (*pids)[i++] = ((PGconn *) agent->coord_connections[node]->conn)->be_pid;
 +      }
 +
 +      MemoryContextSwitchTo(oldcontext);
 +
 +      return result;
 +}
 +
 +/*
 + * Cancel query
 + */
 +static int
 +cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist)
 +{
 +      ListCell        *nodelist_item;
 +      char            errbuf[256];
 +      int             nCount;
 +      bool            bRet;
 +
 +      nCount = 0;
 +
 +      if (agent == NULL)
 +              return nCount;
 +
 +      /* Send cancel on Datanodes first */
 +      foreach(nodelist_item, datanodelist)
 +      {
 +              int     node = lfirst_int(nodelist_item);
 +
 +              if(node < 0 || node >= agent->num_dn_connections)
 +                      continue;
 +
 +              if (agent->dn_connections == NULL)
 +                      break;
 +
 +              if (!agent->dn_connections[node])
 +                      continue;
 +
 +              elog(DEBUG1, "Canceling query on connection to remote node %d, remote pid %d",
 +                              agent->dn_conn_oids[node],
 +                              ((PGconn *) agent->dn_connections[node]->conn)->be_pid);
 +              bRet = PQcancel((PGcancel *) agent->dn_connections[node]->xc_cancelConn, errbuf, sizeof(errbuf));
 +              if (bRet != false)
 +              {
 +                      elog(DEBUG1, "Cancelled query on connection to remote node %d, remote pid %d",
 +                                      agent->dn_conn_oids[node],
 +                                      ((PGconn *) agent->dn_connections[node]->conn)->be_pid);
 +                      nCount++;
 +              }
 +      }
 +
 +      /* Send cancel to Coordinators too, e.g. if DDL was in progress */
 +      foreach(nodelist_item, coordlist)
 +      {
 +              int     node = lfirst_int(nodelist_item);
 +
 +              if(node < 0 || node >= agent->num_coord_connections)
 +                      continue;
 +
 +              if (agent->coord_connections == NULL)
 +                      break;
 +
 +              if (!agent->coord_connections[node])
 +                      continue;
 +
 +              elog(DEBUG1, "Canceling query on connection to remote node %d, remote pid %d",
 +                              agent->coord_conn_oids[node],
 +                              ((PGconn *) agent->coord_connections[node]->conn)->be_pid);
 +              bRet = PQcancel((PGcancel *) agent->coord_connections[node]->xc_cancelConn, errbuf, sizeof(errbuf));
 +              if (bRet != false)
 +              {
 +                      elog(DEBUG1, "Cancelled query on connection to remote node %d, remote pid %d",
 +                                      agent->coord_conn_oids[node],
 +                                      ((PGconn *) agent->coord_connections[node]->conn)->be_pid);
 +                      nCount++;
 +              }
 +      }
 +
 +      return nCount;
 +}
 +
 +/*
 + * PoolManagerReleaseConnections
 + *      Return this session's connections back to the pool.
 + *
 + * Builds the 'r' (RELEASE CONNECTIONS) message by hand: 1-byte message
 + * type, 4-byte length word (covering itself plus the payload), 4-byte
 + * force flag.  Fire-and-forget: no response is read from the pooler.
 + */
 +void
 +PoolManagerReleaseConnections(bool force)
 +{
 +      char msgtype = 'r';
 +      int n32;
 +      int msglen = 8;
 +
 +      /* If disconnected from pooler all the connections already released */
 +      if (!poolHandle)
 +              return;
 +
 +      elog(DEBUG1, "Returning connections back to the pool");
 +
 +      /* Message type */
 +      pool_putbytes(&poolHandle->port, &msgtype, 1);
 +
 +      /* Message length */
 +      n32 = htonl(msglen);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +
 +      /* Force flag: nonzero asks the pooler to destroy the connections */
 +      n32 = htonl((int) force);
 +      pool_putbytes(&poolHandle->port, (char *) &n32, 4);
 +      pool_flush(&poolHandle->port);
 +}
 +
 +/*
 + * Cancel Query
 + */
 +void
 +PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list)
 +{
 +      uint32          n32;
 +      /*
 +       * Buffer contains the list of both Coordinator and Datanodes, as well
 +       * as the number of connections
 +       */
 +      uint32          buf[2 + dn_count + co_count];
 +      int             i;
 +
 +      if (poolHandle == NULL)
 +              return;
 +
 +      if (dn_count == 0 && co_count == 0)
 +              return;
 +
 +      if (dn_count != 0 && dn_list == NULL)
 +              return;
 +
 +      if (co_count != 0 && co_list == NULL)
 +              return;
 +
 +      /* Insert the list of Datanodes in buffer */
 +      n32 = htonl((uint32) dn_count);
 +      buf[0] = n32;
 +
 +      for (i = 0; i < dn_count;)
 +      {
 +              n32 = htonl((uint32) dn_list[i++]);
 +              buf[i] = n32;
 +      }
 +
 +      /* Insert the list of Coordinators in buffer */
 +      n32 = htonl((uint32) co_count);
 +      buf[dn_count + 1] = n32;
 +
 +      /* Not necessary to send to pooler a request if there is no Coordinator */
 +      if (co_count != 0)
 +      {
 +              for (i = dn_count + 1; i < (dn_count + co_count + 1);)
 +              {
 +                      n32 = htonl((uint32) co_list[i - (dn_count + 1)]);
 +                      buf[++i] = n32;
 +              }
 +      }
 +      pool_putmessage(&poolHandle->port, 'h', (char *) buf, (2 + dn_count + co_count) * sizeof(uint32));
 +      pool_flush(&poolHandle->port);
 +
 +      /* Receive result message */
 +      if (pool_recvres(&poolHandle->port) != QUERY_CANCEL_COMPLETED)
 +              ereport(WARNING,
 +                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                               errmsg("Query cancel not completed")));
 +}
 +
 +/*
 + * agent_release_connections
 + *      Return all Datanode and Coordinator connections held by this agent
 + *      to the pool, or destroy them when force_destroy is set.
 + *
 + * A non-forced release is skipped while the cluster-wide exclusive lock is
 + * held (presumably to keep connections stable for the locking operation).
 + */
 +static void
 +agent_release_connections(PoolAgent *agent, bool force_destroy)
 +{
 +      MemoryContext oldcontext;
 +      int                     i;
 +
 +      /* Nothing to release if the agent holds no connection arrays */
 +      if (!agent->dn_connections && !agent->coord_connections)
 +              return;
 +      if (!force_destroy && cluster_ex_lock_held)
 +      {
 +              elog(LOG, "Not releasing connection with cluster lock");
 +              return;
 +      }
 +
 +      /*
 +       * There are possible memory allocations in the core pooler, we want
 +       * these allocations in the contect of the database pool
 +       */
 +      oldcontext = MemoryContextSwitchTo(agent->pool->mcxt);
 +
 +      /*
 +       * Remaining connections are assumed to be clean.
 +       * First clean up for Datanodes
 +       */
 +      for (i = 0; i < agent->num_dn_connections; i++)
 +      {
 +              PGXCNodePoolSlot *slot = agent->dn_connections[i];
 +
 +              /*
 +               * Release connection.
 +               * If connection has temporary objects on it, destroy connection slot.
 +               */
 +              if (slot)
 +                      release_connection(agent->pool, slot, agent->dn_conn_oids[i], force_destroy);
 +              agent->dn_connections[i] = NULL;
 +              elog(DEBUG1, "Released connection to node %d", agent->dn_conn_oids[i]);
 +      }
 +      /* Then clean up for Coordinator connections */
 +      for (i = 0; i < agent->num_coord_connections; i++)
 +      {
 +              PGXCNodePoolSlot *slot = agent->coord_connections[i];
 +
 +              /*
 +               * Release connection.
 +               * If connection has temporary objects on it, destroy connection slot.
 +               */
 +              if (slot)
 +                      release_connection(agent->pool, slot, agent->coord_conn_oids[i], force_destroy);
 +              agent->coord_connections[i] = NULL;
 +              elog(DEBUG1, "Released connection to node %d", agent->coord_conn_oids[i]);
 +      }
 +
 +      /*
 +       * Released connections are now in the pool and we may want to close
 +       * them eventually. Update the oldest_idle value to reflect the latest
 +       * last access time if not already updated..
 +       */
 +      if (!force_destroy && agent->pool->oldest_idle == (time_t) 0)
 +              agent->pool->oldest_idle = time(NULL);
 +
 +      MemoryContextSwitchTo(oldcontext);
 +}
 +
 +/*
 + * Create new empty pool for a database.
 + * By default Database Pools have a size null so as to avoid interactions
 + * between PGXC nodes in the cluster (Co/Co, Dn/Dn and Co/Dn).
 + * Pool is increased at the first GET_CONNECTION message received.
 + * Returns POOL_OK if operation succeed POOL_FAIL in case of OutOfMemory
 + * error and POOL_WEXIST if poll for this database already exist.
 + */
 +static DatabasePool *
 +create_database_pool(const char *database, const char *user_name, const char *pgoptions)
 +{
 +      MemoryContext   oldcontext;
 +      MemoryContext   dbcontext;
 +      DatabasePool   *databasePool;
 +      HASHCTL                 hinfo;
 +
 +      elog(DEBUG1, "Creating a connection pool for database %s, user %s,"
 +                      " with pgoptions %s", database, user_name, pgoptions);
 +
 +      dbcontext = AllocSetContextCreate(PoolerCoreContext,
 +                                                                        "DB Context",
 +                                                                        ALLOCSET_DEFAULT_MINSIZE,
 +                                                                        ALLOCSET_DEFAULT_INITSIZE,
 +                                                                        ALLOCSET_DEFAULT_MAXSIZE);
 +      oldcontext = MemoryContextSwitchTo(dbcontext);
 +      /* Allocate memory */
 +      databasePool = (DatabasePool *) palloc(sizeof(DatabasePool));
 +      if (!databasePool)
 +      {
 +              /* out of memory */
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_OUT_OF_MEMORY),
 +                               errmsg("out of memory")));
 +              return NULL;
 +      }
 +
 +      databasePool->mcxt = dbcontext;
 +       /* Copy the database name */
 +      databasePool->database = pstrdup(database);
 +       /* Copy the user name */
 +      databasePool->user_name = pstrdup(user_name);
 +      /* Reset the oldest_idle value */
 +      databasePool->oldest_idle = (time_t) 0;
 +       /* Copy the pgoptions */
 +      databasePool->pgoptions = pstrdup(pgoptions);
 +
 +      if (!databasePool->database)
 +      {
 +              /* out of memory */
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_OUT_OF_MEMORY),
 +                               errmsg("out of memory")));
 +              pfree(databasePool);
 +              return NULL;
 +      }
 +
 +      /* Init next reference */
 +      databasePool->next = NULL;
 +
 +      /* Init node hashtable */
 +      MemSet(&hinfo, 0, sizeof(hinfo));
 +
 +      hinfo.keysize = sizeof(Oid);
 +      hinfo.entrysize = sizeof(PGXCNodePool);
 +      hinfo.hcxt = dbcontext;
 +
 +      databasePool->nodePools = hash_create("Node Pool", MaxDataNodes + MaxCoords,
 +                                                                                &hinfo,
 +                                                                                HASH_ELEM | HASH_CONTEXT | HASH_BLOBS);
 +
 +      MemoryContextSwitchTo(oldcontext);
 +
 +      /* Insert into the list */
 +      insert_database_pool(databasePool);
 +
 +      return databasePool;
 +}
 +
 +
 +/*
 + * Destroy the pool and free memory
 + */
 +static int
 +destroy_database_pool(const char *database, const char *user_name)
 +{
 +      DatabasePool *databasePool;
 +
 +      elog(DEBUG1, "Destroy a connection pool to database %s, user %s",
 +                      database, user_name);
 +
 +      /* Delete from the list */
 +      databasePool = remove_database_pool(database, user_name);
 +      if (databasePool)
 +      {
 +              HASH_SEQ_STATUS hseq_status;
 +              PGXCNodePool   *nodePool;
 +
 +              hash_seq_init(&hseq_status, databasePool->nodePools);
 +              while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)))
 +              {
 +                      destroy_node_pool(nodePool);
 +              }
 +              /* free allocated memory */
 +              MemoryContextDelete(databasePool->mcxt);
 +              return 1;
 +      }
 +      return 0;
 +}
 +
 +
 +/*
 + * Insert new database pool to the list
 + */
 +static void
 +insert_database_pool(DatabasePool *databasePool)
 +{
 +      Assert(databasePool);
 +
 +      /* Reference existing list or null the tail */
 +      if (databasePools)
 +              databasePool->next = databasePools;
 +      else
 +              databasePool->next = NULL;
 +
 +      /* Update head pointer */
 +      databasePools = databasePool;
 +}
 +
 +/*
 + * reload_database_pools
 + *    rebuild connection information for all database pools
 + *
 + * A database pool is reloaded as follows for each remote node:
 + *
 + * - node pool is deleted if the node has been deleted from catalog.
 + *   Subsequently all its connections are dropped.
 + *
 + * - node pool is deleted if its port or host information is changed.
 + *   Subsequently all its connections are dropped.
 + *
 + * - node pool is kept unchanged with existing connection information
 + *   is not changed. However its index position in node pool is changed
 + *   according to the alphabetical order of the node name in new
 + *   cluster configuration.
 + *
 + * Backend sessions are responsible to reconnect to the pooler to update
 + * their agent with newest connection information.
 + *
 + * The session invocating connection information reload is reconnected
 + * and uploaded automatically after database pool reload. Other server
 + * sessions are signaled to reconnect to pooler and update their
 + * connection information separately.
 + *
 + * During reload process done internally on pooler, pooler is locked
 + * to forbid new connection requests.
 + */
 +static void
 +reload_database_pools(PoolAgent *agent)
 +{
 +      DatabasePool *databasePool;
 +
 +      elog(DEBUG1, "Reloading database pools");
 +
 +      /*
 +       * Release node connections if any held. It is not guaranteed client session
 +       * does the same so don't ever try to return them to pool and reuse
 +       */
 +      agent_release_connections(agent, true);
 +
 +      /* Forget previously allocated node info */
 +      MemoryContextReset(agent->mcxt);
 +
 +      /*
 +       * and allocate new.
 +       * NOTE(review): the palloc0 calls below allocate in
 +       * CurrentMemoryContext, which is assumed to be (or outlive)
 +       * agent->mcxt after the reset above -- verify against the caller.
 +       */
 +      PgxcNodeGetOids(&agent->coord_conn_oids, &agent->dn_conn_oids,
 +                                      &agent->num_coord_connections, &agent->num_dn_connections, false);
 +
 +      agent->coord_connections = (PGXCNodePoolSlot **)
 +                      palloc0(agent->num_coord_connections * sizeof(PGXCNodePoolSlot *));
 +      agent->dn_connections = (PGXCNodePoolSlot **)
 +                      palloc0(agent->num_dn_connections * sizeof(PGXCNodePoolSlot *));
 +
 +      /*
 +       * Scan the list and destroy any altered pool. They will be recreated
 +       * upon subsequent connection acquisition.
 +       */
 +      databasePool = databasePools;
 +      while (databasePool)
 +      {
 +              /* Update each database pool slot with new connection information */
 +              HASH_SEQ_STATUS hseq_status;
 +              PGXCNodePool   *nodePool;
 +
 +              hash_seq_init(&hseq_status, databasePool->nodePools);
 +              while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)))
 +              {
 +                      /* NULL connstr means the node no longer exists in the catalog */
 +                      char *connstr_chk = build_node_conn_str(nodePool->nodeoid, databasePool);
 +
 +                      if (connstr_chk == NULL || strcmp(connstr_chk, nodePool->connstr))
 +                      {
 +                              /*
 +                               * Node has been removed or altered.
 +                               * Removing the current entry while a dynahash
 +                               * sequential scan is in progress is supported.
 +                               */
 +                              destroy_node_pool(nodePool);
 +                              hash_search(databasePool->nodePools, &nodePool->nodeoid,
 +                                                      HASH_REMOVE, NULL);
 +                      }
 +
 +                      if (connstr_chk)
 +                              pfree(connstr_chk);
 +              }
 +
 +              databasePool = databasePool->next;
 +      }
 +}
 +
 +/*
 + * refresh_database_pools
 + *            refresh information for all database pools
 + *
 + * Connection information refresh concerns all the database pools.
 + * A database pool is refreshed as follows for each remote node:
 + *
 + * - node pool is deleted if its port or host information is changed.
 + *   Subsequently all its connections are dropped.
 + *
 + * If any other type of activity is found, we error out.
 + *
 + * XXX I don't see any cases that would error out. Isn't the comment
 + * simply obsolete?
 + */
 +static int
 +refresh_database_pools(PoolAgent *agent)
 +{
 +      DatabasePool *databasePool;
 +      Oid                        *coOids;
 +      Oid                        *dnOids;
 +      int                             numCo;
 +      int                             numDn;
 +      int                     res = POOL_REFRESH_SUCCESS;
 +
 +      elog(LOG, "Refreshing database pools");
 +
 +      /*
 +       * re-check if agent's node information matches current contents of the
 +       * shared memory table.
 +       */
 +      PgxcNodeGetOids(&coOids, &dnOids, &numCo, &numDn, false);
 +
 +      if (agent->num_coord_connections != numCo ||
 +                      agent->num_dn_connections != numDn ||
 +                      memcmp(agent->coord_conn_oids, coOids, numCo * sizeof(Oid)) ||
 +                      memcmp(agent->dn_conn_oids, dnOids, numDn * sizeof(Oid)))
 +              res = POOL_REFRESH_FAILED;
 +
 +      /* Release palloc'ed memory */
 +      pfree(coOids);
 +      pfree(dnOids);
 +
 +      /*
 +       * Scan the list and destroy any altered pool. They will be recreated
 +       * upon subsequent connection acquisition.
 +       */
 +      databasePool = databasePools;
 +      while (res == POOL_REFRESH_SUCCESS && databasePool)
 +      {
 +              HASH_SEQ_STATUS hseq_status;
 +              PGXCNodePool   *nodePool;
 +
 +              hash_seq_init(&hseq_status, databasePool->nodePools);
 +              while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)))
 +              {
 +                      char *connstr_chk = build_node_conn_str(nodePool->nodeoid, databasePool);
 +
 +                      /*
 +                       * Since we re-checked the numbers above, we should not get
 +                       * the case of an ADDED or a DELETED node here..
 +                       */
 +                      if (connstr_chk == NULL)
 +                      {
 +                              elog(LOG, "Found a deleted node (%u)", nodePool->nodeoid);
 +                              hash_seq_term(&hseq_status);
 +                              res = POOL_REFRESH_FAILED;
 +                              break;
 +                      }
 +
 +                      if (strcmp(connstr_chk, nodePool->connstr))
 +                      {
 +                              elog(LOG, "Found an altered node (%u)", nodePool->nodeoid);
 +                              /*
 +                               * Node has been altered. First remove
 +                               * all references to this node from ALL the
 +                               * agents before destroying it..
 +                               */
 +                              if (!remove_all_agent_references(nodePool->nodeoid))
 +                              {
 +                                      res = POOL_REFRESH_FAILED;
 +                                      break;
 +                              }
 +
 +                              destroy_node_pool(nodePool);
 +                              hash_search(databasePool->nodePools, &nodePool->nodeoid,
 +                                                      HASH_REMOVE, NULL);
 +                      }
 +
 +                      if (connstr_chk)
 +                              pfree(connstr_chk);
 +              }
 +
 +              databasePool = databasePool->next;
 +      }
 +      return res;
 +}
 +
 +static bool
 +remove_all_agent_references(Oid nodeoid)
 +{
 +      int i, j;
 +      bool res = true;
 +
 +      /*
 +       * Identify if it's a coordinator or datanode first
 +       * and get its index
 +       */
 +      for (i = 1; i <= agentCount; i++)
 +      {
 +              bool found = false;
 +
 +              PoolAgent *agent = poolAgents[i - 1];
 +              for (j = 0; j < agent->num_dn_connections; j++)
 +              {
 +                      if (agent->dn_conn_oids[j] == nodeoid)
 +                      {
 +                              found = true;
 +                              break;
 +                      }
 +              }
 +              if (found)
 +              {
 +                      PGXCNodePoolSlot *slot = agent->dn_connections[j];
 +                      if (slot)
 +                              release_connection(agent->pool, slot, agent->dn_conn_oids[j], false);
 +                      agent->dn_connections[j] = NULL;
 +              }
 +              else
 +              {
 +                      for (j = 0; j < agent->num_coord_connections; j++)
 +                      {
 +                              if (agent->coord_conn_oids[j] == nodeoid)
 +                              {
 +                                      found = true;
 +                                      break;
 +                              }
 +                      }
 +                      if (found)
 +                      {
 +                              PGXCNodePoolSlot *slot = agent->coord_connections[j];
 +                              if (slot)
 +                                      release_connection(agent->pool, slot, agent->coord_conn_oids[j], true);
 +                              agent->coord_connections[j] = NULL;
 +                      }
 +                      else
 +                      {
 +                              elog(LOG, "Node not found! (%u)", nodeoid);
 +                              res = false;
 +                      }
 +              }
 +      }
 +      return res;
 +}
 +
 +/*
 + * Find pool for specified database and username in the list
 + */
 +static DatabasePool *
 +find_database_pool(const char *database, const char *user_name, const char *pgoptions)
 +{
 +      DatabasePool *databasePool;
 +
 +      /* Scan the list */
 +      databasePool = databasePools;
 +      while (databasePool)
 +      {
 +              if (strcmp(database, databasePool->database) == 0 &&
 +                      strcmp(user_name, databasePool->user_name) == 0 &&
 +                      strcmp(pgoptions, databasePool->pgoptions) == 0)
 +                      break;
 +              databasePool = databasePool->next;
 +      }
 +      return databasePool;
 +}
 +
 +
 +/*
 + * Remove pool for specified database from the list
 + */
 +static DatabasePool *
 +remove_database_pool(const char *database, const char *user_name)
 +{
 +      DatabasePool *databasePool,
 +                         *prev;
 +
 +      /* Scan the list */
 +      databasePool = databasePools;
 +      prev = NULL;
 +      while (databasePool)
 +      {
 +
 +              /* if match break the loop and return */
 +              if (strcmp(database, databasePool->database) == 0 &&
 +                      strcmp(user_name, databasePool->user_name) == 0)
 +                      break;
 +              prev = databasePool;
 +              databasePool = databasePool->next;
 +      }
 +
 +      /* if found */
 +      if (databasePool)
 +      {
 +
 +              /* Remove entry from chain or update head */
 +              if (prev)
 +                      prev->next = databasePool->next;
 +              else
 +                      databasePools = databasePool->next;
 +
 +
 +              databasePool->next = NULL;
 +      }
 +      return databasePool;
 +}
 +
  +/*
  + * acquire_connection
  + *            Get a free, healthy connection slot for the given node from
  + *            the database pool, growing the pool if necessary.
  + *
  + * Returns NULL if no usable connection could be obtained; in that case
  + * the shared health map is updated to mark the node DOWN.  On success
  + * the node is marked UP.
  + */
  +static PGXCNodePoolSlot *
  +acquire_connection(DatabasePool *dbPool, Oid node)
  +{
  +      PGXCNodePool       *nodePool;
  +      PGXCNodePoolSlot   *slot;
  +
  +      Assert(dbPool);
  +
  +      nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_FIND,
  +                                                                                      NULL);
  +
  +      /*
  +       * When a Coordinator pool is initialized by a Coordinator Postmaster,
  +       * it has a NULL size and is below minimum size that is 1
  +       * This is to avoid problems of connections between Coordinators
  +       * when creating or dropping Databases.
  +       */
  +      if (nodePool == NULL || nodePool->freeSize == 0)
  +              nodePool = grow_pool(dbPool, node);
  +
  +      slot = NULL;
  +      /* Check available connections */
  +      while (nodePool && nodePool->freeSize > 0)
  +      {
  +              int                     poll_result;
  +
  +              /* Pop the most recently released slot off the free list */
  +              slot = nodePool->slot[--(nodePool->freeSize)];
  +
  +      retry:
  +              if (PQsocket((PGconn *) slot->conn) > 0)
  +              {
  +                      /*
  +                       * Make sure connection is ok, destroy connection slot if there is a
  +                       * problem.
  +                       */
  +                      poll_result = pqReadReady((PGconn *) slot->conn);
  +
  +                      if (poll_result == 0)
  +                              break;          /* ok, no data */
  +                      else if (poll_result < 0)
  +                      {
  +                              /* Transient poll failure: retry the same slot */
  +                              if (errno == EAGAIN || errno == EINTR)
  +                                      goto retry;
  +
  +                              elog(WARNING, "Error in checking connection, errno = %d", errno);
  +                      }
  +                      else
  +                              /* Unexpected readable data: protocol state is broken */
  +                              elog(WARNING, "Unexpected data on connection, cleaning.");
  +              }
  +
  +              /* Bad slot: discard it and try to refill the pool */
  +              destroy_slot(slot);
  +              slot = NULL;
  +
  +              /* Decrement current max pool size */
  +              (nodePool->size)--;
  +              /* Ensure we are not below minimum size */
  +              nodePool = grow_pool(dbPool, node);
  +      }
  +
  +      if (slot == NULL)
  +      {
  +              elog(WARNING, "can not connect to node %u", node);
  +
  +              /*
  +               * before returning, also update the shared health
  +               * status field to indicate that this node is down
  +               */
  +              if (!PgxcNodeUpdateHealth(node, false))
  +                      elog(WARNING, "Could not update health status of node %u", node);
  +              else
  +                      elog(WARNING, "Health map updated to reflect DOWN node (%u)", node);
  +      }
  +      else
  +              PgxcNodeUpdateHealth(node, true);
  +
  +      return slot;
  +}
 +
 +
 +/*
 + * release connection from specified pool and slot
 + */
 +static void
 +release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
 +                                 Oid node, bool force_destroy)
 +{
 +      PGXCNodePool *nodePool;
 +
 +      Assert(dbPool);
 +      Assert(slot);
 +
 +      nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_FIND,
 +                                                                                      NULL);
 +      if (nodePool == NULL)
 +      {
 +              /*
 +               * The node may be altered or dropped.
 +               * In any case the slot is no longer valid.
 +               */
 +              destroy_slot(slot);
 +              return;
 +      }
 +
 +      /* return or discard */
 +      if (!force_destroy)
 +      {
 +              /* Insert the slot into the array and increase pool size */
 +              nodePool->slot[(nodePool->freeSize)++] = slot;
 +              slot->released = time(NULL);
 +      }
 +      else
 +      {
 +              elog(DEBUG1, "Cleaning up connection from pool %s, closing", nodePool->connstr);
 +              destroy_slot(slot);
 +              /* Decrement pool size */
 +              (nodePool->size)--;
 +              /* Ensure we are not below minimum size */
 +              grow_pool(dbPool, node);
 +      }
 +}
 +
 +
 +/*
 + * Increase database pool size, create new if does not exist
 + */
 +static PGXCNodePool *
 +grow_pool(DatabasePool *dbPool, Oid node)
 +{
 +      /* if error try to release idle connections and try again */
 +      bool                    tryagain = true;
 +      PGXCNodePool   *nodePool;
 +      bool                    found;
 +
 +      Assert(dbPool);
 +
 +      nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node,
 +                                                                                      HASH_ENTER, &found);
 +      nodePool->connstr = build_node_conn_str(node, dbPool);
 +      if (!nodePool->connstr)
 +      {
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INTERNAL_ERROR),
 +                               errmsg("could not build connection string for node %u", node)));
 +      }
 +
 +      if (!found)
 +      {
 +              nodePool->slot = (PGXCNodePoolSlot **) palloc0(MaxPoolSize * sizeof(PGXCNodePoolSlot *));
 +              if (!nodePool->slot)
 +              {
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_OUT_OF_MEMORY),
 +                                       errmsg("out of memory")));
 +              }
 +              nodePool->freeSize = 0;
 +              nodePool->size = 0;
 +      }
 +
 +      while (nodePool->freeSize == 0 && nodePool->size < MaxPoolSize)
 +      {
 +              PGXCNodePoolSlot *slot;
 +
 +              /* Allocate new slot */
 +              slot = (PGXCNodePoolSlot *) palloc(sizeof(PGXCNodePoolSlot));
 +              if (slot == NULL)
 +              {
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_OUT_OF_MEMORY),
 +                                       errmsg("out of memory")));
 +              }
 +
 +              /* If connection fails, be sure that slot is destroyed cleanly */
 +              slot->xc_cancelConn = NULL;
 +
 +              /* Establish connection */
 +              slot->conn = PGXCNodeConnect(nodePool->connstr);
 +              if (!PGXCNodeConnected(slot->conn))
 +              {
 +                      ereport(LOG,
 +                                      (errcode(ERRCODE_CONNECTION_FAILURE),
 +                                       errmsg("failed to connect to node, connection string (%s),"
 +                                                " connection error (%s)",
 +                                                nodePool->connstr,
 +                                                PQerrorMessage((PGconn*) slot->conn))));
 +                      destroy_slot(slot);
 +                      /*
 +                       * If we failed to connect probably number of connections on the
 +                       * target node reached max_connections. Try and release idle
 +                       * connections and try again.
 +                       * We do not want to enter endless loop here and run maintenance
 +                       * procedure only once.
 +                       * It is not safe to run the maintenance procedure if no connections
 +                       * from that pool currently in use - the node pool may be destroyed
 +                       * in that case.
 +                       */
 +                      if (tryagain && nodePool->size > nodePool->freeSize)
 +                      {
 +                              pools_maintenance();
 +                              tryagain = false;
 +                              continue;
 +                      }
 +                      break;
 +              }
 +
 +              slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn);
 +              slot->released = time(NULL);
 +              if (dbPool->oldest_idle == (time_t) 0)
 +                      dbPool->oldest_idle = slot->released;
 +
 +              /* Insert at the end of the pool */
 +              nodePool->slot[(nodePool->freeSize)++] = slot;
 +
 +              /* Increase count of pool size */
 +              (nodePool->size)++;
 +              elog(DEBUG1, "Pooler: increased pool size to %d for pool %s",
 +                       nodePool->size,
 +                       nodePool->connstr);
 +      }
 +      return nodePool;
 +}
 +
 +
 +/*
 + * Destroy pool slot
 + */
 +static void
 +destroy_slot(PGXCNodePoolSlot *slot)
 +{
 +      if (!slot)
 +              return;
 +
 +      PQfreeCancel((PGcancel *)slot->xc_cancelConn);
 +      PGXCNodeClose(slot->conn);
 +      pfree(slot);
 +}
 +
 +
 +/*
 + * Destroy node pool
 + */
 +static void
 +destroy_node_pool(PGXCNodePool *node_pool)
 +{
 +      int                     i;
 +
 +      if (!node_pool)
 +              return;
 +
 +      /*
 +       * At this point all agents using connections from this pool should be already closed
 +       * If this not the connections to the Datanodes assigned to them remain open, this will
 +       * consume Datanode resources.
 +       */
 +      elog(DEBUG1, "About to destroy node pool %s, current size is %d, %d connections are in use",
 +               node_pool->connstr, node_pool->freeSize, node_pool->size - node_pool->freeSize);
 +      if (node_pool->connstr)
 +              pfree(node_pool->connstr);
 +
 +      if (node_pool->slot)
 +      {
 +              for (i = 0; i < node_pool->freeSize; i++)
 +                      destroy_slot(node_pool->slot[i]);
 +              pfree(node_pool->slot);
 +      }
 +}
 +
 +
  +/*
  + * PoolerLoop
  + *            Main handling loop of the pooler process.
  + *
  + * Sets up the Unix-domain listen socket(s), then loops forever: polls
  + * the listen socket and every agent socket, dispatches agent requests,
  + * accepts new agents, and runs periodic pool maintenance.  Exits the
  + * process on shutdown request or when the postmaster dies.
  + */
  +static void
  +PoolerLoop(void)
  +{
  +      StringInfoData  input_message;
  +      time_t                  last_maintenance = (time_t) 0;
  +      int                             maintenance_timeout;
  +      struct pollfd   *pool_fd;
  +
  +#ifdef HAVE_UNIX_SOCKETS
  +      if (Unix_socket_directories)
  +      {
  +              char       *rawstring;
  +              List       *elemlist;
  +              ListCell   *l;
  +              int                     success = 0;
  +
  +              /* Need a modifiable copy of Unix_socket_directories */
  +              rawstring = pstrdup(Unix_socket_directories);
  +
  +              /* Parse string into list of directories */
  +              if (!SplitDirectoriesString(rawstring, ',', &elemlist))
  +              {
  +                      /* syntax error in list */
  +                      ereport(FATAL,
  +                                      (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  +                                       errmsg("invalid list syntax in parameter \"%s\"",
  +                                                      "unix_socket_directories")));
  +              }
  +
  +              /* Try to listen in every configured socket directory */
  +              foreach(l, elemlist)
  +              {
  +                      char       *socketdir = (char *) lfirst(l);
  +                      int                     saved_errno;
  +
  +                      /* Connect to the pooler */
  +                      server_fd = pool_listen(PoolerPort, socketdir);
  +                      if (server_fd < 0)
  +                      {
  +                              saved_errno = errno;
  +                              ereport(WARNING,
  +                                              (errmsg("could not create Unix-domain socket in directory \"%s\", errno %d, server_fd %d",
  +                                                              socketdir, saved_errno, server_fd)));
  +                      }
  +                      else
  +                      {
  +                              success++;
  +                      }
  +              }
  +
  +              /* Fail hard only if no directory at all could be listened on */
  +              if (!success && elemlist != NIL)
  +                      ereport(ERROR,
  +                                      (errmsg("failed to start listening on Unix-domain socket for pooler: %m")));
  +
  +              list_free_deep(elemlist);
  +              pfree(rawstring);
  +      }
  +#endif
  +
  +      /* One pollfd for the listen socket plus one per possible agent */
  +      pool_fd = (struct pollfd *) palloc((MaxConnections + 1) * sizeof(struct pollfd));
  +
  +      if (server_fd == -1)
  +      {
  +              /* log error */
  +              return;
  +      }
  +
  +      initStringInfo(&input_message);
  +
  +      /* Slot 0 always watches the listen socket for new agents */
  +      pool_fd[0].fd = server_fd;
  +      pool_fd[0].events = POLLIN;
  +
  +      for (;;)
  +      {
  +
  +              int                     retval;
  +              int                     i;
  +
  +              /*
  +               * Emergency bailout if postmaster has died.  This is to avoid the
  +               * necessity for manual cleanup of all postmaster children.
  +               */
  +              if (!PostmasterIsAlive())
  +                      exit(1);
  +
  +              /* watch for incoming messages on every live agent socket */
  +              for (i = 1; i <= agentCount; i++)
  +              {
  +                      PoolAgent *agent = poolAgents[i - 1];
  +                      int sockfd = Socket(agent->port);
  +                      pool_fd[i].fd = sockfd;
  +                      pool_fd[i].events = POLLIN;
  +              }
  +
  +              if (PoolMaintenanceTimeout > 0)
  +              {
  +                      int                             timeout_val;
  +                      double                  timediff;
  +
  +                      /*
  +                       * Decide the timeout value based on when the last
  +                       * maintenance activity was carried out. If the last
  +                       * maintenance was done quite a while ago schedule the select
  +                       * with no timeout. It will serve any incoming activity
  +                       * and if there's none it will cause the maintenance
  +                       * to be scheduled as soon as possible
  +                       */
  +                      timediff = difftime(time(NULL), last_maintenance);
  +
  +                      if (timediff > PoolMaintenanceTimeout)
  +                              timeout_val = 0;
  +                      else
  +                              timeout_val = PoolMaintenanceTimeout - rint(timediff);
  +
  +                      /* poll() takes milliseconds */
  +                      maintenance_timeout = timeout_val * 1000;
  +              }
  +              else
  +                      maintenance_timeout = -1;       /* no periodic maintenance */
  +              /*
  +               * Emergency bailout if postmaster has died.  This is to avoid the
  +               * necessity for manual cleanup of all postmaster children.
  +               */
  +              if (!PostmasterIsAlive())
  +                      exit(1);
  +
  +              /*
  +               * Process any requests or signals received recently.
  +               */
  +              if (got_SIGHUP)
  +              {
  +                      got_SIGHUP = false;
  +                      ProcessConfigFile(PGC_SIGHUP);
  +              }
  +
  +              if (shutdown_requested)
  +              {
  +                      /*
  +                       * Graceful shutdown: destroy all agents (iterating backwards,
  +                       * since destroyed agents are removed from the array and the
  +                       * trailing entries are shifted down), then all database pools.
  +                       */
  +                      for (i = agentCount - 1; agentCount > 0 && i >= 0; i--)
  +                      {
  +                              PoolAgent  *agent = poolAgents[i];
  +                              agent_destroy(agent);
  +                      }
  +
  +                      while (databasePools)
  +                              if (destroy_database_pool(databasePools->database,
  +                                                                                databasePools->user_name) == 0)
  +                                      break;
  +
  +                      close(server_fd);
  +                      exit(0);
  +              }
  +
  +              /* wait for event */
  +              retval = poll(pool_fd, agentCount + 1, maintenance_timeout);
  +              if (retval < 0)
  +              {
  +                      if (errno == EINTR || errno == EAGAIN)
  +                              continue;
  +                      elog(FATAL, "poll returned with error %d", retval);
  +              }
  +
  +              if (retval > 0)
  +              {
  +                      /*
  +                       * Agent may be removed from the array while processing
  +                       * and trailing items are shifted, so scroll downward
  +                       * to avoid problem
  +                       */
  +                      for (i = agentCount - 1; agentCount > 0 && i >= 0; i--)
  +                      {
  +                              PoolAgent *agent = poolAgents[i];
  +                              int sockfd = Socket(agent->port);
  +
  +                              /* Re-check the fd matches: the array may have shifted */
  +                              if ((sockfd == pool_fd[i + 1].fd) &&
  +                                              (pool_fd[i + 1].revents & POLLIN))
  +                                      agent_handle_input(agent, &input_message);
  +                      }
  +
  +                      /* New client connecting to the pooler? */
  +                      if (pool_fd[0].revents & POLLIN)
  +                              agent_create();
  +              }
  +              else if (retval == 0)
  +              {
  +                      /* maintenance timeout: shrink idle pools and ping nodes */
  +                      pools_maintenance();
  +                      PoolPingNodes();
  +                      last_maintenance = time(NULL);
  +              }
  +      }
  +}
 +
 +/*
 + * Clean Connection in all Database Pools for given Datanode and Coordinator list
 + */
 +int
 +clean_connection(List *node_discard, const char *database, const char *user_name)
 +{
 +      DatabasePool *databasePool;
 +      int                     res = CLEAN_CONNECTION_COMPLETED;
 +
 +      databasePool = databasePools;
 +
 +      while (databasePool)
 +      {
 +              ListCell *lc;
 +
 +              if ((database && strcmp(database, databasePool->database)) ||
 +                              (user_name && strcmp(user_name, databasePool->user_name)))
 +              {
 +                      /* The pool does not match to request, skip */
 +                      databasePool = databasePool->next;
 +                      continue;
 +              }
 +
 +              /*
 +               * Clean each requested node pool
 +               */
 +              foreach(lc, node_discard)
 +              {
 +                      PGXCNodePool *nodePool;
 +                      Oid node = lfirst_oid(lc);
 +
 +                      nodePool = hash_search(databasePool->nodePools, &node, HASH_FIND,
 +                                                                 NULL);
 +
 +                      if (nodePool)
 +                      {
 +                              /* Check if connections are in use */
 +                              if (nodePool->freeSize < nodePool->size)
 +                              {
 +                                      elog(WARNING, "Pool of Database %s is using Datanode %u connections",
 +                                                              databasePool->database, node);
 +                                      res = CLEAN_CONNECTION_NOT_COMPLETED;
 +                              }
 +
 +                              /* Destroy connections currently in Node Pool */
 +                              if (nodePool->slot)
 +                              {
 +                                      int i;
 +                                      for (i = 0; i < nodePool->freeSize; i++)
 +                                              destroy_slot(nodePool->slot[i]);
 +                              }
 +                              nodePool->size -= nodePool->freeSize;
 +                              nodePool->freeSize = 0;
 +                      }
 +              }
 +
 +              databasePool = databasePool->next;
 +      }
 +
 +      /* Release lock on Pooler, to allow transactions to connect again. */
 +      is_pool_locked = false;
 +      return res;
 +}
 +
 +/*
 + * Take a Lock on Pooler.
 + * Abort PIDs registered with the agents for the given database.
 + * Send back to client list of PIDs signaled to watch them.
 + */
 +int *
 +abort_pids(int *len, int pid, const char *database, const char *user_name)
 +{
 +      int *pids = NULL;
 +      int i = 0;
 +      int count;
 +
 +      Assert(!is_pool_locked);
 +      Assert(agentCount > 0);
 +
 +      is_pool_locked = true;
 +
 +      pids = (int *) palloc((agentCount - 1) * sizeof(int));
 +
 +      /* Send a SIGTERM signal to all processes of Pooler agents except this one */
 +      for (count = 0; count < agentCount; count++)
 +      {
 +              if (poolAgents[count]->pid == pid)
 +                      continue;
 +
 +              if (database && strcmp(poolAgents[count]->pool->database, database) != 0)
 +                      continue;
 +
 +              if (user_name && strcmp(poolAgents[count]->pool->user_name, user_name) != 0)
 +                      continue;
 +
 +              if (kill(poolAgents[count]->pid, SIGTERM) < 0)
 +                      elog(ERROR, "kill(%ld,%d) failed: %m",
 +                                              (long) poolAgents[count]->pid, SIGTERM);
 +
 +              pids[i++] = poolAgents[count]->pid;
 +      }
 +
 +      *len = i;
 +
 +      return pids;
 +}
 +
  +/*
  + * pooler_die
  + *            SIGTERM handler: request a graceful shutdown.  Only sets a
  + *            flag (async-signal-safe); PoolerLoop performs the cleanup.
  + */
  +static void
  +pooler_die(SIGNAL_ARGS)
  +{
  +      shutdown_requested = true;
  +}
 +
 +
  +/*
  + * pooler_quickdie
  + *            SIGQUIT handler: immediate exit without cleanup.  Signals are
  + *            blocked first so the handler cannot be re-entered.
  + */
  +static void
  +pooler_quickdie(SIGNAL_ARGS)
  +{
  +      PG_SETMASK(&BlockSig);
  +      exit(2);
  +}
 +
 +
  +/* SIGHUP handler: flag a pending configuration reload for PoolerLoop. */
  +static void
  +pooler_sighup(SIGNAL_ARGS)
  +{
  +      got_SIGHUP = true;
  +}
 +
 +/*
 + * Given node identifier, dbname and user name build connection string.
 + * Get node connection details from the shared memory node table
 + */
 +static char *
 +build_node_conn_str(Oid node, DatabasePool *dbPool)
 +{
 +      NodeDefinition *nodeDef;
 +      char               *connstr;
 +
 +      nodeDef = PgxcNodeGetDefinition(node);
 +      if (nodeDef == NULL)
 +      {
 +              /* No such definition, node is dropped? */
 +              return NULL;
 +      }
 +
 +      connstr = PGXCNodeConnStr(NameStr(nodeDef->nodehost),
 +                                                        nodeDef->nodeport,
 +                                                        dbPool->database,
 +                                                        dbPool->user_name,
 +                                                        dbPool->pgoptions,
 +                                                        IS_PGXC_COORDINATOR ? "coordinator" : "datanode",
 +                                                        PGXCNodeName);
 +      pfree(nodeDef);
 +
 +      return connstr;
 +}
 +
  +/*
  + * Check all pooled connections, and close those that were released more
  + * than PooledConnKeepAlive seconds ago.
  + * Return true if the shrink operation closed all the connections and the
  + * pool can be destroyed, false if there are still connections or the pool
  + * is in use.
  + */
 +static bool
 +shrink_pool(DatabasePool *pool)
 +{
 +      time_t                  now = time(NULL);
 +      HASH_SEQ_STATUS hseq_status;
 +      PGXCNodePool   *nodePool;
 +      int                     i;
 +      bool                    empty = true;
 +
 +      /* Negative PooledConnKeepAlive disables automatic connection cleanup */
 +      if (PoolConnKeepAlive < 0)
 +              return false;
 +
 +      pool->oldest_idle = (time_t) 0;
 +      hash_seq_init(&hseq_status, pool->nodePools);
 +      while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)))
 +      {
 +              /* Go thru the free slots and destroy those that are free too long */
 +              for (i = 0; i < nodePool->freeSize; )
 +              {
 +                      PGXCNodePoolSlot *slot = nodePool->slot[i];
 +
 +                      if (difftime(now, slot->released) > PoolConnKeepAlive)
 +                      {
 +                              /* connection is idle for long, close it */
 +                              destroy_slot(slot);
 +                              /* reduce pool size and total number of connections */
 +                              (nodePool->freeSize)--;
 +                              (nodePool->size)--;
 +                              /* move last connection in place, if not at last already */
 +                              if (i < nodePool->freeSize)
 +                                      nodePool->slot[i] = nodePool->slot[nodePool->freeSize];
 +                      }
 +                      else
 +                      {
 +                              if (pool->oldest_idle == (time_t) 0 ||
 +                                              difftime(pool->oldest_idle, slot->released) > 0)
 +                                      pool->oldest_idle = slot->released;
 +
 +                              i++;
 +                      }
 +              }
 +              if (nodePool->size > 0)
 +                      empty = false;
 +              else
 +              {
 +                      destroy_node_pool(nodePool);
 +                      hash_search(pool->nodePools, &nodePool->nodeoid, HASH_REMOVE, NULL);
 +              }
 +      }
 +
 +      /*
 +       * Last check, if any active agent is referencing the pool do not allow to
 +       * destroy it, because there will be a problem if session wakes up and try
 +       * to get a connection from non existing pool.
 +       * If all such sessions will eventually disconnect the pool will be
 +       * destroyed during next maintenance procedure.
 +       */
 +      if (empty)
 +      {
 +              for (i = 0; i < agentCount; i++)
 +              {
 +                      if (poolAgents[i]->pool == pool)
 +                              return false;
 +              }
 +      }
 +
 +      return empty;
 +}
 +
 +
 +/*
 + * pools_maintenance
 + *    Scan connection pools and release connections which have been idle for
 + *    long.  If a pool becomes empty after releasing connections it is
 + *    destroyed.
 + */
 +static void
 +pools_maintenance(void)
 +{
 +      DatabasePool   *prev = NULL;
 +      DatabasePool   *curr = databasePools;
 +      time_t                  now = time(NULL);
 +      int                             count = 0;
 +
 +      /* Iterate over the pools */
 +      while (curr)
 +      {
 +              /*
 +               * If the current pool has connections old enough to close, and it
 +               * becomes empty after the shrink, unlink the pool from the list and
 +               * free its memory.  Otherwise move to the next pool.
 +               */
 +              if (curr->oldest_idle != (time_t) 0 &&
 +                              difftime(now, curr->oldest_idle) > PoolConnKeepAlive &&
 +                              shrink_pool(curr))
 +              {
 +                      /* advance past the node before deleting its memory context */
 +                      MemoryContext mem = curr->mcxt;
 +                      curr = curr->next;
 +                      if (prev)
 +                              prev->next = curr;
 +                      else
 +                              databasePools = curr;
 +                      MemoryContextDelete(mem);
 +                      count++;
 +              }
 +              else
 +              {
 +                      prev = curr;
 +                      curr = curr->next;
 +              }
 +      }
 +      elog(DEBUG1, "Pool maintenance, done in %f seconds, removed %d pools",
 +                      difftime(time(NULL), now), count);
 +}
 +
 +/*
 + * check_persistent_connections
 + *    GUC check hook for persistent_datanode_connections.  The setting is not
 + *    supported on datanodes, so on a datanode the value is forced back to
 + *    "off" with a WARNING rather than rejecting the assignment (always
 + *    returns true).
 + */
 +bool
 +check_persistent_connections(bool *newval, void **extra, GucSource source)
 +{
 +      if (*newval && IS_PGXC_DATANODE)
 +      {
 +              elog(WARNING, "persistent_datanode_connections = ON is currently not "
 +                              "supported on datanodes - ignoring");
 +              *newval = false;
 +      }
 +      return true;
 +}
index 4fbae5b31b5b3c5b0c3b735910991ec427c7a4a4,0000000000000000000000000000000000000000..a9741f33f6e06fb2d3ba8fbf5eb0749b4cc88231
mode 100644,000000..100644
--- /dev/null
@@@ -1,1791 -1,0 +1,1790 @@@
- static LWLockTranche SharedQueueLocksTranche;
 +/*-------------------------------------------------------------------------
 + *
 + * squeue.c
 + *
 + *      Shared queue is for data exchange in shared memory between sessions,
 + * one of which is a producer, providing data rows. Others are consumer agents -
 + * sessions initiated from other datanodes, the main purpose of them is to read
 + * rows from the shared queue and send them to the parent data node.
 + *    The producer is usually a consumer at the same time, it sends back tuples
 + * to the parent node without putting it to the queue.
 + *
 + * Copyright (c) 2012-2014, TransLattice, Inc.
 + *
 + * IDENTIFICATION
 + *      $$
 + *
 + *
 + *-------------------------------------------------------------------------
 + */
 +
 +#include <sys/time.h>
 +#include "postgres.h"
 +
 +#include "miscadmin.h"
 +#include "access/gtm.h"
 +#include "catalog/pgxc_node.h"
 +#include "commands/prepare.h"
 +#include "executor/executor.h"
 +#include "nodes/pg_list.h"
 +#include "pgxc/nodemgr.h"
 +#include "pgxc/pgxc.h"
 +#include "pgxc/pgxcnode.h"
 +#include "pgxc/squeue.h"
 +#include "storage/latch.h"
 +#include "storage/lwlock.h"
 +#include "storage/shmem.h"
 +#include "utils/hsearch.h"
 +#include "utils/resowner.h"
++#include "pgstat.h"
 +
 +
 +int NSQueues = 64;
 +int SQueueSize = 64;
 +
 +#define LONG_TUPLE -42
 +
 +/* Per-consumer synchronization objects for one shared queue */
 +typedef struct ConsumerSync
 +{
 +      LWLock     *cs_lwlock;          /* Synchronize access to the consumer queue */
 +      Latch           cs_latch;       /* The latch consumer is waiting on */
 +} ConsumerSync;
 +
 +
 +/*
 + * Shared memory structure to store synchronization info to access shared queues
 + */
 +typedef struct SQueueSync
 +{
 +      void       *queue;                      /* NULL if not assigned to any queue */
 +      Latch           sqs_producer_latch; /* the latch producer is waiting on */
 +      ConsumerSync sqs_consumer_sync[0]; /* actual length is MaxDataNodes-1,
 +                                                                              * which is not known at compile time */
 +} SQueueSync;
 +
 +/* Both producer and consumer are working */
 +#define CONSUMER_ACTIVE 0
 +/* Producer has finished work successfully and waits for consumer */
 +#define CONSUMER_EOF 1
 +/* Producer encountered an error and waits for consumer to disconnect */
 +#define CONSUMER_ERROR 2
 +/* Consumer is finished with the query, OK to unbind */
 +#define CONSUMER_DONE 3
 +
 +
 +/* State of a single consumer */
 +typedef struct
 +{
 +      int                     cs_pid;                 /* Process id of the consumer session */
 +      int                     cs_node;                /* Node id of the consumer parent */
 +      /*
 +       * Queue state.  The queue is a cyclic buffer storing tuples in the
 +       * DataRow format.  Each entry starts with the tuple length in host
 +       * byte order (the length is never sent over the network), followed by
 +       * the tuple bytes.
 +       */
 +      int                     cs_ntuples;     /* Number of tuples in the queue */
 +      int                     cs_status;              /* See CONSUMER_* defines above */
 +      char       *cs_qstart;          /* Where consumer queue begins */
 +      int                     cs_qlength;             /* The size of the consumer queue */
 +      int                     cs_qreadpos;    /* The read position in the consumer queue */
 +      int                     cs_qwritepos;   /* The write position in the consumer queue */
 +#ifdef SQUEUE_STAT
 +      long            stat_writes;
 +      long            stat_reads;
 +      long            stat_buff_writes;
 +      long            stat_buff_reads;
 +      long            stat_buff_returns;
 +#endif
 +} ConsState;
 +
 +/* Shared queue header */
 +typedef struct SQueueHeader
 +{
 +      char            sq_key[SQUEUE_KEYSIZE]; /* Hash entry key should be at the
 +                                                               * beginning of the hash entry */
 +      int                     sq_pid;                 /* Process id of the producer session */
 +      int                     sq_nodeid;              /* Node id of the producer parent */
 +      SQueueSync *sq_sync;        /* Associated synchronization objects */
 +      int                     sq_refcnt;              /* Reference count to this entry */
 +#ifdef SQUEUE_STAT
 +      bool            stat_finish;
 +      long            stat_paused;
 +#endif
 +      int                     sq_nconsumers;  /* Number of consumers */
 +      ConsState       sq_consumers[0];/* variable length array */
 +} SQueueHeader;
 +
 +
 +/*
 + * Hash table where all shared queues are stored. Key is the queue name, value
 + * is SharedQueue
 + */
 +static HTAB *SharedQueues = NULL;
 +static LWLockPadded *SQueueLocks = NULL;
-               SharedQueueLocksTranche.name = "Shared Queue Locks";
-               SharedQueueLocksTranche.array_base = SQueueLocks;
-               SharedQueueLocksTranche.array_stride = sizeof(LWLockPadded);
 +
 +/*
 + * Pool of synchronization items
 + */
 +static void *SQueueSyncs;
 +
 +/* Size of one SQueueSync entry, including its per-consumer sync array */
 +#define SQUEUE_SYNC_SIZE \
 +      (sizeof(SQueueSync) + (MaxDataNodes-1) * sizeof(ConsumerSync))
 +
 +/* Address of the idx'th SQueueSync within the SQueueSyncs pool */
 +#define GET_SQUEUE_SYNC(idx) \
 +      ((SQueueSync *) (((char *) SQueueSyncs) + (idx) * SQUEUE_SYNC_SIZE))
 +
 +/* Size of a shared queue header for the given number of consumers */
 +#define SQUEUE_HDR_SIZE(nconsumers) \
 +      (sizeof(SQueueHeader) + (nconsumers) * sizeof(ConsState))
 +
 +/*
 + * Free space remaining in a consumer's cyclic queue.  With tuples present,
 + * it is the gap between the write and read positions (accounting for
 + * wrap-around); an empty queue has the whole buffer free.
 + */
 +#define QUEUE_FREE_SPACE(cstate) \
 +      ((cstate)->cs_ntuples > 0 ? \
 +              ((cstate)->cs_qreadpos >= (cstate)->cs_qwritepos ? \
 +                      (cstate)->cs_qreadpos - (cstate)->cs_qwritepos : \
 +                      (cstate)->cs_qlength + (cstate)->cs_qreadpos \
 +                                                               - (cstate)->cs_qwritepos) \
 +              : (cstate)->cs_qlength)
 +
 +/*
 + * Append len bytes from buf to the consumer's cyclic queue, splitting the
 + * copy and wrapping the write position when it passes the buffer end.
 + */
 +#define QUEUE_WRITE(cstate, len, buf) \
 +      do \
 +      { \
 +              if ((cstate)->cs_qwritepos + (len) <= (cstate)->cs_qlength) \
 +              { \
 +                      memcpy((cstate)->cs_qstart + (cstate)->cs_qwritepos, buf, len); \
 +                      (cstate)->cs_qwritepos += (len); \
 +                      if ((cstate)->cs_qwritepos == (cstate)->cs_qlength) \
 +                              (cstate)->cs_qwritepos = 0; \
 +              } \
 +              else \
 +              { \
 +                      int part = (cstate)->cs_qlength - (cstate)->cs_qwritepos; \
 +                      memcpy((cstate)->cs_qstart + (cstate)->cs_qwritepos, buf, part); \
 +                      (cstate)->cs_qwritepos = (len) - part; \
 +                      memcpy((cstate)->cs_qstart, (buf) + part, (cstate)->cs_qwritepos); \
 +              } \
 +      } while(0)
 +
 +
 +/*
 + * Read len bytes from the consumer's cyclic queue into buf, splitting the
 + * copy and wrapping the read position when it passes the buffer end.
 + */
 +#define QUEUE_READ(cstate, len, buf) \
 +      do \
 +      { \
 +              if ((cstate)->cs_qreadpos + (len) <= (cstate)->cs_qlength) \
 +              { \
 +                      memcpy(buf, (cstate)->cs_qstart + (cstate)->cs_qreadpos, len); \
 +                      (cstate)->cs_qreadpos += (len); \
 +                      if ((cstate)->cs_qreadpos == (cstate)->cs_qlength) \
 +                              (cstate)->cs_qreadpos = 0; \
 +              } \
 +              else \
 +              { \
 +                      int part = (cstate)->cs_qlength - (cstate)->cs_qreadpos; \
 +                      memcpy(buf, (cstate)->cs_qstart + (cstate)->cs_qreadpos, part); \
 +                      (cstate)->cs_qreadpos = (len) - part; \
 +                      memcpy((buf) + part, (cstate)->cs_qstart, (cstate)->cs_qreadpos); \
 +              } \
 +      } while(0)
 +
 +
 +static bool sq_push_long_tuple(ConsState *cstate, RemoteDataRow datarow);
 +static void sq_pull_long_tuple(ConsState *cstate, RemoteDataRow datarow,
 +                                                         ConsumerSync *sync);
 +
 +/*
 + * SharedQueuesInit
 + *    Initialize the reference on the shared memory hash table where all shared
 + * queues are stored. Invoked during postmaster initialization.
 + */
 +void
 +SharedQueuesInit(void)
 +{
 +      HASHCTL info;
 +      int             hash_flags;
 +      bool    found;
 +
 +      info.keysize = SQUEUE_KEYSIZE;
 +      info.entrysize = SQUEUE_SIZE;
 +
 +      /*
 +       * Create hash table of fixed size to avoid running out of
 +       * SQueueSyncs
 +       */
 +      hash_flags = HASH_ELEM | HASH_FIXED_SIZE;
 +
 +      SharedQueues = ShmemInitHash("Shared Queues", NUM_SQUEUES,
 +                                                               NUM_SQUEUES, &info, hash_flags);
 +
 +      /*
 +       * Synchronization stuff is in separate structure because we need to
 +       * initialize all items now while in the postmaster.
 +       * The structure is actually an array, each array entry is assigned to
 +       * each instance of SharedQueue in use.
 +       */
 +      SQueueSyncs = ShmemInitStruct("Shared Queues Sync",
 +                                                                SQUEUE_SYNC_SIZE * NUM_SQUEUES,
 +                                                                &found);
 +      if (!found)
 +      {
 +              int     i, l;
 +              int     nlocks = (NUM_SQUEUES * (MaxDataNodes-1));
 +              bool    foundLocks;
 +
 +              /* Initialize LWLocks for queues */
 +              SQueueLocks = (LWLockPadded *) ShmemInitStruct("Shared Queue Locks",
 +                                              sizeof(LWLockPadded) * nlocks, &foundLocks);
 +
 +              /* either both syncs and locks, or none of them */
 +              Assert(! foundLocks);
 +
-               LWLockRegisterTranche(LWTRANCHE_SHARED_QUEUES, &SharedQueueLocksTranche);
 +              /* Register the tranche in the main tranches array */
-                       WaitLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1);
++              LWLockRegisterTranche(LWTRANCHE_SHARED_QUEUES, "Shared Queue Locks");
 +
 +              l = 0;
 +              for (i = 0; i < NUM_SQUEUES; i++)
 +              {
 +                      SQueueSync *sqs = GET_SQUEUE_SYNC(i);
 +                      int                     j;
 +
 +                      sqs->queue = NULL;
 +                      InitSharedLatch(&sqs->sqs_producer_latch);
 +                      for (j = 0; j < MaxDataNodes-1; j++)
 +                      {
 +                              InitSharedLatch(&sqs->sqs_consumer_sync[j].cs_latch);
 +
 +                              LWLockInitialize(&(SQueueLocks[l]).lock,
 +                                                               LWTRANCHE_SHARED_QUEUES);
 +
 +                              sqs->sqs_consumer_sync[j].cs_lwlock = &(SQueueLocks[l++]).lock;
 +                      }
 +              }
 +      }
 +}
 +
 +
 +/*
 + * SharedQueueShmemSize
 + *    Estimate the shared memory needed for shared queues: the array of
 + *    SQueueSync entries plus the hash table of queue entries.
 + */
 +Size
 +SharedQueueShmemSize(void)
 +{
 +      Size sqs_size;
 +
 +      sqs_size = mul_size(NUM_SQUEUES, SQUEUE_SYNC_SIZE);
 +      return add_size(sqs_size, hash_estimate_size(NUM_SQUEUES, SQUEUE_SIZE));
 +}
 +
 +/*
 + * SharedQueueAcquire
 + *     Reserve a named shared queue for future data exchange between processes
 + * supplying tuples to remote Datanodes. Invoked when a remote query plan is
 + * registered on the Datanode. The number of consumers is known at this point,
 + * so shared queue may be formatted during reservation. The first process that
 + * is acquiring the shared queue on the Datanode does the formatting.
 + */
 +void
 +SharedQueueAcquire(const char *sqname, int ncons)
 +{
 +      bool            found;
 +      SharedQueue sq;
 +      int trycount = 0;
 +
 +      Assert(IsConnFromDatanode());
 +      Assert(ncons > 0);
 +
 +tryagain:
 +      LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
 +
 +      sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_ENTER, &found);
 +      if (!sq)
 +              ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
 +                              errmsg("out of shared queue, please increase shared_queues")));
 +
 +      /* First process acquiring queue should format it */
 +      if (!found)
 +      {
 +              int             qsize;   /* Size of one queue */
 +              int             i;
 +              char   *heapPtr;
 +
 +              elog(DEBUG1, "Create a new SQueue %s and format it for %d consumers", sqname, ncons);
 +
 +              /* Initialize the shared queue */
 +              sq->sq_pid = 0;
 +              sq->sq_nodeid = -1;
 +              sq->sq_refcnt = 1;
 +#ifdef SQUEUE_STAT
 +              sq->stat_finish = false;
 +              sq->stat_paused = 0;
 +#endif
 +              /*
 +               * Assign sync object (latches to wait on)
 +               * XXX We may want to optimize this and do smart search instead of
 +               * iterating the array.
 +               */
 +              for (i = 0; i < NUM_SQUEUES; i++)
 +              {
 +                      SQueueSync *sqs = GET_SQUEUE_SYNC(i);
 +                      if (sqs->queue == NULL)
 +                      {
 +                              sqs->queue = (void *) sq;
 +                              sq->sq_sync = sqs;
 +                              break;
 +                      }
 +              }
 +
 +              Assert(sq->sq_sync != NULL);
 +
 +              sq->sq_nconsumers = ncons;
 +              /* Determine queue size for a single consumer */
 +              qsize = (SQUEUE_SIZE - SQUEUE_HDR_SIZE(sq->sq_nconsumers)) / sq->sq_nconsumers;
 +
 +              heapPtr = (char *) sq;
 +              /* Skip header */
 +              heapPtr += SQUEUE_HDR_SIZE(sq->sq_nconsumers);
 +              /* Set up consumer queues */
 +              for (i = 0; i < ncons; i++)
 +              {
 +                      ConsState *cstate = &(sq->sq_consumers[i]);
 +
 +                      cstate->cs_pid = 0;
 +                      cstate->cs_node = -1;
 +                      cstate->cs_ntuples = 0;
 +                      cstate->cs_status = CONSUMER_ACTIVE;
 +                      cstate->cs_qstart = heapPtr;
 +                      cstate->cs_qlength = qsize;
 +                      cstate->cs_qreadpos = 0;
 +                      cstate->cs_qwritepos = 0;
 +                      heapPtr += qsize;
 +              }
 +              Assert(heapPtr <= ((char *) sq) + SQUEUE_SIZE);
 +      }
 +      else
 +      {
 +              int i;
 +
 +              elog(DEBUG1, "Found an existing SQueue %s - (sq_pid:%d, sq_nodeid:%d,"
 +                      " sq_nconsumers:%d",
 +                      sqname, sq->sq_pid, sq->sq_nodeid, sq->sq_nconsumers);
 +
 +              for (i = 0; i < sq->sq_nconsumers; i++)
 +              {
 +                      elog(DEBUG1, "SQueue %s, consumer (%d) information (cs_pid:%d,"
 +                                      " cs_node:%d, cs_ntuples:%d, cs_status: %d",
 +                                      sqname, i,
 +                                      sq->sq_consumers[i].cs_pid, 
 +                                      sq->sq_consumers[i].cs_node, 
 +                                      sq->sq_consumers[i].cs_ntuples, 
 +                                      sq->sq_consumers[i].cs_status); 
 +              }
 +
 +              /*
 +               * A race condition is possible here. The previous operation might use
 +               * the same Shared Queue name if that was a different execution of the
 +               * same Portal. So here we should try to determine if that Shared Queue
 +               * belongs to this execution or is a not-yet-released Shared Queue of a
 +               * previous operation.
 +               * Though at the moment I am not sure, but I believe the BIND stage is
 +               * only happening after completion of ACQUIRE stage, so it is enough
 +               * to verify the producer (the very first node that binds) is not bound
 +               * yet. If it is bound, sleep for a moment and try again. No reason to
 +               * sleep longer, the producer needs just a quantum of CPU time to UNBIND
 +               * itself.
 +               */
 +              if (sq->sq_pid != 0)
 +              {
 +                      int                     i;
 +                      bool            old_squeue = true;
 +
 +                      PGXC_PARENT_NODE_ID = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE,
 +                                      &PGXC_PARENT_NODE_TYPE);
 +                      for (i = 0; i < sq->sq_nconsumers; i++)
 +                      {
 +                              ConsState *cstate = &(sq->sq_consumers[i]);
 +                              if (cstate->cs_node == PGXC_PARENT_NODE_ID)
 +                              {
 +                                      SQueueSync *sqsync = sq->sq_sync;
 +
 +                                      LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
 +                                                                LW_EXCLUSIVE);
 +                                      /* verify status */
 +                                      if (cstate->cs_status != CONSUMER_DONE)
 +                                              old_squeue = false;
 +
 +                                      LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
 +                                      break;
 +                              }
 +                      }
 +                      if (old_squeue)
 +                      {
 +                              LWLockRelease(SQueuesLock);
 +                              pg_usleep(1000000L);
 +                              elog(DEBUG1, "SQueue race condition, give the old producer to "
 +                                              "finish the work and retry again");
 +                              trycount++;
 +                              if (trycount >= 10)
 +                                      elog(ERROR, "Couldn't resolve SQueue race condition after"
 +                                                      " %d tries", trycount);
 +                              goto tryagain;
 +                      }
 +              }
 +              /* Existing queue belongs to this execution; bump its refcount */
 +              sq->sq_refcnt++;
 +      }
 +      LWLockRelease(SQueuesLock);
 +}
 +
 +
 +/*
 + * SharedQueueBind
 + *    Bind to the shared queue specified by sqname either as a consumer or as a
 + * producer. The first process that binds to the shared queue becomes a producer
 + * and receives the consumer map, others become consumers and receive queue
 + * indexes to read tuples from.
 + * The consNodes int list identifies the nodes involved in the current step.
 + * The distNodes int list describes result distribution of the current step.
 + * The consNodes should be a subset of distNodes.
 + * The myindex and consMap parameters are binding results. If caller process
 + * is bound to the query as a producer myindex is set to -1 and index of the
 + * each consumer (order number in the consNodes) is stored to the consMap array
 + * at the position of the node in the distNodes. For the producer node
 + * SQ_CONS_SELF is stored, nodes from distNodes list which are not members of
 + * consNodes or if it was reported they won't read results, they are represented
 + * as SQ_CONS_NONE.
 + */
 +SharedQueue
 +SharedQueueBind(const char *sqname, List *consNodes,
 +                                                                 List *distNodes, int *myindex, int *consMap)
 +{
 +      bool            found;
 +      SharedQueue sq;
 +
 +      LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
 +
 +      PGXC_PARENT_NODE_ID = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE,
 +                      &PGXC_PARENT_NODE_TYPE);
 +      sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_FIND, &found);
 +      if (!found)
 +              elog(PANIC, "Shared queue %s not found", sqname);
 +      if (sq->sq_pid == 0)
 +      {
 +              /* Producer */
 +              int             i;
 +              ListCell *lc;
 +
 +              Assert(consMap);
 +
 +              elog(DEBUG1, "Bind node %s to squeue of step %s as a producer",
 +                       PGXC_PARENT_NODE, sqname);
 +
 +              /* Initialize the shared queue */
 +              sq->sq_pid = MyProcPid;
 +              sq->sq_nodeid = PGXC_PARENT_NODE_ID;
 +              OwnLatch(&sq->sq_sync->sqs_producer_latch);
 +
 +              i = 0;
 +              foreach(lc, distNodes)
 +              {
 +                      int                     nodeid = lfirst_int(lc);
 +
 +                      /*
 +                       * Producer won't go to shared queue to hand off tuple to itself,
 +                       * so we do not need to create queue for that entry.
 +                       */
 +                      if (nodeid == PGXC_PARENT_NODE_ID)
 +                      {
 +                              /* Producer must be in the consNodes list */
 +                              Assert(list_member_int(consNodes, nodeid));
 +                              elog(DEBUG1, "SQueue %s consumer @%d is set to self",
 +                                              sqname, i);
 +                              consMap[i++] = SQ_CONS_SELF;
 +                      }
 +                      /*
 +                       * This node may connect as a consumer, store consumer id to the map
 +                       * and initialize consumer queue
 +                       */
 +                      else if (list_member_int(consNodes, nodeid))
 +                      {
 +                              ConsState  *cstate;
 +                              int             j;
 +
 +                              for (j = 0; j < sq->sq_nconsumers; j++)
 +                              {
 +                                      cstate = &(sq->sq_consumers[j]);
 +                                      if (cstate->cs_node == nodeid)
 +                                      {
 +                                              /* The process already reported that queue won't read */
 +                                              elog(DEBUG1, "Node %d of SQueue %s is released already "
 +                                                              "at consumer %d, cs_status %d",
 +                                                       nodeid, sqname, j, cstate->cs_status);
 +                                              consMap[i++] = SQ_CONS_NONE;
 +                                              break;
 +                                      }
 +                                      else if (cstate->cs_node == -1)
 +                                      {
 +                                              /* found unused slot, assign the consumer to it */
 +                                              elog(DEBUG1, "Node %d of SQueue %s is bound at consumer "
 +                                                              "%d, cs_status %d",
 +                                                              nodeid, sqname, j, cstate->cs_status);
 +                                              consMap[i++] = j;
 +                                              cstate->cs_node = nodeid;
 +                                              break;
 +                                      }
 +                              }
 +                      }
 +                      /*
 +                       * Consumer from this node won't ever connect as upper level step
 +                       * is not executed on the node. Discard resuls that may go to that
 +                       * node, if any.
 +                       */
 +                      else
 +                      {
 +                              elog(DEBUG1, "Node %d of SQueue %s is not in the "
 +                                              "redistribution list and hence would never connect",
 +                                              nodeid, sqname);
 +                              consMap[i++] = SQ_CONS_NONE;
 +                      }
 +              }
 +
 +              if (myindex)
 +                      *myindex = -1;
 +
 +              /*
 +               * Increment the refcnt only when producer binds. This is a bit
 +               * asymmetrical, but the way things are currently setup, a consumer
 +               * though calls SharedQueueBind, never calls SharedQueueUnBind. The
 +               * unbinding is done only by the producer after it waits for all
 +               * consumers to finish.
 +               *
 +               * XXX This ought to be fixed someday to simplify things in Shared
 +               * Queue handling
 +               */ 
 +              sq->sq_refcnt++;
 +      }
 +      else
 +      {
 +              int     nconsumers;
 +              ListCell *lc;
 +
 +              /* Producer should be different process */
 +              Assert(sq->sq_pid != MyProcPid);
 +
 +              elog(DEBUG1, "SQueue %s has a bound producer from node %d, pid %d",
 +                              sqname, sq->sq_nodeid, sq->sq_pid);
 +              elog(DEBUG1, "Bind node %s to SQueue %s as a consumer %d", PGXC_PARENT_NODE, sqname, sq->sq_pid);
 +
 +              /* Sanity checks */
 +              Assert(myindex);
 +              *myindex = -1;
 +              /* Ensure the passed in consumer list matches the queue */
 +              nconsumers = 0;
 +              foreach (lc, consNodes)
 +              {
 +                      int             nodeid = lfirst_int(lc);
 +                      int                     i;
 +
 +                      if (nodeid == sq->sq_nodeid)
 +                      {
 +                              /*
 +                               * This node is a producer it should be in the consumer list,
 +                               * but no consumer queue for it
 +                               */
 +                              continue;
 +                      }
 +
 +                      /* find consumer queue for the node */
 +                      for (i = 0; i < sq->sq_nconsumers; i++)
 +                      {
 +                              ConsState *cstate = &(sq->sq_consumers[i]);
 +                              if (cstate->cs_node == nodeid)
 +                              {
 +                                      nconsumers++;
 +                                      if (nodeid == PGXC_PARENT_NODE_ID)
 +                                      {
 +                                              /*
 +                                               * Current consumer queue is that from which current
 +                                               * session will be sending out data rows.
 +                                               * Initialize the queue to let producer know we are
 +                                               * here and runnng.
 +                                               */
 +                                              SQueueSync *sqsync = sq->sq_sync;
 +
 +                                              elog(DEBUG1, "SQueue %s, consumer node %d is same as "
 +                                                              "the parent node", sqname, nodeid);
 +                                              LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
 +                                                                        LW_EXCLUSIVE);
 +                                              /* Make sure no consumer bound to the queue already */
 +                                              Assert(cstate->cs_pid == 0);
 +                                              /* make sure the queue is ready to read */
 +                                              Assert(cstate->cs_qlength > 0);
 +                                              /* verify status */
 +                                              if (cstate->cs_status == CONSUMER_ERROR ||
 +                                                              cstate->cs_status == CONSUMER_DONE)
 +                                              {
 +                                                      int status = cstate->cs_status;
 +                                                      /*
 +                                                       * Producer failed by the time the consumer connect.
 +                                                       * Change status to "Done" to allow producer unbind
 +                                                       * and report problem to the parent.
 +                                                       */
 +                                                      cstate->cs_status = CONSUMER_DONE;
 +                                                      /* Producer may be waiting for status change */
 +                                                      SetLatch(&sqsync->sqs_producer_latch);
 +                                                      LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
 +                                                      LWLockRelease(SQueuesLock);
 +                                                      ereport(ERROR,
 +                                                                      (errcode(ERRCODE_PRODUCER_ERROR),
 +                                                                       errmsg("Producer failed while we were waiting - status was %d", status)));
 +                                              }
 +                                              /*
 +                                               * Any other status is acceptable. Normally it would be
 +                                               * ACTIVE. If producer have had only few rows to emit
 +                                               * and it is already done the status would be EOF.
 +                                               */
 +
 +                                              /* Set up the consumer */
 +                                              cstate->cs_pid = MyProcPid;
 +
 +                                              elog(DEBUG1, "SQueue %s, consumer at %d, status %d - "
 +                                                              "setting up consumer node %d, pid %d",
 +                                                              sqname, i, cstate->cs_status, cstate->cs_node,
 +                                                              cstate->cs_pid);
 +                                              /* return found index */
 +                                              *myindex = i;
 +                                              OwnLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
 +                                              LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
 +                                      }
 +                                      else
 +                                              elog(DEBUG1, "SQueue %s, consumer node %d is not same as "
 +                                                              "the parent node %d", sqname, nodeid,
 +                                                              PGXC_PARENT_NODE_ID);
 +                                      break;
 +                              }
 +                      }
 +                      /* Check if entry was found and therefore loop was broken */
 +                      Assert(i < sq->sq_nconsumers);
 +              }
 +              /* Check the consumer is found */
 +              Assert(*myindex != -1);
 +              Assert(sq->sq_nconsumers == nconsumers);
 +      }
 +      LWLockRelease(SQueuesLock);
 +      return sq;
 +}
 +
 +
 +/*
 + * SharedQueueDump
 + *    Push data from the local tuplestore to the queue for specified consumer.
 + * Return true if succeeded and the tuplestore is now empty. Return false
 + * if specified queue has not enough room for the next tuple.
 + *
 + * squeue      - shared queue owned by the producer
 + * consumerIdx - index of the target consumer within squeue->sq_consumers
 + * tmpslot     - caller-supplied scratch slot used to carry each tuple from
 + *               the tuplestore to the queue
 + * tuplestore  - local spill store holding tuples that did not fit earlier
 + *
 + * NOTE(review): callers appear to hold the consumer's cs_lwlock when calling
 + * this function (see SharedQueueWrite) - confirm before adding call sites.
 + */
 +static bool
 +SharedQueueDump(SharedQueue squeue, int consumerIdx,
 +                                                 TupleTableSlot *tmpslot, Tuplestorestate *tuplestore)
 +{
 +      ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);
 +
 +      elog(DEBUG3, "Dumping SQueue %s data for consumer at %d, "
 +                      "producer - node %d, pid %d, "
 +                      "consumer - node %d, pid %d, status %d",
 +                      squeue->sq_key, consumerIdx,
 +                      squeue->sq_nodeid, squeue->sq_pid,
 +                      cstate->cs_node, cstate->cs_pid, cstate->cs_status);
 +
 +      /*
 +       * Discard stored data if consumer is not active; reporting the store
 +       * as empty ("true") lets the caller release it.
 +       */
 +      if (cstate->cs_status != CONSUMER_ACTIVE)
 +      {
 +              elog(DEBUG3, "Discarding SQueue %s data for consumer at %d not active",
 +                              squeue->sq_key, consumerIdx);
 +              tuplestore_clear(tuplestore);
 +              return true;
 +      }
 +
 +      /*
 +       * Tuplestore does not clear the eof flag on the active read pointer,
 +       * causing the store to always be in EOF state once reached when there
 +       * is a single read pointer. We do not want behavior like this and work
 +       * around it by using a secondary read pointer. Primary read pointer (0)
 +       * is active when we are writing to the tuple store; it is also used to
 +       * bookmark the current position when reading, to be able to roll back
 +       * and return the just-read tuple back to the store if we failed to
 +       * write it out to the queue.
 +       * Secondary read pointer (1) is for reading, and its eof flag is
 +       * cleared if a tuple is written to the store.
 +       */
 +      tuplestore_select_read_pointer(tuplestore, 1);
 +
 +      /* If we have something in the tuplestore try to push this to the queue */
 +      while (!tuplestore_ateof(tuplestore))
 +      {
 +              /* save position (copy reader pointer 1 into bookmark pointer 0) */
 +              tuplestore_copy_read_pointer(tuplestore, 1, 0);
 +
 +              /* Try to get next tuple to the temporary slot */
 +              if (!tuplestore_gettupleslot(tuplestore, true, false, tmpslot))
 +              {
 +                      /* false means the tuplestore in EOF state */
 +                      elog(DEBUG3, "Tuplestore for SQueue %s returned EOF",
 +                                      squeue->sq_key);
 +                      break;
 +              }
 +#ifdef SQUEUE_STAT
 +              cstate->stat_buff_reads++;
 +#endif
 +
 +              /* The slot should contain a data row */
 +              Assert(tmpslot->tts_datarow);
 +
 +              /* check if queue has enough room for the data (length word + payload) */
 +              if (QUEUE_FREE_SPACE(cstate) < sizeof(int) + tmpslot->tts_datarow->msglen)
 +              {
 +                      /*
 +                       * If stored tuple does not fit empty queue we are entering special
 +                       * procedure of pushing it through in chunks.
 +                       */
 +                      if (cstate->cs_ntuples <= 0)
 +                      {
 +                              /*
 +                               * If pushing through is completed wake up and proceed to next
 +                               * tuple, there could be enough space in the consumer queue to
 +                               * fit more.
 +                               */
 +                              bool done = sq_push_long_tuple(cstate, tmpslot->tts_datarow);
 +
 +                              /*
 +                               * sq_push_long_tuple writes some data anyway, so wake up
 +                               * the consumer.
 +                               */
 +                              SetLatch(&squeue->sq_sync->sqs_consumer_sync[consumerIdx].cs_latch);
 +
 +                              if (done)
 +                                      continue;
 +                      }
 +
 +                      /* Restore read position to get same tuple next time */
 +                      tuplestore_copy_read_pointer(tuplestore, 0, 1);
 +#ifdef SQUEUE_STAT
 +                      cstate->stat_buff_returns++;
 +#endif
 +
 +                      /* We might advance the mark, try to truncate */
 +                      tuplestore_trim(tuplestore);
 +
 +                      /* Prepare for writing, set proper read pointer */
 +                      tuplestore_select_read_pointer(tuplestore, 0);
 +
 +                      /* ... and exit reporting the store still has data */
 +                      return false;
 +              }
 +              else
 +              {
 +                      /* Enqueue data: length word first, then the message bytes */
 +                      QUEUE_WRITE(cstate, sizeof(int), (char *) &tmpslot->tts_datarow->msglen);
 +                      QUEUE_WRITE(cstate, tmpslot->tts_datarow->msglen, tmpslot->tts_datarow->msg);
 +
 +                      /* Increment tuple counter. If it was 0 consumer may be waiting for
 +                       * data so try to wake it up */
 +                      if ((cstate->cs_ntuples)++ == 0)
 +                              SetLatch(&squeue->sq_sync->sqs_consumer_sync[consumerIdx].cs_latch);
 +              }
 +      }
 +
 +      /* Remove rows we have just read */
 +      tuplestore_trim(tuplestore);
 +
 +      /* prepare for writes, set read pointer 0 as active */
 +      tuplestore_select_read_pointer(tuplestore, 0);
 +
 +      return true;
 +}
 +
 +
 +/*
 + * SharedQueueWrite
 + *    Write data from the specified slot to the specified queue. If the
 + * tuplestore passed in has tuples try and write them first.
 + * If specified queue is full the tuple is put into the tuplestore which is
 + * created if necessary
 + *
 + * squeue      - shared queue owned by the producer
 + * consumerIdx - index of the target consumer within squeue->sq_consumers
 + * slot        - slot holding the tuple to send
 + * tuplestore  - in/out: local spill store, created here on first overflow
 + * tmpcxt      - memory context used for short-lived data row copies
 + */
 +void
 +SharedQueueWrite(SharedQueue squeue, int consumerIdx,
 +                                                      TupleTableSlot *slot, Tuplestorestate **tuplestore,
 +                                                      MemoryContext tmpcxt)
 +{
 +      ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);
 +      SQueueSync *sqsync = squeue->sq_sync;
 +      LWLockId    clwlock = sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock;
 +      RemoteDataRow datarow;
 +      bool            free_datarow;
 +
 +      Assert(cstate->cs_qlength > 0);
 +
 +      LWLockAcquire(clwlock, LW_EXCLUSIVE);
 +
 +#ifdef SQUEUE_STAT
 +      cstate->stat_writes++;
 +#endif
 +
 +      /*
 +       * If we have anything in the local storage try to dump this first,
 +       * but do not try to dump often to avoid overhead of creating temporary
 +       * tuple slot. It should be OK to dump if queue is half empty.
 +       */
 +      if (*tuplestore)
 +      {
 +              bool dumped = false;
 +
 +              if (QUEUE_FREE_SPACE(cstate) > cstate->cs_qlength / 2)
 +              {
 +                      TupleTableSlot *tmpslot;
 +
 +                      tmpslot = MakeSingleTupleTableSlot(slot->tts_tupleDescriptor);
 +                      dumped = SharedQueueDump(squeue, consumerIdx, tmpslot, *tuplestore);
 +                      ExecDropSingleTupleTableSlot(tmpslot);
 +              }
 +              if (!dumped)
 +              {
 +                      /* No room to even dump local store, append the tuple to the store
 +                       * and exit; tuples must stay FIFO so we cannot bypass the store */
 +#ifdef SQUEUE_STAT
 +                      cstate->stat_buff_writes++;
 +#endif
 +                      LWLockRelease(clwlock);
 +                      tuplestore_puttupleslot(*tuplestore, slot);
 +                      return;
 +              }
 +      }
 +
 +      /* Get datarow from the tuple slot */
 +      if (slot->tts_datarow)
 +      {
 +              /*
 +               * The function ExecCopySlotDatarow always makes a copy, but here we
 +               * can optimize and avoid copying the data, so we just get the reference
 +               */
 +              datarow = slot->tts_datarow;
 +              free_datarow = false;
 +      }
 +      else
 +      {
 +              datarow = ExecCopySlotDatarow(slot, tmpcxt);
 +              free_datarow = true;
 +      }
 +      if (QUEUE_FREE_SPACE(cstate) < sizeof(int) + datarow->msglen)
 +      {
 +              /* Not enough room, store tuple locally */
 +              LWLockRelease(clwlock);
 +
 +              /* clean up */
 +              if (free_datarow)
 +                      pfree(datarow);
 +
 +              /* Create tuplestore if does not exist */
 +              if (*tuplestore == NULL)
 +              {
 +                      int                     ptrno;
 +                      char            storename[64];
 +
 +#ifdef SQUEUE_STAT
 +                      elog(DEBUG1, "Start buffering %s node %d, %d tuples in queue, %ld writes and %ld reads so far",
 +                               squeue->sq_key, cstate->cs_node, cstate->cs_ntuples, cstate->stat_writes, cstate->stat_reads);
 +#endif
 +                      *tuplestore = tuplestore_begin_datarow(false, work_mem, tmpcxt);
 +                      /* We need it to be able to remember/restore the read position */
 +                      snprintf(storename, 64, "%s node %d", squeue->sq_key, cstate->cs_node);
 +                      tuplestore_collect_stat(*tuplestore, storename);
 +                      /*
 +                       * Allocate a second read pointer to read from the store. We know
 +                       * it must have index 1, so needn't store that.
 +                       */
 +                      ptrno = tuplestore_alloc_read_pointer(*tuplestore, 0);
 +                      Assert(ptrno == 1);
 +              }
 +
 +#ifdef SQUEUE_STAT
 +              cstate->stat_buff_writes++;
 +#endif
 +              /* Append the slot to the store... */
 +              tuplestore_puttupleslot(*tuplestore, slot);
 +
 +              /* ... and exit */
 +              return;
 +      }
 +      else
 +      {
 +              /* do not supply data to closed consumer */
 +              if (cstate->cs_status == CONSUMER_ACTIVE)
 +              {
 +                      elog(DEBUG3, "SQueue %s, consumer is active, writing data",
 +                                      squeue->sq_key);
 +                      /* write out the data: length word first, then the message */
 +                      QUEUE_WRITE(cstate, sizeof(int), (char *) &datarow->msglen);
 +                      QUEUE_WRITE(cstate, datarow->msglen, datarow->msg);
 +                      /* Increment tuple counter. If it was 0 consumer may be waiting for
 +                       * data so try to wake it up */
 +                      if ((cstate->cs_ntuples)++ == 0)
 +                              SetLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
 +              }
 +              else
 +                      elog(DEBUG2, "SQueue %s, consumer is not active, no need to supply data",
 +                                      squeue->sq_key);
 +
 +              /* clean up */
 +              if (free_datarow)
 +                      pfree(datarow);
 +      }
 +      LWLockRelease(clwlock);
 +}
 +
 +
 +/*
 + * SharedQueueRead
 + *    Read one data row from the specified queue into the provided tupleslot.
 + * Returns true if EOF is reached on the specified consumer queue.
 + * If the queue is empty, behavior is controlled by the canwait parameter.
 + * If canwait is true it is waiting while row is available or EOF or error is
 + * reported, if it is false, the slot is emptied and false is returned.
 + */
 +bool
 +SharedQueueRead(SharedQueue squeue, int consumerIdx,
 +                                                      TupleTableSlot *slot, bool canwait)
 +{
 +      ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);
 +      SQueueSync *sqsync = squeue->sq_sync;
 +      RemoteDataRow datarow;
 +      int             datalen;
 +
 +      Assert(cstate->cs_qlength > 0);
 +
 +      LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE);
 +
 +      Assert(cstate->cs_status != CONSUMER_DONE);
 +      while (cstate->cs_ntuples <= 0)
 +      {
 +              elog(DEBUG3, "SQueue %s, consumer node %d, pid %d, status %d - "
 +                              "no tuples in the queue", squeue->sq_key,
 +                              cstate->cs_node, cstate->cs_pid, cstate->cs_status);
 +
 +              if (cstate->cs_status == CONSUMER_EOF)
 +              {
 +                      elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, status %d - "
 +                                      "EOF marked. Informing produer by setting CONSUMER_DONE",
 +                                      squeue->sq_key,
 +                                      cstate->cs_node, cstate->cs_pid, cstate->cs_status);
 +
 +                      /* Inform producer the consumer have done the job */
 +                      cstate->cs_status = CONSUMER_DONE;
 +                      /* no need to receive notifications */
 +                      DisownLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
 +                      /* producer done the job and no more rows expected, clean up */
 +                      LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
 +                      ExecClearTuple(slot);
 +                      /*
 +                       * notify the producer, it may be waiting while consumers
 +                       * are finishing
 +                       */
 +                      SetLatch(&sqsync->sqs_producer_latch);
 +                      return true;
 +              }
 +              else if (cstate->cs_status == CONSUMER_ERROR)
 +              {
 +                      elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, status %d - "
 +                                      "CONSUMER_ERROR set",
 +                                      squeue->sq_key,
 +                                      cstate->cs_node, cstate->cs_pid, cstate->cs_status);
 +                      /*
 +                       * There was a producer error while waiting.
 +                       * Release all the locks and report problem to the caller.
 +                       */
 +                      LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
 +                      /*
 +                       * Reporting error will cause transaction rollback and clean up of
 +                       * all portals. We can not mark the portal so it does not access
 +                       * the queue so we should hold it for now. We should prevent queue
 +                       * unbound in between.
 +                       */
 +                      ereport(ERROR,
 +                                      (errcode(ERRCODE_PRODUCER_ERROR),
 +                                       errmsg("Failed to read from SQueue %s, "
 +                                               "consumer (node %d, pid %d, status %d) - "
 +                                               "CONSUMER_ERROR set",
 +                                               squeue->sq_key,
 +                                               cstate->cs_node, cstate->cs_pid, cstate->cs_status)));
 +              }
 +              if (canwait)
 +              {
 +                      /* Prepare waiting on empty buffer */
 +                      ResetLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
 +                      LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
 +
 +                      elog(DEBUG3, "SQueue %s, consumer (node %d, pid %d, status %d) - "
 +                                      "no queued tuples to read, waiting "
 +                                      "for producer to produce more data",
 +                                      squeue->sq_key,
 +                                      cstate->cs_node, cstate->cs_pid, cstate->cs_status);
 +
 +                      /* Wait for notification about available info */
-                                                               10000L);
++                      WaitLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch,
++                                      WL_LATCH_SET | WL_POSTMASTER_DEATH, -1,
++                                      WAIT_EVENT_MQ_INTERNAL);
 +                      /* got the notification, restore lock and try again */
 +                      LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE);
 +              }
 +              else
 +              {
 +                      LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
 +
 +                      elog(DEBUG3, "SQueue %s, consumer (node %d, pid %d, status %d) - "
 +                                      "no queued tuples to read, caller can't wait ",
 +                                      squeue->sq_key,
 +                                      cstate->cs_node, cstate->cs_pid, cstate->cs_status);
 +                      ExecClearTuple(slot);
 +                      return false;
 +              }
 +      }
 +
 +      elog(DEBUG3, "SQueue %s, consumer (node %d, pid %d, status %d) - "
 +                      "%d queued tuples to read",
 +                      squeue->sq_key,
 +                      cstate->cs_node, cstate->cs_pid, cstate->cs_status,
 +                      cstate->cs_ntuples);
 +
 +      /* have at least one row, read it in and store to slot */
 +      QUEUE_READ(cstate, sizeof(int), (char *) (&datalen));
 +      datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + datalen);
 +      datarow->msgnode = InvalidOid;
 +      datarow->msglen = datalen;
 +      if (datalen > cstate->cs_qlength - sizeof(int))
 +              sq_pull_long_tuple(cstate, datarow,
 +                                                 &sqsync->sqs_consumer_sync[consumerIdx]);
 +      else
 +              QUEUE_READ(cstate, datalen, datarow->msg);
 +      ExecStoreDataRowTuple(datarow, slot, true);
 +      (cstate->cs_ntuples)--;
 +#ifdef SQUEUE_STAT
 +      cstate->stat_reads++;
 +#endif
 +      /* sanity check */
 +      Assert((cstate->cs_ntuples == 0) == (cstate->cs_qreadpos == cstate->cs_qwritepos));
 +      LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
 +      return false;
 +}
 +
 +
 +/*
 + * SharedQueueReset
 + *    Mark specified consumer as closed discarding all input which may
 + * already be in the queue.
 + * If consumerIdx is -1 the producer is cleaned up. Producer needs to wait
 + * for consumers before releasing the queue, so if there are yet active
 + * consumers, they are notified about the problem and they should disconnect
 + * from the queue as soon as possible.
 + */
 +void
 +SharedQueueReset(SharedQueue squeue, int consumerIdx)
 +{
 +      SQueueSync *sqsync = squeue->sq_sync;
 +
 +      /*
 +       * We may have already cleaned up, but then an abort signalled us to
 +       * clean up. Avoid segmentation fault on abort.
 +       */
 +      if (!sqsync)
 +              return;
 +
 +      if (consumerIdx == -1)
 +      {
 +              int i;
 +
 +              elog(DEBUG1, "SQueue %s, requested to reset producer node %d, pid %d - "
 +                              "Now also resetting all consumers",
 +                              squeue->sq_key, squeue->sq_nodeid, squeue->sq_pid);
 +
 +              /* check queue states */
 +              for (i = 0; i < squeue->sq_nconsumers; i++)
 +              {
 +                      ConsState *cstate = &squeue->sq_consumers[i];
 +                      LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
 +
 +                      /*
 +                       * If producer is being reset before it reached the end of the
 +                       * result set, that means the consumer probably would not get
 +                       * all the rows and it should report error if the consumer's
 +                       * parent ever tries to read. No need to raise error if consumer
 +                       * is just closed. If consumer is done already we do not need to
 +                       * change the status.
 +                       */
 +                      if (cstate->cs_status != CONSUMER_EOF &&
 +                                      cstate->cs_status != CONSUMER_DONE)
 +                      {
 +                              elog(DEBUG1, "SQueue %s, reset consumer at %d, "
 +                                              "consumer node %d, pid %d, status %d - marking CONSUMER_ERROR",
 +                                              squeue->sq_key, i, cstate->cs_node, cstate->cs_pid,
 +                                              cstate->cs_status);
 +
 +                              cstate->cs_status = CONSUMER_ERROR;
 +                              /* discard tuples which may already be in the queue */
 +                              cstate->cs_ntuples = 0;
 +                              /* keep consistent with cs_ntuples */
 +                              cstate->cs_qreadpos = cstate->cs_qwritepos = 0;
 +
 +                              /* wake up consumer if it is sleeping */
 +                              SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
 +                      }
 +                      LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
 +              }
 +      }
 +      else
 +      {
 +              ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);
 +
 +              elog(DEBUG1, "SQueue %s, requested to reset consumer at %d, "
 +                              "consumer node %d, pid %d, status %d",
 +                              squeue->sq_key, consumerIdx, cstate->cs_node, cstate->cs_pid,
 +                              cstate->cs_status);
 +
 +              LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock,
 +                                        LW_EXCLUSIVE);
 +
 +              if (cstate->cs_status != CONSUMER_DONE)
 +              {
 +                      elog(DEBUG1, "SQueue %s, consumer at %d, "
 +                              "consumer node %d, pid %d, status %d - marking CONSUMER_DONE",
 +                              squeue->sq_key, consumerIdx, cstate->cs_node, cstate->cs_pid,
 +                              cstate->cs_status);
 +
 +                      /* Inform producer the consumer has done the job */
 +                      cstate->cs_status = CONSUMER_DONE;
 +                      /*
 +                       * No longer need to receive notifications. If consumer has not
 +                       * connected the latch is not owned, so only disown when a
 +                       * consumer pid was recorded.
 +                       */
 +                      if (cstate->cs_pid > 0)
 +                              DisownLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
 +                      /*
 +                       * notify the producer, it may be waiting while consumers
 +                       * are finishing
 +                       */
 +                      SetLatch(&sqsync->sqs_producer_latch);
 +              }
 +
 +              LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
 +      }
 +}
 +
 +
 +/*
 + * Assume that not yet connected consumers won't connect and reset them.
 + * That should allow to Finish/UnBind the queue gracefully and prevent
 + * producer hanging.
 + */
 +void
 +SharedQueueResetNotConnected(SharedQueue squeue)
 +{
 +      SQueueSync *sqsync = squeue->sq_sync;
 +      int result = 0;
 +      int i;
 +
 +      elog(DEBUG1, "SQueue %s, resetting all unconnected consumers",
 +                      squeue->sq_key);
 +
 +      /* check queue states */
 +      for (i = 0; i < squeue->sq_nconsumers; i++)
 +      {
 +              ConsState *cstate = &squeue->sq_consumers[i];
 +              LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
 +
 +              if (cstate->cs_pid == 0 &&
 +                              cstate->cs_status != CONSUMER_EOF &&
 +                              cstate->cs_status != CONSUMER_DONE)
 +              {
 +                      result++;
 +                      elog(DEBUG1, "SQueue %s, consumer at %d, consumer node %d, pid %d, "
 +                                      "status %d is cancelled - marking CONSUMER_ERROR", squeue->sq_key, i,
 +                                      cstate->cs_node, cstate->cs_pid, cstate->cs_status);
 +                      cstate->cs_status = CONSUMER_ERROR;
 +                      /* discard tuples which may already be in the queue */
 +                      cstate->cs_ntuples = 0;
 +                      /* keep consistent with cs_ntuples*/
 +                      cstate->cs_qreadpos = cstate->cs_qwritepos = 0;
 +
 +                      /* wake up consumer if it is sleeping */
 +                      SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
 +              }
 +              LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
 +      }
 +}
 +
 +
 +/*
 + * SharedQueueCanPause
 + *    Determine if producer can safely pause work.
 + * The producer can pause if all consumers have enough data to read while
 + * producer is sleeping.
 + * Obvious case when the producer can not pause is if at least one queue is
 + * empty.
 + */
 +bool
 +SharedQueueCanPause(SharedQueue squeue)
 +{
 +      SQueueSync *sqsync = squeue->sq_sync;
 +      bool            result = true;
 +      int             usedspace;
 +      int                     ncons;
 +      int             i;
 +
 +      usedspace = 0;
 +      ncons = 0;
 +      /* stop scanning as soon as an empty active queue makes pausing unsafe */
 +      for (i = 0; result && (i < squeue->sq_nconsumers); i++)
 +      {
 +              ConsState *cstate = &(squeue->sq_consumers[i]);
 +              LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_SHARED);
 +              /*
 +               * Count only consumers that may be blocked.
 +               * If producer has finished scanning and pushing local buffers some
 +               * consumers may be finished already.
 +               */
 +              if (cstate->cs_status == CONSUMER_ACTIVE)
 +              {
 +                      /* can not pause if some queue is empty */
 +                      result = (cstate->cs_ntuples > 0);
 +                      /* ring-buffer occupancy; the write position may have wrapped */
 +                      usedspace += (cstate->cs_qwritepos > cstate->cs_qreadpos ?
 +                                                        cstate->cs_qwritepos - cstate->cs_qreadpos :
 +                                                        cstate->cs_qlength + cstate->cs_qwritepos
 +                                                                                               - cstate->cs_qreadpos);
 +                      ncons++;
 +              }
 +              LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
 +      }
 +
 +      /* no active consumers left - nothing to pause for */
 +      if (!ncons)
 +              return false;
 +
 +      /*
 +       * Pause only if the average consumer queue is more than half full.
 +       * All queues share one length, so compare against consumer 0's.
 +       */
 +      if (result)
 +              result = (usedspace / ncons > squeue->sq_consumers[0].cs_qlength / 2);
 +#ifdef SQUEUE_STAT
 +      if (result)
 +              squeue->stat_paused++;
 +#endif
 +      return result;
 +}
 +
 +
 +int
 +SharedQueueFinish(SharedQueue squeue, TupleDesc tupDesc,
 +                                                        Tuplestorestate **tuplestore)
 +{
 +      SQueueSync *sqsync = squeue->sq_sync;
 +      TupleTableSlot *tmpslot = NULL;
 +      int                     i;
 +      int                     nstores = 0;
 +
 +      elog(DEBUG1, "SQueue %s, finishing the SQueue - producer node %d, "
 +                      "pid %d, nconsumers %d", squeue->sq_key, squeue->sq_nodeid,
 +                      squeue->sq_pid, squeue->sq_nconsumers);
 +
 +      for (i = 0; i < squeue->sq_nconsumers; i++)
 +      {
 +              ConsState *cstate = &squeue->sq_consumers[i];
 +              LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
 +#ifdef SQUEUE_STAT
 +              if (!squeue->stat_finish)
 +                      elog(DEBUG1, "Finishing %s node %d, %ld writes and %ld reads so far, %ld buffer writes, %ld buffer reads, %ld tuples returned to buffer",
 +                               squeue->sq_key, cstate->cs_node, cstate->stat_writes, cstate->stat_reads, cstate->stat_buff_writes, cstate->stat_buff_reads, cstate->stat_buff_returns);
 +#endif
 +              elog(DEBUG1, "SQueue %s finishing, consumer at %d, consumer node %d, pid %d, "
 +                              "status %d", squeue->sq_key, i,
 +                              cstate->cs_node, cstate->cs_pid, cstate->cs_status);
 +              /*
 +               * If the tuplestore has data and the consumer queue has space for
 +               * some rows, try to push rows to the queue. We do not want to do that often
 +               * to avoid overhead of temp tuple slot allocation.
 +               */
 +              if (tuplestore[i])
 +              {
 +                      /* If the consumer is not reading just destroy the tuplestore */
 +                      if (cstate->cs_status != CONSUMER_ACTIVE)
 +                      {
 +                              tuplestore_end(tuplestore[i]);
 +                              tuplestore[i] = NULL;
 +                      }
 +                      else
 +                      {
 +                              nstores++;
 +                              /*
 +                               * Attempting to dump tuples from the store requires tuple slot
 +                               * allocation, which is not a cheap operation, so proceed only if
 +                               * the target queue has enough space.
 +                               */
 +                              if (QUEUE_FREE_SPACE(cstate) > cstate->cs_qlength / 2)
 +                              {
 +                                      if (tmpslot == NULL)
 +                                              tmpslot = MakeSingleTupleTableSlot(tupDesc);
 +                                      if (SharedQueueDump(squeue, i, tmpslot, tuplestore[i]))
 +                                      {
 +                                              tuplestore_end(tuplestore[i]);
 +                                              tuplestore[i] = NULL;
 +                                              cstate->cs_status = CONSUMER_EOF;
 +                                              nstores--;
 +                                      }
 +                                      /* Consumer may be sleeping, wake it up */
 +                                      SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
 +                              }
 +                      }
 +              }
 +              else
 +              {
 +                      /* set EOF if it is not yet set */
 +                      if (cstate->cs_status == CONSUMER_ACTIVE)
 +                      {
 +                              cstate->cs_status = CONSUMER_EOF;
 +                              SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
 +                      }
 +              }
 +              LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
 +      }
 +      if (tmpslot)
 +              ExecDropSingleTupleTableSlot(tmpslot);
 +
 +#ifdef SQUEUE_STAT
 +      squeue->stat_finish = true;
 +#endif
 +
 +      return nstores;
 +}
 +
 +
 +/*
 + * SharedQueueUnBind
 + *    Cancel binding of current process to the shared queue. If the process
 + * was a producer it should pass in the array of tuplestores where tuples were
 + * queued when it was unsafe to block. If any of the tuplestores holds data
 + * rows they are written to the queue. The length of the array of the
 + * tuplestores should be the same as the count of consumers. It is OK if some
 + * entries are NULL. When a consumer unbinds from the shared queue it should
 + * set the tuplestore parameter to NULL.
 + */
 +void
 +SharedQueueUnBind(SharedQueue squeue, bool failed)
 +{
 +      SQueueSync *sqsync = squeue->sq_sync;
 +      int                     wait_result = 0;
 +      int         i                = 0;
 +      int         consumer_running = 0;
 +
 +      elog(DEBUG1, "SQueue %s, unbinding the SQueue (failed: %c) - producer node %d, "
 +                      "pid %d, nconsumers %d", squeue->sq_key, failed ? 'T' : 'F',
 +                      squeue->sq_nodeid, squeue->sq_pid, squeue->sq_nconsumers);
 +
 +CHECK:
 +
 +      /* loop while there are active consumers */
 +      for (;;)
 +      {
 +              int i;
 +              int c_count = 0;
 +              int unbound_count = 0;
 +
 +              /* check queue states */
 +              for (i = 0; i < squeue->sq_nconsumers; i++)
 +              {
 +                      ConsState *cstate = &squeue->sq_consumers[i];
 +                      LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
 +
 +                      elog(DEBUG1, "SQueue %s unbinding, check consumer at %d, consumer node %d, pid %d, "
 +                                      "status %d", squeue->sq_key, i,
 +                                      cstate->cs_node, cstate->cs_pid, cstate->cs_status);
 +
 +                      /* is consumer working yet ? */
 +                      if (cstate->cs_status == CONSUMER_ACTIVE && failed)
 +                      {
 +                              elog(DEBUG1, "SQueue %s, consumer status CONSUMER_ACTIVE, but "
 +                                              "the operation has failed - marking CONSUMER_ERROR",
 +                                              squeue->sq_key);
 +
 +                              cstate->cs_status = CONSUMER_ERROR;
 +                      }
 +
 +                      if (cstate->cs_status != CONSUMER_DONE)
 +                      {
 +                              elog(DEBUG1, "SQueue %s, consumer not yet done, wake it up and "
 +                                              "wait for it to finish reading", squeue->sq_key);
 +                              c_count++;
 +                              /* Wake up consumer if it is sleeping */
 +                              SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
 +                              /* producer will continue waiting */
 +                              ResetLatch(&sqsync->sqs_producer_latch);
 +
 +                              if (cstate->cs_pid == 0)
 +                                      unbound_count++;
 +                      }
 +
 +                      LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
 +              }
 +              if (c_count == 0)
 +                      break;
 +              elog(DEBUG1, "SQueue %s, wait while %d consumers finish, %d consumers"
 +                              "not yet bound", squeue->sq_key, c_count, unbound_count);
 +              /* wait for a notification */
 +              wait_result = WaitLatch(&sqsync->sqs_producer_latch,
 +                                                              WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_TIMEOUT,
-                       WaitLatch(&sync->cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1);
++                                                              10000L, WAIT_EVENT_MQ_INTERNAL);
 +              if (wait_result & WL_TIMEOUT)
 +              {
 +                      elog(WARNING, "SQueue %s, timeout while waiting for Consumers "
 +                                      "finishing", squeue->sq_key);
 +                      break;
 +              }
 +              /* got notification, continue loop */
 +      }
 +#ifdef SQUEUE_STAT
 +      elog(DEBUG1, "Producer %s is done, there were %ld pauses", squeue->sq_key, squeue->stat_paused);
 +#endif
 +      elog(DEBUG1, "SQueue %s, producer node %d, pid %d - unbound successfully",
 +                      squeue->sq_key, squeue->sq_nodeid, squeue->sq_pid);
 +
 +      LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
 +
 +      /*
 +       * In a rare situation, just after consumers bind to the shared queue, the producer may time out and remove the shared queue.
 +       * This will cause a SEGV in the consumer. So here recheck whether there are some consumers bound to the queue; if so, we need to wait for them to
 +       * finish.
 +       */
 +      consumer_running = 0;
 +      for (i = 0; i < squeue->sq_nconsumers; i++)
 +      {
 +              ConsState *cstate = &squeue->sq_consumers[i];
 +
 +              LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
 +
 +              /* found a consumer running */
 +              if (CONSUMER_ACTIVE == cstate->cs_status && cstate->cs_pid != 0)
 +              {
 +                      elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, status %d, "
 +                                      "started running after we finished unbind", squeue->sq_key,
 +                                      cstate->cs_node, cstate->cs_pid, cstate->cs_status);
 +                      consumer_running++;
 +              }
 +
 +              LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
 +      }
 +
 +      if (consumer_running)
 +      {
 +              elog(DEBUG1, "SQueue %s have %d consumers started running after we "
 +                              "unbound, recheck now", squeue->sq_key, consumer_running);
 +              LWLockRelease(SQueuesLock);
 +              goto CHECK;
 +      }
 +
 +      /* All is done, clean up */
 +      DisownLatch(&sqsync->sqs_producer_latch);
 +
 +      if (--squeue->sq_refcnt == 0)
 +      {
 +              /* Now it is OK to remove hash table entry */
 +              squeue->sq_sync = NULL;
 +              sqsync->queue = NULL;
 +              if (hash_search(SharedQueues, squeue->sq_key, HASH_REMOVE, NULL) != squeue)
 +                      elog(PANIC, "Shared queue data corruption");
 +      }
 +
 +      LWLockRelease(SQueuesLock);
 +}
 +
 +
 +/*
 + * If the queue with the specified name still exists, mark the respective
 + * consumer as "Done". Due to executor optimization a consumer may never
 + * connect to the queue, and this allows the producer to finish up if it is
 + * known the consumer will never connect.
 + */
 +void
 +SharedQueueRelease(const char *sqname)
 +{
 +      bool                                    found;
 +      volatile SharedQueue    sq;
 +
 +      LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
 +
 +      sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_FIND, &found);
 +      if (found)
 +      {
 +              volatile SQueueSync    *sqsync = sq->sq_sync;
 +              int                                             i;
 +
 +              Assert(sqsync && sqsync->queue == sq);
 +
 +              elog(DEBUG1, "SQueue %s producer node %d, pid %d  - requested to release",
 +                              sqname, sq->sq_nodeid, sq->sq_pid);
 +
 +              /*
 +               * If the SharedQ is not bound, we can't just remove it because
 +               * somebody might have just created a fresh entry and is going to bind
 +               * to it soon. We assume that the future producer will eventually
 +               * release the SharedQ
 +               */
 +              if (sq->sq_nodeid == -1)
 +              {
 +                      elog(DEBUG1, "SQueue %s, producer not bound ", sqname);
 +                      goto done;
 +              }
 +
 +              /*
 +               * Do not bother releasing producer, all necessary work will be
 +               * done upon UnBind.
 +               */
 +              if (sq->sq_nodeid != PGXC_PARENT_NODE_ID)
 +              {
 +                      elog(DEBUG1, "SQueue %s, we are consumer from node %d", sqname,
 +                                      PGXC_PARENT_NODE_ID);
 +                      /* find specified node in the consumer lists */
 +                      for (i = 0; i < sq->sq_nconsumers; i++)
 +                      {
 +                              ConsState *cstate = &(sq->sq_consumers[i]);
 +                              if (cstate->cs_node == PGXC_PARENT_NODE_ID)
 +                              {
 +                                      LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
 +                                                                LW_EXCLUSIVE);
 +                                      elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, "
 +                                                      "status %d",  sq->sq_key, cstate->cs_node,
 +                                                      cstate->cs_pid, cstate->cs_status);
 +
 +                                      /*
 +                                       * If the consumer pid is not set, we are looking at a race
 +                                       * condition where the old producer (which supplied the
 +                                       * tuples to this remote datanode) may have finished and
 +                                       * marked all consumers as CONSUMER_EOF, the consumers
 +                                       * themselves consumed all the tuples and marked
 +                                       * themselves as CONSUMER_DONE. The old producer in that
 +                                       * case may have actually removed the SharedQ from shared
 +                                       * memory. But if a new execution for this same portal
 +                                       * comes before the consumer sends a "Close Portal" message
 +                                       * (which subsequently calls this function), we may end up
 +                                       * corrupting state for the upcoming consumer for this new
 +                                       * execution of the portal.
 +                                       *
 +                                       * It seems best to just ignore the release call in such
 +                                       * cases.
 +                                       */
 +                                      if (cstate->cs_pid == 0)
 +                                      {
 +                                              elog(DEBUG1, "SQueue %s, consumer node %d, already released",
 +                                                      sq->sq_key, cstate->cs_node);
 +                                      }
 +                                      else if (cstate->cs_status != CONSUMER_DONE)
 +                                      {
 +                                              /* Inform producer the consumer have done the job */
 +                                              cstate->cs_status = CONSUMER_DONE;
 +                                              /* no need to receive notifications */
 +                                              if (cstate->cs_pid > 0)
 +                                              {
 +                                                      DisownLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
 +                                                      cstate->cs_pid = 0;
 +                                              }
 +                                              /*
 +                                               * notify the producer, it may be waiting while
 +                                               * consumers are finishing
 +                                               */
 +                                              SetLatch(&sqsync->sqs_producer_latch);
 +                                              elog(DEBUG1, "SQueue %s, release consumer at %d, node "
 +                                                              "%d, pid %d, status %d ", sqname, i,
 +                                                              cstate->cs_node, cstate->cs_pid,
 +                                                              cstate->cs_status);
 +                                      }
 +                                      LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
 +                                      /* exit */
 +                                      goto done;
 +                              }
 +                      }
 +
 +                      elog(DEBUG1, "SQueue %s, consumer from node %d never bound",
 +                                      sqname, PGXC_PARENT_NODE_ID);
 +                      /*
 +                       * The consumer was never bound. Find empty consumer slot and
 +                       * register node here to let producer know that the node will never
 +                       * be consuming.
 +                       */
 +                      for (i = 0; i < sq->sq_nconsumers; i++)
 +                      {
 +                              ConsState *cstate = &(sq->sq_consumers[i]);
 +                              if (cstate->cs_node == -1)
 +                              {
 +                                      LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
 +                                                                LW_EXCLUSIVE);
 +                                      /* Inform producer the consumer have done the job */
 +                                      cstate->cs_status = CONSUMER_DONE;
 +                                      SetLatch(&sqsync->sqs_producer_latch);
 +                                      elog(DEBUG1, "SQueue %s, consumer at %d marking as "
 +                                                      "CONSUMER_DONE", sqname, i);
 +                                      LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
 +                              }
 +                      }
 +              }
 +      }
 +done:
 +      /*
 +       * If we are the last holder of the SQueue, remove it from the hash table
 +       * to avoid any leak
 +       */
 +      if (sq && --sq->sq_refcnt == 0)
 +      {
 +              /* Now it is OK to remove hash table entry */
 +              sq->sq_sync->queue = NULL;
 +              sq->sq_sync = NULL;
 +              if (hash_search(SharedQueues, sq->sq_key, HASH_REMOVE, NULL) != sq)
 +                      elog(PANIC, "Shared queue data corruption");
 +      }
 +      LWLockRelease(SQueuesLock);
 +}
 +
 +
 +/*
 + * Called when the backend is ending.
 + */
 +void
 +SharedQueuesCleanup(int code, Datum arg)
 +{
 +      /* Need to be able to look into catalogs */
 +      CurrentResourceOwner = ResourceOwnerCreate(NULL, "SharedQueuesCleanup");
 +
 +      /*
 +       * Release all registered prepared statements.
 +       * If a shared queue name is associated with the statement this queue will
 +       * be released.
 +       */
 +      DropAllPreparedStatements();
 +
 +      /* Release everything */
 +      ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, true, true);
 +      ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_LOCKS, true, true);
 +      ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_AFTER_LOCKS, true, true);
 +      CurrentResourceOwner = NULL;
 +}
 +
 +
 +/*
 + * sq_push_long_tuple
 + *    Routine to push through the consumer state a tuple longer than the consumer
 + *    queue. Long tuple is written by a producer partially, and only when the
 + *    consumer queue is empty.
 + *    The consumer can determine that the tuple being read is long if the length
 + *    of the tuple which is read before data is exceeding queue length.
 + *      The consumer switches to long tuple mode and reads in the portion of
 + *      data which is already in the queue. After reading in each portion of data
 + *    consumer sets cs_ntuples to LONG_TUPLE to indicate it is in long tuple
 + *    mode, and writes out number of already read bytes to the beginning of the
 + *    queue.
 + *    While Consumer is reading in tuple data Producer may work on other task:
 + *    execute query and send tuples to other Customers. If Producer sees the
 + *    LONG_TUPLE indicator it may write out next portion. The tuple remains
 + *    current in the tuplestore, and Producer just needs to read offset from
 + *    the buffer to know what part of data to write next.
 + *    After tuple is completely written the Producer is advancing to next tuple
 + *    and continue operation in normal mode.
 + */
 +static bool
 +sq_push_long_tuple(ConsState *cstate, RemoteDataRow datarow)
 +{
 +      if (cstate->cs_ntuples == 0)
 +      {
 +              /* the tuple is too big to fit the queue, start pushing it through */
 +              int len;
 +              /*
 +               * Output actual message size, to prepare consumer:
 +               * allocate memory and set up transmission.
 +               */
 +              QUEUE_WRITE(cstate, sizeof(int), (char *) &datarow->msglen);
 +              /* Output as much as possible */
 +              len = cstate->cs_qlength - sizeof(int);
 +              Assert(datarow->msglen > len);
 +              QUEUE_WRITE(cstate, len, datarow->msg);
 +              cstate->cs_ntuples = 1;
 +              return false;
 +      }
 +      else
 +      {
 +              int offset;
 +              int     len;
 +
 +              /* Continue pushing through long tuple */
 +              Assert(cstate->cs_ntuples == LONG_TUPLE);
 +              /*
 +               * Consumer outputs number of bytes already read at the beginning of
 +               * the queue.
 +               */
 +              memcpy(&offset, cstate->cs_qstart, sizeof(int));
 +
 +              Assert(offset > 0 && offset < datarow->msglen);
 +
 +              /* remaining data */
 +              len = datarow->msglen - offset;
 +              /*
 +               * We are sending the remaining length just for a sanity check at the consumer
 +               * side
 +               */
 +              QUEUE_WRITE(cstate, sizeof(int), (char *) &len);
 +              if (len > cstate->cs_qlength - sizeof(int))
 +              {
 +                      /* does not fit yet */
 +                      len = cstate->cs_qlength - sizeof(int);
 +                      QUEUE_WRITE(cstate, len, datarow->msg + offset);
 +                      cstate->cs_ntuples = 1;
 +                      return false;
 +              }
 +              else
 +              {
 +                      /* now we are done */
 +                      QUEUE_WRITE(cstate, len, datarow->msg + offset);
 +                      cstate->cs_ntuples = 1;
 +                      return true;
 +              }
 +      }
 +}
 +
 +
 +/*
 + * sq_pull_long_tuple
 + *    Read in from the queue data of a long tuple which does not fit the queue.
 + *    See sq_push_long_tuple for more details
 + */
 +static void
 +sq_pull_long_tuple(ConsState *cstate, RemoteDataRow datarow,
 +                                                         ConsumerSync *sync)
 +{
 +      int offset = 0;
 +      int len = datarow->msglen;
 +
 +      for (;;)
 +      {
 +              /* determine how many bytes to read */
 +              if (len > cstate->cs_qlength - sizeof(int))
 +                      len = cstate->cs_qlength - sizeof(int);
 +
 +              /* read data */
 +              QUEUE_READ(cstate, len, datarow->msg + offset);
 +
 +              /* remember how many we read already */
 +              offset += len;
 +
 +              /* check if we are done */
 +              if (offset == datarow->msglen)
 +                      return;
 +
 +              /* need more, set up queue to accept data from the producer */
 +              Assert(cstate->cs_ntuples == 1); /* allow exactly one incomplete tuple */
 +              cstate->cs_ntuples = LONG_TUPLE; /* long tuple mode marker */
 +              /* Inform producer how many bytes we have already */
 +              memcpy(cstate->cs_qstart, &offset, sizeof(int));
 +              /* Release locks and wait until producer supply more data */
 +              while (cstate->cs_ntuples == LONG_TUPLE)
 +              {
 +                      /* prepare wait */
 +                      ResetLatch(&sync->cs_latch);
 +                      LWLockRelease(sync->cs_lwlock);
 +                      /* Wait for notification about available info */
++                      WaitLatch(&sync->cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1,
++                                      WAIT_EVENT_MQ_INTERNAL);
 +                      /* got the notification, restore lock and try again */
 +                      LWLockAcquire(sync->cs_lwlock, LW_EXCLUSIVE);
 +              }
 +              /* Read length of remaining data */
 +              QUEUE_READ(cstate, sizeof(int), (char *) &len);
 +
 +              /* Make sure we are doing the same tuple */
 +              Assert(offset + len == datarow->msglen);
 +
 +              /* next iteration */
 +      }
 +}
index 1b5328e479771d7b9da390b23c1e30061fb03876,89dd3b321bc91bbc176e2825e6af49556c94637e..45ae93b4ef27fb4dfc93dcd9280bd54006177e9a
@@@ -50,8 -50,7 +50,8 @@@
   * there is a window (caused by pgstat delay) on which a worker may choose a
   * table that was already vacuumed; this is a bug in the current design.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@@ -2175,16 -2211,96 +2216,106 @@@ do_autovacuum(void
        heap_endscan(relScan);
        heap_close(classRel, AccessShareLock);
  
 +#ifdef XCP
 +      /*
 +       * Coordinator needs to access Datanodes to process distributed table.
 +       */
 +      if (IS_PGXC_COORDINATOR)
 +      {
 +              InitMultinodeExecutor(false);
 +      }
 +#endif
 +
+       /*
+        * Recheck orphan temporary tables, and if they still seem orphaned, drop
+        * them.  We'll eat a transaction per dropped table, which might seem
+        * excessive, but we should only need to do anything as a result of a
+        * previous backend crash, so this should not happen often enough to
+        * justify "optimizing".  Using separate transactions ensures that we
+        * don't bloat the lock table if there are many temp tables to be dropped,
+        * and it ensures that we don't lose work if a deletion attempt fails.
+        */
+       foreach(cell, orphan_oids)
+       {
+               Oid                     relid = lfirst_oid(cell);
+               Form_pg_class classForm;
+               int                     backendID;
+               ObjectAddress object;
+               /*
+                * Check for user-requested abort.
+                */
+               CHECK_FOR_INTERRUPTS();
+               /*
+                * Try to lock the table.  If we can't get the lock immediately,
+                * somebody else is using (or dropping) the table, so it's not our
+                * concern anymore.  Having the lock prevents race conditions below.
+                */
+               if (!ConditionalLockRelationOid(relid, AccessExclusiveLock))
+                       continue;
+               /*
+                * Re-fetch the pg_class tuple and re-check whether it still seems to
+                * be an orphaned temp table.  If it's not there or no longer the same
+                * relation, ignore it.
+                */
+               tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
+               if (!HeapTupleIsValid(tuple))
+               {
+                       /* be sure to drop useless lock so we don't bloat lock table */
+                       UnlockRelationOid(relid, AccessExclusiveLock);
+                       continue;
+               }
+               classForm = (Form_pg_class) GETSTRUCT(tuple);
+               /*
+                * Make all the same tests made in the loop above.  In event of OID
+                * counter wraparound, the pg_class entry we have now might be
+                * completely unrelated to the one we saw before.
+                */
+               if (!((classForm->relkind == RELKIND_RELATION ||
+                          classForm->relkind == RELKIND_MATVIEW) &&
+                         classForm->relpersistence == RELPERSISTENCE_TEMP))
+               {
+                       UnlockRelationOid(relid, AccessExclusiveLock);
+                       continue;
+               }
+               backendID = GetTempNamespaceBackendId(classForm->relnamespace);
+               if (!(backendID != InvalidBackendId &&
+                         (backendID == MyBackendId ||
+                          BackendIdGetProc(backendID) == NULL)))
+               {
+                       UnlockRelationOid(relid, AccessExclusiveLock);
+                       continue;
+               }
+               /* OK, let's delete it */
+               ereport(LOG,
+                               (errmsg("autovacuum: dropping orphan temp table \"%s.%s.%s\"",
+                                               get_database_name(MyDatabaseId),
+                                               get_namespace_name(classForm->relnamespace),
+                                               NameStr(classForm->relname))));
+               object.classId = RelationRelationId;
+               object.objectId = relid;
+               object.objectSubId = 0;
+               performDeletion(&object, DROP_CASCADE,
+                                               PERFORM_DELETION_INTERNAL |
+                                               PERFORM_DELETION_QUIETLY |
+                                               PERFORM_DELETION_SKIP_EXTENSIONS);
+               /*
+                * To commit the deletion, end current transaction and start a new
+                * one.  Note this also releases the lock we took.
+                */
+               CommitTransactionCommand();
+               StartTransactionCommand();
+               /* StartTransactionCommand changed current memory context */
+               MemoryContextSwitchTo(AutovacMemCxt);
+       }
        /*
         * Create a buffer access strategy object for VACUUM to use.  We want to
         * use the same one across all the vacuum operations we perform, since the
index 0eb039286358863899dae26df670b879ae05d61a,0000000000000000000000000000000000000000..6c6e8ebd9fc176b9a41626710d3ffa034f945059
mode 100644,000000..100644
--- /dev/null
@@@ -1,437 -1,0 +1,439 @@@
-                                          (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L));
 +/*-------------------------------------------------------------------------
 + *
 + * clustermon.c
 + *
 + * Postgres-XL Cluster Monitor
 + *
 + * Portions Copyright (c) 2015, 2ndQuadrant Ltd
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
 + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
 + * Portions Copyright (c) 1994, Regents of the University of California
 + *
 + *
 + * IDENTIFICATION
 + *      src/backend/postmaster/clustermon.c
 + *
 + *-------------------------------------------------------------------------
 + */
 +#include "postgres.h"
 +
 +#include <signal.h>
 +#include <sys/types.h>
 +#include <sys/time.h>
 +#include <unistd.h>
 +
 +#include "access/gtm.h"
 +#include "access/transam.h"
 +#include "access/xact.h"
 +#include "gtm/gtm_c.h"
 +#include "gtm/gtm_gxid.h"
 +#include "libpq/pqsignal.h"
 +#include "miscadmin.h"
 +#include "pgxc/pgxc.h"
 +#include "postmaster/clustermon.h"
 +#include "postmaster/fork_process.h"
 +#include "postmaster/postmaster.h"
 +#include "storage/ipc.h"
 +#include "storage/proc.h"
 +#include "storage/procarray.h"
 +#include "storage/spin.h"
 +#include "tcop/tcopprot.h"
 +#include "utils/memutils.h"
 +#include "utils/ps_status.h"
 +#include "utils/timeout.h"
 +#include "utils/timestamp.h"
++#include "pgstat.h"
 +
 +/* Flags to tell if we are in a clustermon process */
 +static bool am_clustermon = false;
 +
 +/* Flags set by signal handlers */
 +static volatile sig_atomic_t got_SIGHUP = false;
 +static volatile sig_atomic_t got_SIGTERM = false;
 +
 +/* Memory context for long-lived data */
 +static MemoryContext ClusterMonitorMemCxt;
 +static ClusterMonitorCtlData *ClusterMonitorCtl = NULL; 
 +
 +static void cm_sighup_handler(SIGNAL_ARGS);
 +static void cm_sigterm_handler(SIGNAL_ARGS);
 +static void ClusterMonitorSetReportedGlobalXmin(GlobalTransactionId xmin);
 +static void ClusterMonitorSetReportingGlobalXmin(GlobalTransactionId xmin);
 +
 +/* PID of cluster monitoring process */
 +int                   ClusterMonitorPid = 0;
 +
 +#define CLUSTER_MONITOR_NAPTIME       5
 +
 +/*
 + * Main loop for the cluster monitor process.
 + */
 +int
 +ClusterMonitorInit(void)
 +{
 +      sigjmp_buf      local_sigjmp_buf;
 +      GTM_PGXCNodeType nodetype = IS_PGXC_DATANODE ?
 +                                                                      GTM_NODE_DATANODE :
 +                                                                      GTM_NODE_COORDINATOR;
 +      GlobalTransactionId oldestXmin;
 +      GlobalTransactionId newOldestXmin;
 +      GlobalTransactionId lastGlobalXmin;
 +      GlobalTransactionId latestCompletedXid;
 +      int status;
 +
 +      am_clustermon = true;
 +
 +      /* Identify myself via ps */
 +      init_ps_display("cluster monitor process", "", "", "");
 +
 +      ereport(LOG,
 +                      (errmsg("cluster monitor started")));
 +
 +      if (PostAuthDelay)
 +              pg_usleep(PostAuthDelay * 1000000L);
 +
 +      /*
 +       * Set up signal handlers.  We operate on databases much like a regular
 +       * backend, so we use the same signal handling.  See equivalent code in
 +       * tcop/postgres.c.
 +       */
 +      pqsignal(SIGHUP, cm_sighup_handler);
 +      pqsignal(SIGINT, StatementCancelHandler);
 +      pqsignal(SIGTERM, cm_sigterm_handler);
 +
 +      pqsignal(SIGQUIT, quickdie);
 +      InitializeTimeouts();           /* establishes SIGALRM handler */
 +
 +      pqsignal(SIGPIPE, SIG_IGN);
 +      pqsignal(SIGUSR1, procsignal_sigusr1_handler);
 +      pqsignal(SIGFPE, FloatExceptionHandler);
 +      pqsignal(SIGCHLD, SIG_DFL);
 +
 +      /*
 +       * Create a memory context that we will do all our work in.  We do this so
 +       * that we can reset the context during error recovery and thereby avoid
 +       * possible memory leaks.
 +       */
 +      ClusterMonitorMemCxt = AllocSetContextCreate(TopMemoryContext,
 +                                                                                "Cluster Monitor",
 +                                                                                ALLOCSET_DEFAULT_MINSIZE,
 +                                                                                ALLOCSET_DEFAULT_INITSIZE,
 +                                                                                ALLOCSET_DEFAULT_MAXSIZE);
 +      MemoryContextSwitchTo(ClusterMonitorMemCxt);
 +
 +    SetProcessingMode(NormalProcessing);
 +
 +      if (RegisterGTM(nodetype) < 0)
 +      {
 +              UnregisterGTM(nodetype);
 +              if (RegisterGTM(nodetype) < 0)
 +              {
 +                      ereport(LOG,
 +                                      (errcode(ERRCODE_IO_ERROR),
 +                                       errmsg("Can not register node on GTM")));
 +              }
 +      }
 +
 +      /*
 +       * If an exception is encountered, processing resumes here.
 +       *
 +       * This code is a stripped down version of PostgresMain error recovery.
 +       */
 +      if (sigsetjmp(local_sigjmp_buf, 1) != 0)
 +      {
 +              /* since not using PG_TRY, must reset error stack by hand */
 +              error_context_stack = NULL;
 +
 +              /* Prevents interrupts while cleaning up */
 +              HOLD_INTERRUPTS();
 +
 +              /* Forget any pending QueryCancel or timeout request */
 +              disable_all_timeouts(false);
 +              QueryCancelPending = false;             /* second to avoid race condition */
 +
 +              /* Report the error to the server log */
 +              EmitErrorReport();
 +
 +              /*
 +               * Now return to normal top-level context and clear ErrorContext for
 +               * next time.
 +               */
 +              MemoryContextSwitchTo(ClusterMonitorMemCxt);
 +              FlushErrorState();
 +
 +              /* Flush any leaked data in the top-level context */
 +              MemoryContextResetAndDeleteChildren(ClusterMonitorMemCxt);
 +
 +              /* Now we can allow interrupts again */
 +              RESUME_INTERRUPTS();
 +
 +              /* if in shutdown mode, no need for anything further; just go away */
 +              if (got_SIGTERM)
 +                      goto shutdown;
 +
 +              /*
 +               * Sleep at least 1 second after any error.  We don't want to be
 +               * filling the error logs as fast as we can.
 +               */
 +              pg_usleep(1000000L);
 +      }
 +
 +      /* We can now handle ereport(ERROR) */
 +      PG_exception_stack = &local_sigjmp_buf;
 +
 +      /* must unblock signals before calling rebuild_database_list */
 +      PG_SETMASK(&UnBlockSig);
 +
 +      /*
 +       * Force statement_timeout and lock_timeout to zero to avoid letting these
 +       * settings prevent regular maintenance from being executed.
 +       */
 +      SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
 +      SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
 +
 +      /* loop until shutdown request */
 +      while (!got_SIGTERM)
 +      {
 +              struct timeval nap;
 +              int                     rc;
 +
 +              /*
 +               * Repeat at CLUSTER_MONITOR_NAPTIME seconds interval
 +               */
 +              nap.tv_sec = CLUSTER_MONITOR_NAPTIME;
 +              nap.tv_usec = 0;
 +
 +              /*
 +               * Wait until naptime expires or we get some type of signal (all the
 +               * signal handlers will wake us by calling SetLatch).
 +               */
 +              rc = WaitLatch(MyLatch,
 +                                         WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
-               oldestXmin = GetOldestXminInternal(NULL, false, true, lastGlobalXmin);
++                                         (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
++                                         WAIT_EVENT_CLUSTER_MONITOR_MAIN);
 +
 +              ResetLatch(MyLatch);
 +
 +              /* Process sinval catchup interrupts that happened while sleeping */
 +              ProcessCatchupInterrupt();
 +
 +              /*
 +               * Emergency bailout if postmaster has died.  This is to avoid the
 +               * necessity for manual cleanup of all postmaster children.
 +               */
 +              if (rc & WL_POSTMASTER_DEATH)
 +                      proc_exit(1);
 +
 +              /* the normal shutdown case */
 +              if (got_SIGTERM)
 +                      break;
 +
 +              if (got_SIGHUP)
 +              {
 +                      got_SIGHUP = false;
 +                      ProcessConfigFile(PGC_SIGHUP);
 +              }
 +
 +              /*
 +               * Compute RecentGlobalXmin, report it to the GTM and sleep for the set
 +               * interval. Keep doing this forever
 +               */
 +              lastGlobalXmin = ClusterMonitorGetGlobalXmin();
 +              LWLockAcquire(ClusterMonitorLock, LW_EXCLUSIVE);
++              oldestXmin = GetOldestXminInternal(NULL, 0, true, lastGlobalXmin);
 +              ClusterMonitorSetReportingGlobalXmin(oldestXmin);
 +              LWLockRelease(ClusterMonitorLock);
 +
 +              if ((status = ReportGlobalXmin(oldestXmin, &newOldestXmin,
 +                                              &latestCompletedXid)))
 +              {
 +                      elog(DEBUG1, "Failed (status %d) to report RecentGlobalXmin "
 +                                      "- reported RecentGlobalXmin %d, received "
 +                                      "RecentGlobalXmin %d, " "received latestCompletedXid %d",
 +                                      status, oldestXmin, newOldestXmin,
 +                                      latestCompletedXid);
 +                      if (status == GTM_ERRCODE_TOO_OLD_XMIN ||
 +                              status == GTM_ERRCODE_NODE_EXCLUDED)
 +                      {
 +                              /*
 +                               * If we haven't seen a new transaction for a very long time or
 +                               * were disconnected for a while or excluded from the xmin
 +                               * computation for any reason, our xmin calculation could be
 +                               * well in the past, especially because it's capped by the
 +                               * latestCompletedXid which may not advance on an idle server.
 +                               * In such cases, use the value of latestCompletedXid as
 +                               * returned by GTM and then recompute local xmin.
 +                               *
 +                               * If the GTM's global xmin advances even further while we are
 +                               * ready with a new xmin, just repeat the entire exercise as
 +                               * long as GTM keeps returning us a more current value of
 +                               * latestCompletedXid and thus pushing forward our local xmin
 +                               * calculation
 +                               */
 +                              if (GlobalTransactionIdIsValid(latestCompletedXid) &&
 +                                              TransactionIdPrecedes(oldestXmin, latestCompletedXid))
 +                              {
 +                                      SetLatestCompletedXid(latestCompletedXid);
 +                                      continue;
 +                              }
 +                      }
 +              }
 +              else
 +              {
 +                      elog(DEBUG1, "Successfully reported xmin to GTM - reported_xmin %d,"
 +                                      "received RecentGlobalXmin %d, "
 +                                      "received latestCompletedXid %d", oldestXmin,
 +                                      newOldestXmin, latestCompletedXid);
 +
 +                      SetLatestCompletedXid(latestCompletedXid);
 +                      ClusterMonitorSetReportedGlobalXmin(oldestXmin);
 +                      if (GlobalTransactionIdIsValid(newOldestXmin))
 +                              ClusterMonitorSetGlobalXmin(newOldestXmin);
 +              }
 +
 +              ClusterMonitorSetReportingGlobalXmin(InvalidGlobalTransactionId);
 +
 +      }
 +
 +      /* Normal exit from the cluster monitor is here */
 +shutdown:
 +      UnregisterGTM(nodetype);
 +      ereport(LOG,
 +                      (errmsg("cluster monitor shutting down")));
 +
 +      proc_exit(0);                           /* done */
 +}
 +
 +/* SIGHUP: set flag to re-read config file at next convenient time */
 +static void
 +cm_sighup_handler(SIGNAL_ARGS)
 +{
 +      int                     save_errno = errno;
 +
 +      got_SIGHUP = true;
 +      SetLatch(MyLatch);
 +
 +      errno = save_errno;
 +}
 +
 +/* SIGTERM: time to die */
 +static void
 +cm_sigterm_handler(SIGNAL_ARGS)
 +{
 +      int                     save_errno = errno;
 +
 +      got_SIGTERM = true;
 +      SetLatch(MyLatch);
 +
 +      errno = save_errno;
 +}
 +
 +
 +/*
 + * IsClusterMonitorProcess
 + *            Return whether the current process is the cluster
 + *            monitor process.
 + */
 +bool
 +IsClusterMonitorProcess(void)
 +{
 +      return am_clustermon;
 +}
 +
 +/* Report shared-memory space needed by ClusterMonitor */
 +Size
 +ClusterMonitorShmemSize(void)
 +{
 +      return sizeof (ClusterMonitorCtlData);
 +}
 +
 +void
 +ClusterMonitorShmemInit(void)
 +{
 +      bool            found;
 +
 +      ClusterMonitorCtl = (ClusterMonitorCtlData *)
 +              ShmemInitStruct("Cluster Monitor Ctl", ClusterMonitorShmemSize(), &found);
 +
 +      if (!found)
 +      {
 +              /* First time through, so initialize */
 +              MemSet(ClusterMonitorCtl, 0, ClusterMonitorShmemSize());
 +              SpinLockInit(&ClusterMonitorCtl->mutex);
 +      }
 +}
 +
 +GlobalTransactionId
 +ClusterMonitorGetGlobalXmin(void)
 +{
 +      GlobalTransactionId xmin;
 +
 +      SpinLockAcquire(&ClusterMonitorCtl->mutex);
 +      xmin = ClusterMonitorCtl->gtm_recent_global_xmin;
 +      SpinLockRelease(&ClusterMonitorCtl->mutex);
 +
 +      return xmin;
 +}
 +
 +void
 +ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin)
 +{
 +      /*
 +       * First extend the commit logs. Even though we may not have actually
 +       * started any transactions in the new range, we must still extend the logs
 +       * so that later operations which rely on the RecentGlobalXmin to truncate
 +       * the logs work correctly.
 +       */
 +      ExtendLogs(xmin);
 +
 +      LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
 +
 +      /*
 +       * Do a consistency check to ensure that we NEVER have running transactions
 +       * with xmin less than what the GTM has already computed. While during
 +       * normal execution, this should never happen, if we have ever been excluded
 +       * from the xmin calculation by the GTM while we are still running old
 +       * transactions, PANIC is our best bet to avoid corruption
 +       */ 
 +      ProcArrayCheckXminConsistency(xmin);
 +
 +      SpinLockAcquire(&ClusterMonitorCtl->mutex);
 +      ClusterMonitorCtl->gtm_recent_global_xmin = xmin;
 +      SpinLockRelease(&ClusterMonitorCtl->mutex);
 +
 +      LWLockRelease(ProcArrayLock);
 +}
 +
 +static void
 +ClusterMonitorSetReportedGlobalXmin(GlobalTransactionId xmin)
 +{
 +      elog(DEBUG2, "ClusterMonitorSetReportedGlobalXmin - old %d, new %d",
 +                      ClusterMonitorCtl->reported_recent_global_xmin,
 +                      xmin);
 +      SpinLockAcquire(&ClusterMonitorCtl->mutex);
 +      ClusterMonitorCtl->reported_recent_global_xmin = xmin;
 +      SpinLockRelease(&ClusterMonitorCtl->mutex);
 +}
 +
 +static void
 +ClusterMonitorSetReportingGlobalXmin(GlobalTransactionId xmin)
 +{
 +      elog(DEBUG2, "ClusterMonitorSetReportingGlobalXmin - old %d, new %d",
 +                      ClusterMonitorCtl->reporting_recent_global_xmin,
 +                      xmin);
 +      SpinLockAcquire(&ClusterMonitorCtl->mutex);
 +      ClusterMonitorCtl->reporting_recent_global_xmin = xmin;
 +      SpinLockRelease(&ClusterMonitorCtl->mutex);
 +}
 +
 +GlobalTransactionId
 +ClusterMonitorGetReportingGlobalXmin(void)
 +{
 +      GlobalTransactionId reporting_xmin;
 +
 +      SpinLockAcquire(&ClusterMonitorCtl->mutex);
 +      reporting_xmin = ClusterMonitorCtl->reporting_recent_global_xmin;
 +      SpinLockRelease(&ClusterMonitorCtl->mutex);
 +
 +      return reporting_xmin;
 +}
index 181f14ee7444d2d153b595163faf8c99f5403bdb,f453dade6c63c77ff7a5f69709a440971fad169b..008502e48c4eef352fc022940e16682e68d238c9
@@@ -11,8 -11,7 +11,8 @@@
   *                    - Add a pgstat config column to pg_database, so this
   *                      entire thing can be enabled/disabled on a per db basis.
   *
-  *    Copyright (c) 2001-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  *    Copyright (c) 2001-2017, PostgreSQL Global Development Group
   *
   *    src/backend/postmaster/pgstat.c
   * ----------
@@@ -3262,6 -3453,421 +3514,424 @@@ pgstat_get_wait_event(uint32 wait_event
        return event_name;
  }
  
+ /* ----------
+  * pgstat_get_wait_activity() -
+  *
+  * Convert WaitEventActivity to string.
+  * ----------
+  */
+ static const char *
+ pgstat_get_wait_activity(WaitEventActivity w)
+ {
+       const char *event_name = "unknown wait event";
+       switch (w)
+       {
+               case WAIT_EVENT_ARCHIVER_MAIN:
+                       event_name = "ArchiverMain";
+                       break;
+               case WAIT_EVENT_AUTOVACUUM_MAIN:
+                       event_name = "AutoVacuumMain";
+                       break;
+               case WAIT_EVENT_BGWRITER_HIBERNATE:
+                       event_name = "BgWriterHibernate";
+                       break;
+               case WAIT_EVENT_BGWRITER_MAIN:
+                       event_name = "BgWriterMain";
+                       break;
+               case WAIT_EVENT_CHECKPOINTER_MAIN:
+                       event_name = "CheckpointerMain";
+                       break;
+               case WAIT_EVENT_PGSTAT_MAIN:
+                       event_name = "PgStatMain";
+                       break;
+               case WAIT_EVENT_RECOVERY_WAL_ALL:
+                       event_name = "RecoveryWalAll";
+                       break;
+               case WAIT_EVENT_RECOVERY_WAL_STREAM:
+                       event_name = "RecoveryWalStream";
+                       break;
+               case WAIT_EVENT_SYSLOGGER_MAIN:
+                       event_name = "SysLoggerMain";
+                       break;
+               case WAIT_EVENT_WAL_RECEIVER_MAIN:
+                       event_name = "WalReceiverMain";
+                       break;
+               case WAIT_EVENT_WAL_SENDER_MAIN:
+                       event_name = "WalSenderMain";
+                       break;
+               case WAIT_EVENT_WAL_WRITER_MAIN:
+                       event_name = "WalWriterMain";
+                       break;
+               case WAIT_EVENT_LOGICAL_LAUNCHER_MAIN:
+                       event_name = "LogicalLauncherMain";
+                       break;
+               case WAIT_EVENT_LOGICAL_APPLY_MAIN:
+                       event_name = "LogicalApplyMain";
+                       break;
++              case WAIT_EVENT_CLUSTER_MONITOR_MAIN:
++                      event_name = "ClusterMonitorMain";
++                      break;
+                       /* no default case, so that compiler will warn */
+       }
+       return event_name;
+ }
+ /* ----------
+  * pgstat_get_wait_client() -
+  *
+  * Convert WaitEventClient to string.
+  * ----------
+  */
+ static const char *
+ pgstat_get_wait_client(WaitEventClient w)
+ {
+       const char *event_name = "unknown wait event";
+       switch (w)
+       {
+               case WAIT_EVENT_CLIENT_READ:
+                       event_name = "ClientRead";
+                       break;
+               case WAIT_EVENT_CLIENT_WRITE:
+                       event_name = "ClientWrite";
+                       break;
+               case WAIT_EVENT_SSL_OPEN_SERVER:
+                       event_name = "SSLOpenServer";
+                       break;
+               case WAIT_EVENT_WAL_RECEIVER_WAIT_START:
+                       event_name = "WalReceiverWaitStart";
+                       break;
+               case WAIT_EVENT_LIBPQWALRECEIVER:
+                       event_name = "LibPQWalReceiver";
+                       break;
+               case WAIT_EVENT_WAL_SENDER_WAIT_WAL:
+                       event_name = "WalSenderWaitForWAL";
+                       break;
+               case WAIT_EVENT_WAL_SENDER_WRITE_DATA:
+                       event_name = "WalSenderWriteData";
+                       break;
+                       /* no default case, so that compiler will warn */
+       }
+       return event_name;
+ }
+ /* ----------
+  * pgstat_get_wait_ipc() -
+  *
+  * Convert WaitEventIPC to string.
+  * ----------
+  */
+ static const char *
+ pgstat_get_wait_ipc(WaitEventIPC w)
+ {
+       const char *event_name = "unknown wait event";
+       switch (w)
+       {
+               case WAIT_EVENT_BGWORKER_SHUTDOWN:
+                       event_name = "BgWorkerShutdown";
+                       break;
+               case WAIT_EVENT_BGWORKER_STARTUP:
+                       event_name = "BgWorkerStartup";
+                       break;
+               case WAIT_EVENT_BTREE_PAGE:
+                       event_name = "BtreePage";
+                       break;
+               case WAIT_EVENT_EXECUTE_GATHER:
+                       event_name = "ExecuteGather";
+                       break;
+               case WAIT_EVENT_MQ_INTERNAL:
+                       event_name = "MessageQueueInternal";
+                       break;
+               case WAIT_EVENT_MQ_PUT_MESSAGE:
+                       event_name = "MessageQueuePutMessage";
+                       break;
+               case WAIT_EVENT_MQ_RECEIVE:
+                       event_name = "MessageQueueReceive";
+                       break;
+               case WAIT_EVENT_MQ_SEND:
+                       event_name = "MessageQueueSend";
+                       break;
+               case WAIT_EVENT_PARALLEL_FINISH:
+                       event_name = "ParallelFinish";
+                       break;
+               case WAIT_EVENT_PARALLEL_BITMAP_SCAN:
+                       event_name = "ParallelBitmapScan";
+                       break;
+               case WAIT_EVENT_PROCARRAY_GROUP_UPDATE:
+                       event_name = "ProcArrayGroupUpdate";
+                       break;
+               case WAIT_EVENT_SAFE_SNAPSHOT:
+                       event_name = "SafeSnapshot";
+                       break;
+               case WAIT_EVENT_SYNC_REP:
+                       event_name = "SyncRep";
+                       break;
+               case WAIT_EVENT_LOGICAL_SYNC_DATA:
+                       event_name = "LogicalSyncData";
+                       break;
+               case WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE:
+                       event_name = "LogicalSyncStateChange";
+                       break;
+                       /* no default case, so that compiler will warn */
+       }
+       return event_name;
+ }
+ /* ----------
+  * pgstat_get_wait_timeout() -
+  *
+  * Convert WaitEventTimeout to string.
+  * ----------
+  */
+ static const char *
+ pgstat_get_wait_timeout(WaitEventTimeout w)
+ {
+       const char *event_name = "unknown wait event";
+       switch (w)
+       {
+               case WAIT_EVENT_BASE_BACKUP_THROTTLE:
+                       event_name = "BaseBackupThrottle";
+                       break;
+               case WAIT_EVENT_PG_SLEEP:
+                       event_name = "PgSleep";
+                       break;
+               case WAIT_EVENT_RECOVERY_APPLY_DELAY:
+                       event_name = "RecoveryApplyDelay";
+                       break;
+                       /* no default case, so that compiler will warn */
+       }
+       return event_name;
+ }
+ /* ----------
+  * pgstat_get_wait_io() -
+  *
+  * Convert WaitEventIO to string.
+  * ----------
+  */
+ static const char *
+ pgstat_get_wait_io(WaitEventIO w)
+ {
+       const char *event_name = "unknown wait event";
+       switch (w)
+       {
+               case WAIT_EVENT_BUFFILE_READ:
+                       event_name = "BufFileRead";
+                       break;
+               case WAIT_EVENT_BUFFILE_WRITE:
+                       event_name = "BufFileWrite";
+                       break;
+               case WAIT_EVENT_CONTROL_FILE_READ:
+                       event_name = "ControlFileRead";
+                       break;
+               case WAIT_EVENT_CONTROL_FILE_SYNC:
+                       event_name = "ControlFileSync";
+                       break;
+               case WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE:
+                       event_name = "ControlFileSyncUpdate";
+                       break;
+               case WAIT_EVENT_CONTROL_FILE_WRITE:
+                       event_name = "ControlFileWrite";
+                       break;
+               case WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE:
+                       event_name = "ControlFileWriteUpdate";
+                       break;
+               case WAIT_EVENT_COPY_FILE_READ:
+                       event_name = "CopyFileRead";
+                       break;
+               case WAIT_EVENT_COPY_FILE_WRITE:
+                       event_name = "CopyFileWrite";
+                       break;
+               case WAIT_EVENT_DATA_FILE_EXTEND:
+                       event_name = "DataFileExtend";
+                       break;
+               case WAIT_EVENT_DATA_FILE_FLUSH:
+                       event_name = "DataFileFlush";
+                       break;
+               case WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC:
+                       event_name = "DataFileImmediateSync";
+                       break;
+               case WAIT_EVENT_DATA_FILE_PREFETCH:
+                       event_name = "DataFilePrefetch";
+                       break;
+               case WAIT_EVENT_DATA_FILE_READ:
+                       event_name = "DataFileRead";
+                       break;
+               case WAIT_EVENT_DATA_FILE_SYNC:
+                       event_name = "DataFileSync";
+                       break;
+               case WAIT_EVENT_DATA_FILE_TRUNCATE:
+                       event_name = "DataFileTruncate";
+                       break;
+               case WAIT_EVENT_DATA_FILE_WRITE:
+                       event_name = "DataFileWrite";
+                       break;
+               case WAIT_EVENT_DSM_FILL_ZERO_WRITE:
+                       event_name = "DSMFillZeroWrite";
+                       break;
+               case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ:
+                       event_name = "LockFileAddToDataDirRead";
+                       break;
+               case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_SYNC:
+                       event_name = "LockFileAddToDataDirSync";
+                       break;
+               case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE:
+                       event_name = "LockFileAddToDataDirWrite";
+                       break;
+               case WAIT_EVENT_LOCK_FILE_CREATE_READ:
+                       event_name = "LockFileCreateRead";
+                       break;
+               case WAIT_EVENT_LOCK_FILE_CREATE_SYNC:
+                       event_name = "LockFileCreateSync";
+                       break;
+               case WAIT_EVENT_LOCK_FILE_CREATE_WRITE:
+                       event_name = "LockFileCreateWRITE";
+                       break;
+               case WAIT_EVENT_LOCK_FILE_RECHECKDATADIR_READ:
+                       event_name = "LockFileReCheckDataDirRead";
+                       break;
+               case WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC:
+                       event_name = "LogicalRewriteCheckpointSync";
+                       break;
+               case WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC:
+                       event_name = "LogicalRewriteMappingSync";
+                       break;
+               case WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE:
+                       event_name = "LogicalRewriteMappingWrite";
+                       break;
+               case WAIT_EVENT_LOGICAL_REWRITE_SYNC:
+                       event_name = "LogicalRewriteSync";
+                       break;
+               case WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE:
+                       event_name = "LogicalRewriteTruncate";
+                       break;
+               case WAIT_EVENT_LOGICAL_REWRITE_WRITE:
+                       event_name = "LogicalRewriteWrite";
+                       break;
+               case WAIT_EVENT_RELATION_MAP_READ:
+                       event_name = "RelationMapRead";
+                       break;
+               case WAIT_EVENT_RELATION_MAP_SYNC:
+                       event_name = "RelationMapSync";
+                       break;
+               case WAIT_EVENT_RELATION_MAP_WRITE:
+                       event_name = "RelationMapWrite";
+                       break;
+               case WAIT_EVENT_REORDER_BUFFER_READ:
+                       event_name = "ReorderBufferRead";
+                       break;
+               case WAIT_EVENT_REORDER_BUFFER_WRITE:
+                       event_name = "ReorderBufferWrite";
+                       break;
+               case WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ:
+                       event_name = "ReorderLogicalMappingRead";
+                       break;
+               case WAIT_EVENT_REPLICATION_SLOT_READ:
+                       event_name = "ReplicationSlotRead";
+                       break;
+               case WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC:
+                       event_name = "ReplicationSlotRestoreSync";
+                       break;
+               case WAIT_EVENT_REPLICATION_SLOT_SYNC:
+                       event_name = "ReplicationSlotSync";
+                       break;
+               case WAIT_EVENT_REPLICATION_SLOT_WRITE:
+                       event_name = "ReplicationSlotWrite";
+                       break;
+               case WAIT_EVENT_SLRU_FLUSH_SYNC:
+                       event_name = "SLRUFlushSync";
+                       break;
+               case WAIT_EVENT_SLRU_READ:
+                       event_name = "SLRURead";
+                       break;
+               case WAIT_EVENT_SLRU_SYNC:
+                       event_name = "SLRUSync";
+                       break;
+               case WAIT_EVENT_SLRU_WRITE:
+                       event_name = "SLRUWrite";
+                       break;
+               case WAIT_EVENT_SNAPBUILD_READ:
+                       event_name = "SnapbuildRead";
+                       break;
+               case WAIT_EVENT_SNAPBUILD_SYNC:
+                       event_name = "SnapbuildSync";
+                       break;
+               case WAIT_EVENT_SNAPBUILD_WRITE:
+                       event_name = "SnapbuildWrite";
+                       break;
+               case WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC:
+                       event_name = "TimelineHistoryFileSync";
+                       break;
+               case WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE:
+                       event_name = "TimelineHistoryFileWrite";
+                       break;
+               case WAIT_EVENT_TIMELINE_HISTORY_READ:
+                       event_name = "TimelineHistoryRead";
+                       break;
+               case WAIT_EVENT_TIMELINE_HISTORY_SYNC:
+                       event_name = "TimelineHistorySync";
+                       break;
+               case WAIT_EVENT_TIMELINE_HISTORY_WRITE:
+                       event_name = "TimelineHistoryWrite";
+                       break;
+               case WAIT_EVENT_TWOPHASE_FILE_READ:
+                       event_name = "TwophaseFileRead";
+                       break;
+               case WAIT_EVENT_TWOPHASE_FILE_SYNC:
+                       event_name = "TwophaseFileSync";
+                       break;
+               case WAIT_EVENT_TWOPHASE_FILE_WRITE:
+                       event_name = "TwophaseFileWrite";
+                       break;
+               case WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ:
+                       event_name = "WALSenderTimelineHistoryRead";
+                       break;
+               case WAIT_EVENT_WAL_BOOTSTRAP_SYNC:
+                       event_name = "WALBootstrapSync";
+                       break;
+               case WAIT_EVENT_WAL_BOOTSTRAP_WRITE:
+                       event_name = "WALBootstrapWrite";
+                       break;
+               case WAIT_EVENT_WAL_COPY_READ:
+                       event_name = "WALCopyRead";
+                       break;
+               case WAIT_EVENT_WAL_COPY_SYNC:
+                       event_name = "WALCopySync";
+                       break;
+               case WAIT_EVENT_WAL_COPY_WRITE:
+                       event_name = "WALCopyWrite";
+                       break;
+               case WAIT_EVENT_WAL_INIT_SYNC:
+                       event_name = "WALInitSync";
+                       break;
+               case WAIT_EVENT_WAL_INIT_WRITE:
+                       event_name = "WALInitWrite";
+                       break;
+               case WAIT_EVENT_WAL_READ:
+                       event_name = "WALRead";
+                       break;
+               case WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN:
+                       event_name = "WALSyncMethodAssign";
+                       break;
+               case WAIT_EVENT_WAL_WRITE:
+                       event_name = "WALWrite";
+                       break;
+                       /* no default case, so that compiler will warn */
+       }
+       return event_name;
+ }
  /* ----------
   * pgstat_get_backend_current_activity() -
   *
index 520616e4496e129033bf260975dba3c518417a97,35b4ec88d35786508781a62d9c06b1f2be712ba7..f6f920e49343a06698fadae83dcaa5e2c22fca00
   *      clients.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *
   * IDENTIFICATION
  #include "utils/dynamic_loader.h"
  #include "utils/memutils.h"
  #include "utils/ps_status.h"
 +#ifdef PGXC
 +#include "utils/resowner.h"
 +#endif
  #include "utils/timeout.h"
+ #include "utils/varlena.h"
  
  #ifdef EXEC_BACKEND
  #include "storage/spin.h"
@@@ -1397,19 -1345,8 +1447,19 @@@ PostmasterMain(int argc, char *argv[]
        StartupStatus = STARTUP_RUNNING;
        pmState = PM_STARTUP;
  
 +#ifdef PGXC /* PGXC_COORD */
 +      oldcontext = MemoryContextSwitchTo(TopMemoryContext);
 +
 +      /*
 +       * Initialize the Data Node connection pool
 +       */
 +      PgPoolerPID = StartPoolManager();
 +
 +      MemoryContextSwitchTo(oldcontext);
 +#endif /* PGXC */
 +
        /* Some workers may be scheduled to start now */
-       maybe_start_bgworker();
+       maybe_start_bgworkers();
  
        status = ServerLoop();
  
@@@ -1858,21 -1795,10 +1908,22 @@@ ServerLoop(void
                }
  
                /* If we have lost the stats collector, try to start a new one */
-               if (PgStatPID == 0 && pmState == PM_RUN)
+               if (PgStatPID == 0 &&
+                       (pmState == PM_RUN || pmState == PM_HOT_STANDBY))
                        PgStatPID = pgstat_start();
  
 +#ifdef PGXC
 +              /* If we have lost the pooler, try to start a new one */
 +              if (PgPoolerPID == 0 && pmState == PM_RUN)
 +                      PgPoolerPID = StartPoolManager();
 +#endif /* PGXC */
 +
 +#ifdef XCP
 +              /* If we have lost the cluster monitor, try to start a new one */
 +              if (ClusterMonPID == 0 && pmState == PM_RUN)
 +                      ClusterMonPID = StartClusterMonitor();
 +#endif
 +
                /* If we have lost the archiver, try to start a new one. */
                if (PgArchPID == 0 && PgArchStartupAllowed())
                        PgArchPID = pgarch_start();
@@@ -2951,18 -2857,9 +3010,18 @@@ reaper(SIGNAL_ARGS
                                PgArchPID = pgarch_start();
                        if (PgStatPID == 0)
                                PgStatPID = pgstat_start();
 +#ifdef PGXC
 +                      if (PgPoolerPID == 0)
 +                              PgPoolerPID = StartPoolManager();
 +#endif /* PGXC */
 +
 +#ifdef XCP
 +                      if (ClusterMonPID == 0)
 +                              ClusterMonPID = StartClusterMonitor();
 +#endif
  
                        /* workers may be scheduled to start now */
-                       maybe_start_bgworker();
+                       maybe_start_bgworkers();
  
                        /* at this point we are really open for business */
                        ereport(LOG,
Simple merge
index 4fd96d6a8c265ca0a3ffdb9a3c1ea592089b5c0f,35ff8bb3b7cb4f7c500acefd2ae05d99a401253b..510f49fcc0654194fff3de460cf85f06d8e5cc52
@@@ -21,7 -21,7 +21,8 @@@
  #include "postgres.h"
  
  #include "access/sysattr.h"
+ #include "catalog/dependency.h"
 +#include "catalog/namespace.h"
  #include "catalog/pg_type.h"
  #include "commands/trigger.h"
  #include "foreign/fdwapi.h"
@@@ -1305,76 -1277,9 +1361,77 @@@ rewriteTargetListUD(Query *parsetree, R
        const char *attrname;
        TargetEntry *tle;
  
 +#ifdef PGXC
 +      List *var_list = NIL;
 +      ListCell *elt;
 +
 +      /*
 +       * In Postgres-XC, we need to evaluate quals of the parse tree and determine
 +       * if they are Coordinator quals. If they are, their attribute need to be
 +       * added to target list for evaluation. In case some are found, add them as
 +       * junks in the target list. The junk status will be used by remote UPDATE
 +       * planning to associate correct element to a clause.
 +       * For DELETE, having such columns in target list helps to evaluate Quals
 +       * correctly on Coordinator.
 +       * PGXCTODO: This list could be reduced to keep only in target list the
 +       * vars using Coordinator Quals.
 +       */
 +      if (IS_PGXC_COORDINATOR && parsetree->jointree)
 +              var_list = pull_qual_vars((Node *) parsetree->jointree, parsetree->resultRelation);
 +
 +      foreach(elt, var_list)
 +      {
 +              Form_pg_attribute att_tup;
 +              int numattrs = RelationGetNumberOfAttributes(target_relation);
 +
 +              var = (Var *) lfirst(elt);
 +              /* Bypass in case of extra target items like ctid */
 +              if (var->varattno < 1 || var->varattno > numattrs)
 +                      continue;
 +
 +
 +              att_tup = target_relation->rd_att->attrs[var->varattno - 1];
 +              tle = makeTargetEntry((Expr *) var,
 +                                                        list_length(parsetree->targetList) + 1,
 +                                                        pstrdup(NameStr(att_tup->attname)),
 +                                                        true);
 +
 +              parsetree->targetList = lappend(parsetree->targetList, tle);
 +      }
 +#endif
 +
 +#ifdef PGXC
 +      /*
 +       * If relation is non-replicated, we need also to identify the Datanode
 +       * from where tuple is fetched.
 +       */
 +      if (IS_PGXC_COORDINATOR &&
 +              !IsConnFromCoord() &&
 +              !IsLocatorReplicated(GetRelationLocType(RelationGetRelid(target_relation))) &&
 +              (target_relation->rd_rel->relkind == RELKIND_RELATION ||
 +               target_relation->rd_rel->relkind == RELKIND_MATVIEW))
 +      {
 +              var = makeVar(parsetree->resultRelation,
 +                                        XC_NodeIdAttributeNumber,
 +                                        INT4OID,
 +                                        -1,
 +                                        InvalidOid,
 +                                        0);
 +
 +              attrname = "xc_node_id";
 +
 +              tle = makeTargetEntry((Expr *) var,
 +                                                        list_length(parsetree->targetList) + 1,
 +                                                        pstrdup(attrname),
 +                                                        true);
 +
 +              parsetree->targetList = lappend(parsetree->targetList, tle);
 +      }
 +#endif
 +
        if (target_relation->rd_rel->relkind == RELKIND_RELATION ||
-               target_relation->rd_rel->relkind == RELKIND_MATVIEW)
+               target_relation->rd_rel->relkind == RELKIND_MATVIEW ||
+               target_relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
        {
                /*
                 * Emit CTID so that executor can find the row to update or delete.
@@@ -3886,184 -3629,3 +3952,194 @@@ QueryRewrite(Query *parsetree
  
        return results;
  }
-       ProcessUtility(cparsetree->utilityStmt, cquery.data, PROCESS_UTILITY_QUERY,
-                       NULL, NULL, false, NULL);
 +
 +#ifdef PGXC
 +/*
 + * Rewrite the CREATE TABLE AS and SELECT INTO queries as a
 + * INSERT INTO .. SELECT query. The target table must be created first using
 + * utility command processing. This takes care of creating the target table on
 + * all the Coordinators and the Datanodes.
 + */
 +List *
 +QueryRewriteCTAS(Query *parsetree)
 +{
 +      RangeVar *relation;
 +      CreateStmt *create_stmt;
++      PlannedStmt *wrapper;
 +      List *tableElts = NIL;
 +      StringInfoData cquery;
 +      ListCell *col;
 +      Query *cparsetree;
 +      List *raw_parsetree_list, *tlist;
 +      char *selectstr;
 +      CreateTableAsStmt *stmt;
 +      IntoClause *into;
 +      ListCell *lc;
 +
 +      if (parsetree->commandType != CMD_UTILITY ||
 +              !IsA(parsetree->utilityStmt, CreateTableAsStmt))
 +              elog(ERROR, "Unexpected commandType or intoClause is not set properly");
 +
 +      /* Get the target table */
 +      stmt = (CreateTableAsStmt *) parsetree->utilityStmt;
 +
 +      if (stmt->relkind == OBJECT_MATVIEW)
 +              return list_make1(parsetree);
 +
 +      relation = stmt->into->rel;
 +
 +      if (stmt->if_not_exists)
 +      {
 +              Oid                     nspid;
 +
 +              nspid = RangeVarGetCreationNamespace(stmt->into->rel);
 +
 +              if (get_relname_relid(stmt->into->rel->relname, nspid))
 +              {
 +                      ereport(NOTICE,
 +                                      (errcode(ERRCODE_DUPLICATE_TABLE),
 +                                       errmsg("relation \"%s\" already exists, skipping",
 +                                                      stmt->into->rel->relname)));
 +                      return NIL;
 +              }
 +      }
 +
 +      /* Start building a CreateStmt for creating the target table */
 +      create_stmt = makeNode(CreateStmt);
 +      create_stmt->relation = relation;
 +      create_stmt->islocal = stmt->islocal;
 +      create_stmt->if_not_exists = stmt->if_not_exists;
 +      into = stmt->into;
 +
 +      /* Obtain the target list of new table */
 +      Assert(IsA(stmt->query, Query));
 +      cparsetree = (Query *) stmt->query;
 +      tlist = cparsetree->targetList;
 +
 +      /*
 +       * Based on the targetList, populate the column information for the target
 +       * table. If a column name list was specified in CREATE TABLE AS, override
 +       * the column names derived from the query. (Too few column names are OK, too
 +       * many are not.).
 +       */
 +      lc = list_head(into->colNames);
 +      foreach(col, tlist)
 +      {
 +              TargetEntry *tle = (TargetEntry *)lfirst(col);
 +              ColumnDef   *coldef;
 +              TypeName    *typename;
 +
 +              /* Ignore junk columns from the targetlist */
 +              if (tle->resjunk)
 +                      continue;
 +
 +              coldef = makeNode(ColumnDef);
 +              typename = makeNode(TypeName);
 +
 +              /* Take the column name specified if any */
 +              if (lc)
 +              {
 +                      coldef->colname = strVal(lfirst(lc));
 +                      lc = lnext(lc);
 +              }
 +              else
 +                      coldef->colname = pstrdup(tle->resname);
 +
 +              coldef->inhcount = 0;
 +              coldef->is_local = true;
 +              coldef->is_not_null = false;
 +              coldef->raw_default = NULL;
 +              coldef->cooked_default = NULL;
 +              coldef->constraints = NIL;
 +
 +              /*
 +               * Set typeOid and typemod. The name of the type is derived while
 +               * generating query
 +               */
 +              typename->typeOid = exprType((Node *)tle->expr);
 +              typename->typemod = exprTypmod((Node *)tle->expr);
 +
 +              coldef->typeName = typename;
 +
 +              tableElts = lappend(tableElts, coldef);
 +      }
 +
 +      if (lc != NULL)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_SYNTAX_ERROR),
 +                               errmsg("CREATE TABLE AS specifies too many column names")));
 +
 +      /*
 +       * Set column information and the distribution mechanism (which will be
 +       * NULL for SELECT INTO and the default mechanism will be picked)
 +       */
 +      create_stmt->tableElts = tableElts;
 +      create_stmt->distributeby = stmt->into->distributeby;
 +      create_stmt->subcluster = stmt->into->subcluster;
 +
 +      create_stmt->tablespacename = stmt->into->tableSpaceName;
 +      create_stmt->oncommit = stmt->into->onCommit;
 +      create_stmt->options = stmt->into->options;
 +
 +      /*
 +       * Check consistency of arguments
 +       */
 +      if (create_stmt->oncommit != ONCOMMIT_NOOP
 +                      && create_stmt->relation->relpersistence != RELPERSISTENCE_TEMP)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
 +                               errmsg("ON COMMIT can only be used on temporary tables")));
 +
 +      /* Get a copy of the parsetree which we can freely modify  */
 +      cparsetree = copyObject(parsetree);
 +
 +      /*
 +       * Now build a utility statement in order to run the CREATE TABLE DDL on
 +       * the local and remote nodes. We keep others fields as it is since they
 +       * are ignored anyways by deparse_query.
 +       */
 +      cparsetree->commandType = CMD_UTILITY;
 +      cparsetree->utilityStmt = (Node *) create_stmt;
 +
 +      initStringInfo(&cquery);
 +      deparse_query(cparsetree, &cquery, NIL, false, false);
 +
++
++      /* finally, wrap it in a dummy PlannedStmt */
++      wrapper = makeNode(PlannedStmt);
++      wrapper->commandType = CMD_UTILITY;
++      wrapper->canSetTag = false;
++      wrapper->utilityStmt = (Node *) create_stmt;
++      wrapper->stmt_location = -1;
++      wrapper->stmt_len = -1;
++
 +      /* Finally, fire off the query to run the DDL */
-                       NULL, 0);
++      ProcessUtility(wrapper, cquery.data, PROCESS_UTILITY_QUERY,
++                      NULL, NULL, NULL, false, NULL);
 +
 +      /*
 +       * Now fold the CTAS statement into an INSERT INTO statement. The
 +       * utility is no more required.
 +       */
 +      parsetree->utilityStmt = NULL;
 +
 +      /* Get the SELECT query string */
 +      initStringInfo(&cquery);
 +      deparse_query((Query *)stmt->query, &cquery, NIL, false, false);
 +      selectstr = pstrdup(cquery.data);
 +
 +      /* Now, finally build the INSERT INTO statement */
 +      initStringInfo(&cquery);
 +
 +      appendStringInfo(&cquery, "INSERT INTO %s.%s",
 +                              quote_identifier(get_namespace_name(RangeVarGetCreationNamespace(relation))),
 +                              quote_identifier(relation->relname));
 +
 +      appendStringInfo(&cquery, " %s %s", selectstr,
 +                      into->skipData ? "LIMIT 0" : "");
 +
 +      raw_parsetree_list = pg_parse_query(cquery.data);
 +      return pg_analyze_and_rewrite(linitial(raw_parsetree_list), cquery.data,
++                      NULL, 0, NULL);
 +}
 +#endif
Simple merge
index 90239e6abf75ebec458bb418af3c6d56a0020d44,2109cbf8587fe68256b7f76bd23a02b0a6e5280d..b22edf00ecd130104bb47be466726aacfd873b3c
@@@ -3,8 -3,7 +3,8 @@@
   * bufmgr.c
   *      buffer manager interface routines
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
Simple merge
Simple merge
index 7887d82a6e269f34338e34f48afb267f854af934,2d1ed143e0b67da2344013d48af235fccd898255..f4a192efd40ce757418ea9c8315a780f7384dfad
@@@ -3,8 -3,7 +3,8 @@@
   * ipci.c
   *      POSTGRES inter-process communication initialization code.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
  #include "storage/procsignal.h"
  #include "storage/sinvaladt.h"
  #include "storage/spin.h"
 +#ifdef XCP
 +#include "pgxc/pgxc.h"
 +#include "pgxc/squeue.h"
 +#include "pgxc/pause.h"
 +#endif
+ #include "utils/backend_random.h"
  #include "utils/snapmgr.h"
  
 -
  shmem_startup_hook_type shmem_startup_hook = NULL;
  
  static Size total_addin_request = 0;
@@@ -147,21 -144,12 +154,23 @@@ CreateSharedMemoryAndSemaphores(bool ma
                size = add_size(size, ReplicationOriginShmemSize());
                size = add_size(size, WalSndShmemSize());
                size = add_size(size, WalRcvShmemSize());
 +#ifdef XCP
 +              if (IS_PGXC_DATANODE)
 +                      size = add_size(size, SharedQueueShmemSize());
 +              if (IS_PGXC_COORDINATOR)
 +                      size = add_size(size, ClusterLockShmemSize());
 +              size = add_size(size, ClusterMonitorShmemSize());
 +#endif
+               size = add_size(size, ApplyLauncherShmemSize());
                size = add_size(size, SnapMgrShmemSize());
                size = add_size(size, BTreeShmemSize());
                size = add_size(size, SyncScanShmemSize());
                size = add_size(size, AsyncShmemSize());
 +#ifdef PGXC
 +              size = add_size(size, NodeTablesShmemSize());
 +#endif
 +
+               size = add_size(size, BackendRandomShmemSize());
  #ifdef EXEC_BACKEND
                size = add_size(size, ShmemBackendArraySize());
  #endif
        ReplicationOriginShmemInit();
        WalSndShmemInit();
        WalRcvShmemInit();
+       ApplyLauncherShmemInit();
  
 +#ifdef XCP
 +      /*
 +       * Set up distributed executor's shared queues
 +       */
 +      if (IS_PGXC_DATANODE)
 +              SharedQueuesInit();
 +      if (IS_PGXC_COORDINATOR)
 +              ClusterLockShmemInit();
 +      ClusterMonitorShmemInit();
 +#endif
 +
        /*
         * Set up other modules that need some shared memory space
         */
        BTreeShmemInit();
        SyncScanShmemInit();
        AsyncShmemInit();
+       BackendRandomShmemInit();
  
 +#ifdef PGXC
 +      NodeTablesShmemInit();
 +#endif
 +
 +
  #ifdef EXEC_BACKEND
  
        /*
index a66cb2468d4682c7c23e060804c4b2154aa6f4ea,8a715367918cfa5e9c89165ca8dfbc2a0ec29a31..1c01dd973f9b45bd2af900d279f1d08f9d9e3c2f
   * happen, it would tie up KnownAssignedXids indefinitely, so we protect
   * ourselves by pruning the array when a valid list of running XIDs arrives.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *
   * IDENTIFICATION
@@@ -64,7 -53,7 +64,8 @@@
  #include "access/xlog.h"
  #include "catalog/catalog.h"
  #include "miscadmin.h"
 +#include "postmaster/clustermon.h"
+ #include "pgstat.h"
  #include "storage/proc.h"
  #include "storage/procarray.h"
  #include "storage/spin.h"
@@@ -1377,34 -1311,7 +1380,34 @@@ TransactionIdIsActive(TransactionId xid
   * GetOldestXmin() move backwards, with no consequences for data integrity.
   */
  TransactionId
- GetOldestXmin(Relation rel, bool ignoreVacuum)
+ GetOldestXmin(Relation rel, int flags)
 +{
-       return GetOldestXminInternal(rel, ignoreVacuum, false,
++      return GetOldestXminInternal(rel, flags, false,
 +                      InvalidTransactionId);
 +}
 +
 +/*
 + * This implements most of the logic that GetOldestXmin needs. In XL, we don't
 + * actually compute OldestXmin unless specifically told to do by computeLocal
 + * argument set to true which GetOldestXmin never done. So we just return the
 + * value from the shared memory. The OldestXmin itself is always computed by
 + * the Cluster Monitor process by sending local state information to the GTM,
 + * which then aggregates information from all the nodes and gives out final
 + * OldestXmin or GlobalXmin which is consistent across the entire cluster.
 + *
 + * In addition, Cluster Monitor also passes the last reported xmin (or the one
 + * sent back by GTM in case we were idle) and the last received GlobalXmin. We
 + * must ensure that we don't see an XID or xmin which is beyond these horizons.
 + * Otherwise it signals problems with the GlobalXmin calculation. This can
 + * happen because of network disconnects or extreme load on the machine
 + * (unlikely). In any case, we must restart ourselves to avoid any data
 + * consistency problem. A more careful approach could involve killing only
 + * those backends which are running with old xid or xmin. We can consider
 + * implementing it that way in future
 + */
 +TransactionId
- GetOldestXminInternal(Relation rel, bool ignoreVacuum, bool computeLocal,
++GetOldestXminInternal(Relation rel, int flags, bool computeLocal,
 +              TransactionId lastGlobalXmin)
  {
        ProcArrayStruct *arrayP = procArray;
        TransactionId result;
index 0d2d1b08435ac73acc5c9e6a17ae1a532a478581,4a21d5512d2370ac3d879bbb8dd7e197aae6d6ee..f4d4f25e68ccd38ac8c960f1c023d48b04e6b4cf
@@@ -4,8 -4,7 +4,8 @@@
   *      Routines for interprocess signalling
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
Simple merge
index 37eec5b00a491afb65ea243854bb81d70ccdfc16,4315be4077359b8e584804525250fb816847cbc5..34a4e913d71220aaebee034fb71a6412d134521e
@@@ -3,8 -3,7 +3,8 @@@
   * lock.c
   *      POSTGRES primary lock mechanism
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
index 950ea746498bf29cf790b61ad8fc1c533d734a00,35536e47894bd7f00fb57c14392f581bf87e4f60..655c05c7a7661b630a12523656a4757da9680ec8
@@@ -20,8 -20,7 +20,8 @@@
   * appropriate value for a free lock.  The meaning of the variable is up to
   * the caller, the lightweight lock code just assigns and compares it.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
  #include "storage/ipc.h"
  #include "storage/predicate.h"
  #include "storage/proc.h"
+ #include "storage/proclist.h"
  #include "storage/spin.h"
 +#ifdef XCP
 +#include "pgxc/nodemgr.h"
 +#include "pgxc/squeue.h"
 +#endif
  #include "utils/memutils.h"
  
  #ifdef LWLOCK_STATS
@@@ -532,10 -494,10 +498,12 @@@ RegisterLWLockTranches(void
  
        if (LWLockTrancheArray == NULL)
        {
-               LWLockTranchesAllocated = 32;
-               LWLockTrancheArray = (LWLockTranche **)
 -              LWLockTranchesAllocated = 64;
++              LWLockTranchesAllocated = 128; /* XXX PG10MERGE: Not sure why 64 is
++                                                                                hardcoded in the PG10 branch. That
++                                                                                causes assertion failure */
+               LWLockTrancheArray = (char **)
                        MemoryContextAllocZero(TopMemoryContext,
-                                                 LWLockTranchesAllocated * sizeof(LWLockTranche *));
+                                                                  LWLockTranchesAllocated * sizeof(char *));
                Assert(LWLockTranchesAllocated >= LWTRANCHE_FIRST_USER_DEFINED);
        }
  
index b7c7c7d49c5133762dc4650263cb2cd424c845cb,e6025ecedb3ba34e7579a4115510ed294d4f8f6d..420a76217c891a3c489e93023f8994d729286aa0
@@@ -47,7 -47,6 +47,10 @@@ CommitTsLock                                         3
  ReplicationOriginLock                         40
  MultiXactTruncationLock                               41
  OldSnapshotTimeMapLock                                42
 -BackendRandomLock                                     43
 -LogicalRepWorkerLock                          44
 -CLogTruncationLock                                    45
 +BarrierLock                                                   43
 +NodeTableLock                                         44
 +SQueuesLock                                                   45
 +ClusterMonitorLock                                    46
++BackendRandomLock                                     47
++LogicalRepWorkerLock                          48
++CLogTruncationLock                                    49
Simple merge
index d876625166dd60ed3c4997a98402ffffb4dc45b4,3e716b1c6c7280038b457a5b355e96e61c7982b6..410c31fe9911f9f17e31e18886bebd068f977ef3
@@@ -3,8 -3,7 +3,8 @@@
   * proc.c
   *      routines to manage per-process shared memory data structure
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
  #include "access/twophase.h"
  #include "access/xact.h"
  #include "miscadmin.h"
+ #include "pgstat.h"
  #include "postmaster/autovacuum.h"
 +#ifdef PGXC
 +#include "pgxc/pgxc.h"
 +#include "pgxc/poolmgr.h"
 +#endif
  #include "replication/slot.h"
  #include "replication/syncrep.h"
+ #include "storage/condition_variable.h"
  #include "storage/standby.h"
  #include "storage/ipc.h"
  #include "storage/lmgr.h"
@@@ -377,15 -370,9 +375,16 @@@ InitProcess(void
        MyProc->backendId = InvalidBackendId;
        MyProc->databaseId = InvalidOid;
        MyProc->roleId = InvalidOid;
 +#ifdef XCP
 +      MyProc->coordId = InvalidOid;
 +      MyProc->coordPid = 0;
 +#endif
+       MyProc->isBackgroundWorker = IsBackgroundWorker;
        MyPgXact->delayChkpt = false;
        MyPgXact->vacuumFlags = 0;
 +#ifdef PGXC
 +      MyProc->isPooler = false;
 +#endif
        /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
        if (IsAutoVacuumWorkerProcess())
                MyPgXact->vacuumFlags |= PROC_IS_AUTOVACUUM;
@@@ -556,15 -543,7 +555,16 @@@ InitAuxiliaryProcess(void
        MyProc->backendId = InvalidBackendId;
        MyProc->databaseId = InvalidOid;
        MyProc->roleId = InvalidOid;
 +#ifdef XCP
 +      MyProc->coordId = InvalidOid;
 +      MyProc->coordPid = 0;
 +#endif
 +#ifdef PGXC
 +      MyProc->isPooler = false;
 +      if (IsPGXCPoolerProcess())
 +              MyProc->isPooler = true;
 +#endif
+       MyProc->isBackgroundWorker = IsBackgroundWorker;
        MyPgXact->delayChkpt = false;
        MyPgXact->vacuumFlags = 0;
        MyProc->lwWaiting = false;
index f1905d2f80a1b9845a95a81b5396029afd9ac357,28081c37654a8787e9423e75c6f768997d6872ca..6ad2d78b3efa3a25325fe704d73b4e8332b66fbf
@@@ -4,8 -4,7 +4,8 @@@
   *      support for communication destinations
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
index 95cf9847721c0c453b39efc5948c8166591e3f20,75c2d9a61d0dc067e9844f986cf23e23f724e565..a4f4884372813825856cd83394824e92bf8875b8
@@@ -3,10 -3,8 +3,10 @@@
   * postgres.c
   *      POSTGRES C Backend Interface
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *
   * IDENTIFICATION
@@@ -1162,10 -964,9 +1173,10 @@@ exec_simple_query(const char *query_str
        /*
         * Run through the raw parsetree(s) and process each one.
         */
 -      foreach(parsetree_item, parsetree_list)
 +      forboth(parsetree_item, parsetree_list, querysource_item, querysource_list)
        {
-               Node       *parsetree = (Node *) lfirst(parsetree_item);
+               RawStmt    *parsetree = lfirst_node(RawStmt, parsetree_item);
 +              char       *querysource = (char *) lfirst(querysource_item);
                bool            snapshot_set = false;
                const char *commandTag;
                char            completionTag[COMPLETION_TAG_BUFSIZE];
@@@ -5220,15 -4416,15 +5209,15 @@@ ShowUsageCommon(const char *title, stru
  
        appendStringInfoString(&str, "! system usage stats:\n");
        appendStringInfo(&str,
-                               "!\t%ld.%06ld elapsed %ld.%06ld user %ld.%06ld system sec\n",
-                                        (long) (elapse_t.tv_sec - save_t->tv_sec),
-                                        (long) (elapse_t.tv_usec - save_t->tv_usec),
+                       "!\t%ld.%06ld s user, %ld.%06ld s system, %ld.%06ld s elapsed\n",
 -                                       (long) (r.ru_utime.tv_sec - Save_r.ru_utime.tv_sec),
 -                                       (long) (r.ru_utime.tv_usec - Save_r.ru_utime.tv_usec),
 -                                       (long) (r.ru_stime.tv_sec - Save_r.ru_stime.tv_sec),
 -                                       (long) (r.ru_stime.tv_usec - Save_r.ru_stime.tv_usec),
 -                                       (long) (elapse_t.tv_sec - Save_t.tv_sec),
 -                                       (long) (elapse_t.tv_usec - Save_t.tv_usec));
 +                                       (long) (r.ru_utime.tv_sec - save_r->ru_utime.tv_sec),
 +                                       (long) (r.ru_utime.tv_usec - save_r->ru_utime.tv_usec),
 +                                       (long) (r.ru_stime.tv_sec - save_r->ru_stime.tv_sec),
-                                        (long) (r.ru_stime.tv_usec - save_r->ru_stime.tv_usec));
++                                       (long) (r.ru_stime.tv_usec - save_r->ru_stime.tv_usec),
++                                       (long) (elapse_t.tv_sec - save_t->tv_sec),
++                                       (long) (elapse_t.tv_usec - save_t->tv_usec));
        appendStringInfo(&str,
-                                        "!\t[%ld.%06ld user %ld.%06ld sys total]\n",
+                                        "!\t[%ld.%06ld s user, %ld.%06ld s system total]\n",
                                         (long) user.tv_sec,
                                         (long) user.tv_usec,
                                         (long) sys.tv_sec,
index f3e175e475d0835c58fa4918d35f8db3bc19beca,e30aeb1c7faff6eb02533dceaa99d46e35f1fa1f..134dc6dd240490182405b4f5d3b9cd638cf2db39
@@@ -3,8 -3,7 +3,8 @@@
   * pquery.c
   *      POSTGRES process query command code
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@@ -102,41 -93,8 +106,13 @@@ CreateQueryDesc(PlannedStmt *plannedstm
        qd->planstate = NULL;
        qd->totaltime = NULL;
  
-       return qd;
- }
- /*
-  * CreateUtilityQueryDesc
-  */
- QueryDesc *
- CreateUtilityQueryDesc(Node *utilitystmt,
-                                          const char *sourceText,
-                                          Snapshot snapshot,
-                                          DestReceiver *dest,
-                                          ParamListInfo params)
- {
-       QueryDesc  *qd = (QueryDesc *) palloc(sizeof(QueryDesc));
-       qd->operation = CMD_UTILITY;    /* operation */
-       qd->plannedstmt = NULL;
-       qd->utilitystmt = utilitystmt;          /* utility command */
-       qd->sourceText = sourceText;    /* query text */
-       qd->snapshot = RegisterSnapshot(snapshot);      /* snapshot */
-       qd->crosscheck_snapshot = InvalidSnapshot;      /* RI check snapshot */
-       qd->dest = dest;                        /* output dest */
-       qd->params = params;            /* parameter values passed into query */
-       qd->instrument_options = false;         /* uninteresting for utilities */
-       /* null these fields until set by ExecutorStart */
-       qd->tupDesc = NULL;
-       qd->estate = NULL;
-       qd->planstate = NULL;
-       qd->totaltime = NULL;
 +#ifdef XCP
 +      qd->squeue = NULL;
 +      qd->myindex = -1;
 +#endif
 +
+       /* not yet executed */
+       qd->already_executed = false;
  
        return qd;
  }
@@@ -421,15 -260,9 +394,14 @@@ ChoosePortalStrategy(List *stmts
                {
                        PlannedStmt *pstmt = (PlannedStmt *) stmt;
  
 +#ifdef XCP
 +                      if (list_length(pstmt->distributionRestrict) > 1)
 +                              return PORTAL_DISTRIBUTED;
 +#endif
 +
                        if (pstmt->canSetTag)
                        {
-                               if (pstmt->commandType == CMD_SELECT &&
-                                       pstmt->utilityStmt == NULL)
+                               if (pstmt->commandType == CMD_SELECT)
                                {
                                        if (pstmt->hasModifyingCTE)
                                                return PORTAL_ONE_MOD_WITH;
@@@ -646,204 -484,6 +627,205 @@@ PortalStart(Portal portal, ParamListInf
                 */
                switch (portal->strategy)
                {
 +#ifdef XCP
 +                      case PORTAL_DISTRIBUTED:
 +                              /* No special ability is needed */
 +                              eflags = 0;
 +                              /* Must set snapshot before starting executor. */
 +                              if (snapshot)
 +                                      PushActiveSnapshot(GetActiveSnapshot());
 +                              else
 +                                      PushActiveSnapshot(GetTransactionSnapshot());
 +
 +                              /*
 +                               * Create QueryDesc in portal's context; for the moment, set
 +                               * the destination to DestNone.
 +                               */
 +                              queryDesc = CreateQueryDesc((PlannedStmt *) linitial(portal->stmts),
 +                                                                                      portal->sourceText,
 +                                                                                      GetActiveSnapshot(),
 +                                                                                      InvalidSnapshot,
 +                                                                                      None_Receiver,
 +                                                                                      params,
++                                                                                      NULL,
 +                                                                                      0);
 +                              /*
 +                               * If parent node have sent down parameters, and at least one
 +                               * of them is PARAM_EXEC we should avoid "single execution"
 +                               * model. All parent nodes deliver the same values for
 +                               * PARAM_EXTERN since these values are provided by client and
 +                               * they are not changed during the query execution.
 +                               * On the conrary, values of PARAM_EXEC are results of execution
 +                               * on the parent node and in general diferent parents send to
 +                               * this node different values and executions are not equivalent.
 +                               * Since PARAM_EXECs are always at the end of the list we just
 +                               * need to check last item to figure out if there are any
 +                               * PARAM_EXECs.
 +                               * NB: Check queryDesc->plannedstmt->nParamExec > 0 is incorrect
 +                               * here since queryDesc->plannedstmt->nParamExec may be used
 +                               * just to allocate space for them and no actual values passed.
 +                               */
 +                              if (queryDesc->plannedstmt->nParamRemote > 0 &&
 +                                              queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC)
 +                              {
 +                                      int        *consMap;
 +                                      int             len;
 +                                      ListCell   *lc;
 +                                      int             i;
 +                                      Locator    *locator;
 +                                      Oid                     keytype;
 +                                      DestReceiver *dest;
 +
 +                                      len = list_length(queryDesc->plannedstmt->distributionNodes);
 +                                      consMap = (int *) palloc0(len * sizeof(int));
 +                                      queryDesc->squeue = NULL;
 +                                      queryDesc->myindex = -1;
 +                                      PGXC_PARENT_NODE_ID = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE,
 +                                                                                                         &PGXC_PARENT_NODE_TYPE);
 +                                      i = 0;
 +                                      foreach(lc, queryDesc->plannedstmt->distributionNodes)
 +                                      {
 +                                              if (PGXC_PARENT_NODE_ID == lfirst_int(lc))
 +                                                      consMap[i] = SQ_CONS_SELF;
 +                                              else
 +                                                      consMap[i] = SQ_CONS_NONE;
 +                                              i++;
 +                                      }
 +                                      /*
 +                                       * Multiple executions of the RemoteSubplan may lead to name
 +                                       * conflict of SharedQueue, if the subplan has more
 +                                       * RemoteSubplan nodes in the execution plan tree.
 +                                       * We need to make them unique.
 +                                       */
 +                                      RemoteSubplanMakeUnique(
 +                                                      (Node *) queryDesc->plannedstmt->planTree,
 +                                                      PGXC_PARENT_NODE_ID);
 +                                      /*
 +                                       * Call ExecutorStart to prepare the plan for execution
 +                                       */
 +                                      ExecutorStart(queryDesc, eflags);
 +
 +                                      /*
 +                                       * Set up locator if result distribution is requested
 +                                       */
 +                                      keytype = queryDesc->plannedstmt->distributionKey == InvalidAttrNumber ?
 +                                                      InvalidOid :
 +                                                      queryDesc->tupDesc->attrs[queryDesc->plannedstmt->distributionKey-1]->atttypid;
 +                                      locator = createLocator(
 +                                                      queryDesc->plannedstmt->distributionType,
 +                                                      RELATION_ACCESS_INSERT,
 +                                                      keytype,
 +                                                      LOCATOR_LIST_INT,
 +                                                      len,
 +                                                      consMap,
 +                                                      NULL,
 +                                                      false);
 +                                      dest = CreateDestReceiver(DestProducer);
 +                                      SetProducerDestReceiverParams(dest,
 +                                                      queryDesc->plannedstmt->distributionKey,
 +                                                      locator, queryDesc->squeue);
 +                                      queryDesc->dest = dest;
 +                              }
 +                              else
 +                              {
 +                                      int        *consMap;
 +                                      int             len;
 +
 +                                      /* Distributed data requested, bind shared queue for data exchange */
 +                                      len = list_length(queryDesc->plannedstmt->distributionNodes);
 +                                      consMap = (int *) palloc(len * sizeof(int));
 +                                      queryDesc->squeue = SharedQueueBind(portal->name,
 +                                                              queryDesc->plannedstmt->distributionRestrict,
 +                                                              queryDesc->plannedstmt->distributionNodes,
 +                                                              &queryDesc->myindex, consMap);
 +                                      if (queryDesc->myindex == -1)
 +                                      {
 +                                              /* producer */
 +                                              Locator    *locator;
 +                                              Oid                     keytype;
 +                                              DestReceiver *dest;
 +
 +                                              PG_TRY();
 +                                              {
 +                                                      /*
 +                                                       * Call ExecutorStart to prepare the plan for execution
 +                                                       */
 +                                                      ExecutorStart(queryDesc, eflags);
 +                                              }
 +                                              PG_CATCH();
 +                                              {
 +                                                      /* Ensure SharedQueue is released */
 +                                                      SharedQueueUnBind(queryDesc->squeue, true);
 +                                                      queryDesc->squeue = NULL;
 +                                                      PG_RE_THROW();
 +                                              }
 +                                              PG_END_TRY();
 +
 +                                              /*
 +                                               * This tells PortalCleanup to shut down the executor
 +                                               */
 +                                              portal->queryDesc = queryDesc;
 +
 +                                              /*
 +                                               * Some basic sanity checking against invalid remote plans.
 +                                               */
 +                                              Assert((queryDesc->plannedstmt->distributionKey == InvalidAttrNumber) ||
 +                                                         (queryDesc->plannedstmt->distributionKey <= queryDesc->tupDesc->natts));
 +
 +                                              /*
 +                                               * Set up locator if result distribution is requested
 +                                               */
 +                                              keytype = queryDesc->plannedstmt->distributionKey == InvalidAttrNumber ?
 +                                                              InvalidOid :
 +                                                              queryDesc->tupDesc->attrs[queryDesc->plannedstmt->distributionKey-1]->atttypid;
 +                                              locator = createLocator(
 +                                                              queryDesc->plannedstmt->distributionType,
 +                                                              RELATION_ACCESS_INSERT,
 +                                                              keytype,
 +                                                              LOCATOR_LIST_INT,
 +                                                              len,
 +                                                              consMap,
 +                                                              NULL,
 +                                                              false);
 +                                              dest = CreateDestReceiver(DestProducer);
 +                                              SetProducerDestReceiverParams(dest,
 +                                                              queryDesc->plannedstmt->distributionKey,
 +                                                              locator, queryDesc->squeue);
 +                                              queryDesc->dest = dest;
 +
 +                                              addProducingPortal(portal);
 +                                      }
 +                                      else
 +                                      {
 +                                              /*
 +                                               * We do not need to initialize executor, but need
 +                                               * a tuple descriptor
 +                                               */
 +                                              queryDesc->tupDesc = ExecCleanTypeFromTL(
 +                                                              queryDesc->plannedstmt->planTree->targetlist,
 +                                                              false);
 +                                      }
 +                                      pfree(consMap);
 +                              }
 +
 +                              portal->queryDesc = queryDesc;
 +
 +                              /*
 +                               * Remember tuple descriptor (computed by ExecutorStart)
 +                               */
 +                              portal->tupDesc = queryDesc->tupDesc;
 +
 +                              /*
 +                               * Reset cursor position data to "start of query"
 +                               */
 +                              portal->atStart = true;
 +                              portal->atEnd = false;  /* allow fetches */
 +                              portal->portalPos = 0;
 +
 +                              PopActiveSnapshot();
 +                              break;
 +#endif
 +
                        case PORTAL_ONE_SELECT:
  
                                /* Must set snapshot before starting executor. */
@@@ -1716,10 -1180,8 +1703,11 @@@ PortalRunUtility(Portal portal, Planned
                                   portal->sourceText,
                           isTopLevel ? PROCESS_UTILITY_TOPLEVEL : PROCESS_UTILITY_QUERY,
                                   portal->portalParams,
+                                  portal->queryEnv,
                                   dest,
 +#ifdef PGXC
 +                                 false,
 +#endif /* PGXC */
                                   completionTag);
  
        /* Some utility statements may change context on us */
@@@ -1834,14 -1287,8 +1819,15 @@@ PortalRunMulti(Portal portal
                                ProcessQuery(pstmt,
                                                         portal->sourceText,
                                                         portal->portalParams,
+                                                        portal->queryEnv,
                                                         dest, completionTag);
 +#ifdef PGXC
 +                              /* it's special for INSERT */
 +                              if (IS_PGXC_COORDINATOR &&
 +                                      pstmt->commandType == CMD_INSERT)
 +                                      HandleCmdComplete(pstmt->commandType, &combine,
 +                                                      completionTag, strlen(completionTag));
 +#endif
                        }
                        else
                        {
@@@ -2265,355 -1707,3 +2252,355 @@@ DoPortalRewind(Portal portal
        portal->atEnd = false;
        portal->portalPos = 0;
  }
-                               ExecutorRun(queryDesc, ForwardScanDirection, PRODUCE_TUPLES);
 +
 +#ifdef XCP
 +/*
 + * Execute the specified portal's query and distribute tuples to consumers.
 + * Returs 1 if portal should keep producing, 0 if all consumers have enough
 + * rows in the buffers to pause producing temporarily, -1 if the query is
 + * completed.
 + */
 +int
 +AdvanceProducingPortal(Portal portal, bool can_wait)
 +{
 +      Portal          saveActivePortal;
 +      ResourceOwner saveResourceOwner;
 +      MemoryContext savePortalContext;
 +      MemoryContext oldContext;
 +      QueryDesc  *queryDesc;
 +      SharedQueue squeue;
 +      DestReceiver *treceiver;
 +      int                     result;
 +
 +      queryDesc = PortalGetQueryDesc(portal);
 +      squeue = queryDesc->squeue;
 +
 +      Assert(queryDesc);
 +      /* Make sure the portal is producing */
 +      Assert(squeue && queryDesc->myindex == -1);
 +      /* Make sure there is proper receiver */
 +      Assert(queryDesc->dest && queryDesc->dest->mydest == DestProducer);
 +
 +      /*
 +       * Set up global portal context pointers.
 +       */
 +      saveActivePortal = ActivePortal;
 +      saveResourceOwner = CurrentResourceOwner;
 +      savePortalContext = PortalContext;
 +      PG_TRY();
 +      {
 +              ActivePortal = portal;
 +              CurrentResourceOwner = portal->resowner;
 +              PortalContext = PortalGetHeapMemory(portal);
 +
 +              oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(portal));
 +
 +              /*
 +               * That is the first pass thru if the hold store is not initialized yet,
 +               * Need to initialize stuff.
 +               */
 +              if (portal->holdStore == NULL && portal->status != PORTAL_FAILED)
 +              {
 +                      int idx;
 +                      char storename[64];
 +
 +                      PortalCreateProducerStore(portal);
 +                      treceiver = CreateDestReceiver(DestTuplestore);
 +                      SetTuplestoreDestReceiverParams(treceiver,
 +                                                                                      portal->holdStore,
 +                                                                                      portal->holdContext,
 +                                                                                      false);
 +                      SetSelfConsumerDestReceiver(queryDesc->dest, treceiver);
 +                      SetProducerTempMemory(queryDesc->dest, portal->tmpContext);
 +                      snprintf(storename, 64, "%s producer store", portal->name);
 +                      tuplestore_collect_stat(portal->holdStore, storename);
 +                      /*
 +                       * Tuplestore does not clear eof flag on the active read pointer,
 +                       * causing the store is always in EOF state once reached when
 +                       * there is a single read pointer. We do not want behavior like this
 +                       * and workaround by using secondary read pointer.
 +                       * Primary read pointer (0) is active when we are writing to
 +                       * the tuple store, secondary read pointer is for reading, and its
 +                       * eof flag is cleared if a tuple is written to the store.
 +                       * We know the extra read pointer has index 1, so do not store it.
 +                       */
 +                      idx = tuplestore_alloc_read_pointer(portal->holdStore, 0);
 +                      Assert(idx == 1);
 +              }
 +
 +              if (queryDesc->estate && !queryDesc->estate->es_finished &&
 +                              portal->status != PORTAL_FAILED)
 +              {
 +                      /*
 +                       * If the portal's hold store has tuples available for read and
 +                       * all consumer queues are not empty we skip advancing the portal
 +                       * (pause it) to prevent buffering too many rows at the producer.
 +                       * NB just created portal store would not be in EOF state, but in
 +                       * this case consumer queues will be empty and do not allow
 +                       * erroneous pause. After the first call to AdvanceProducingPortal
 +                       * portal will try to read the hold store and EOF flag will be set
 +                       * correctly.
 +                       */
 +                      tuplestore_select_read_pointer(portal->holdStore, 1);
 +                      if (!tuplestore_ateof(portal->holdStore) &&
 +                                      SharedQueueCanPause(squeue))
 +                              result = 0;
 +                      else
 +                              result = 1;
 +                      tuplestore_select_read_pointer(portal->holdStore, 0);
 +
 +                      if (result)
 +                      {
 +                              /* Execute query and dispatch tuples via dest receiver */
 +#define PRODUCE_TUPLES 100
 +                              PushActiveSnapshot(queryDesc->snapshot);
++                              ExecutorRun(queryDesc, ForwardScanDirection, PRODUCE_TUPLES, true);
 +                              PopActiveSnapshot();
 +
 +                              if (queryDesc->estate->es_processed < PRODUCE_TUPLES)
 +                              {
 +                                      /*
 +                                       * Finish the executor, but we may still have some tuples
 +                                       * in the local storages.
 +                                       * We should keep trying pushing them into the squeue, so do not
 +                                       * remove the portal from the list of producers.
 +                                       */
 +                                      ExecutorFinish(queryDesc);
 +                              }
 +                      }
 +              }
 +
 +              /* Try to dump local tuplestores */
 +              if ((queryDesc->estate == NULL || queryDesc->estate->es_finished) &&
 +                              ProducerReceiverPushBuffers(queryDesc->dest))
 +              {
 +                      if (can_wait && queryDesc->estate == NULL)
 +                      {
 +                              (*queryDesc->dest->rDestroy) (queryDesc->dest);
 +                              queryDesc->dest = NULL;
 +                              portal->queryDesc = NULL;
 +                              squeue = NULL;
 +
 +                              removeProducingPortal(portal);
 +                              FreeQueryDesc(queryDesc);
 +
 +                              /*
 +                               * Current context is the portal context, which is going
 +                               * to be deleted
 +                               */
 +                              MemoryContextSwitchTo(TopTransactionContext);
 +
 +                              ActivePortal = saveActivePortal;
 +                              CurrentResourceOwner = saveResourceOwner;
 +                              PortalContext = savePortalContext;
 +
 +                              if (portal->resowner)
 +                              {
 +                                      bool            isCommit = (portal->status != PORTAL_FAILED);
 +
 +                                      ResourceOwnerRelease(portal->resowner,
 +                                                                               RESOURCE_RELEASE_BEFORE_LOCKS,
 +                                                                               isCommit, false);
 +                                      ResourceOwnerRelease(portal->resowner,
 +                                                                               RESOURCE_RELEASE_LOCKS,
 +                                                                               isCommit, false);
 +                                      ResourceOwnerRelease(portal->resowner,
 +                                                                               RESOURCE_RELEASE_AFTER_LOCKS,
 +                                                                               isCommit, false);
 +                                      ResourceOwnerDelete(portal->resowner);
 +                              }
 +                              portal->resowner = NULL;
 +
 +                              /*
 +                               * Delete tuplestore if present.  We should do this even under error
 +                               * conditions; since the tuplestore would have been using cross-
 +                               * transaction storage, its temp files need to be explicitly deleted.
 +                               */
 +                              if (portal->holdStore)
 +                              {
 +                                      MemoryContext oldcontext;
 +
 +                                      oldcontext = MemoryContextSwitchTo(portal->holdContext);
 +                                      tuplestore_end(portal->holdStore);
 +                                      MemoryContextSwitchTo(oldcontext);
 +                                      portal->holdStore = NULL;
 +                              }
 +
 +                              /* delete tuplestore storage, if any */
 +                              if (portal->holdContext)
 +                                      MemoryContextDelete(portal->holdContext);
 +
 +                              /* release subsidiary storage */
 +                              MemoryContextDelete(PortalGetHeapMemory(portal));
 +
 +                              /* release portal struct (it's in PortalMemory) */
 +                              pfree(portal);
 +                      }
 +                      /* report portal is not producing */
 +                      result = -1;
 +              }
 +              else
 +              {
 +                      result = SharedQueueCanPause(queryDesc->squeue) ? 0 : 1;
 +              }
 +      }
 +      PG_CATCH();
 +      {
 +              /* Uncaught error while executing portal: mark it dead */
 +              portal->status = PORTAL_FAILED;
 +              /*
 +               * Reset producer to allow consumers to finish, so receiving node will
 +               * handle the error.
 +               */
 +              if (squeue)
 +                      SharedQueueReset(squeue, -1);
 +
 +              /* Restore global vars and propagate error */
 +              ActivePortal = saveActivePortal;
 +              CurrentResourceOwner = saveResourceOwner;
 +              PortalContext = savePortalContext;
 +
 +              PG_RE_THROW();
 +      }
 +      PG_END_TRY();
 +
 +      MemoryContextSwitchTo(oldContext);
 +
 +      ActivePortal = saveActivePortal;
 +      CurrentResourceOwner = saveResourceOwner;
 +      PortalContext = savePortalContext;
 +
 +      return result;
 +}
 +
 +
 +/*
 + * Iterate over producing portal, determine already closed, and clean them up,
 + * waiting while consumers finish their work. Closed producers should be
 + * cleaned up and resources are released before proceeding with handling of
 + * next request.
 + */
 +void
 +cleanupClosedProducers(void)
 +{
 +      ListCell   *lc = list_head(getProducingPortals());
 +      while (lc)
 +      {
 +              Portal p = (Portal) lfirst(lc);
 +              QueryDesc  *queryDesc = PortalGetQueryDesc(p);
 +              SharedQueue squeue = queryDesc->squeue;
 +
 +              /*
 +               * Get next already, because next call may remove cell from
 +               * the list and invalidate next reference
 +               */
 +              lc = lnext(lc);
 +
 +              /* When portal is closed executor state is not set */
 +              if (queryDesc->estate == NULL)
 +              {
 +                      /*
 +                       * Set up global portal context pointers.
 +                       */
 +                      Portal          saveActivePortal = ActivePortal;
 +                      ResourceOwner saveResourceOwner = CurrentResourceOwner;
 +                      MemoryContext savePortalContext = PortalContext;
 +
 +                      PG_TRY();
 +                      {
 +                              MemoryContext oldContext;
 +                              ActivePortal = p;
 +                              CurrentResourceOwner = p->resowner;
 +                              PortalContext = PortalGetHeapMemory(p);
 +
 +                              oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(p));
 +
 +                              (*queryDesc->dest->rDestroy) (queryDesc->dest);
 +                              queryDesc->dest = NULL;
 +                              p->queryDesc = NULL;
 +                              squeue = NULL;
 +
 +                              removeProducingPortal(p);
 +                              FreeQueryDesc(queryDesc);
 +
 +                              /*
 +                               * Current context is the portal context, which is going
 +                               * to be deleted
 +                               */
 +                              MemoryContextSwitchTo(TopTransactionContext);
 +
 +                              ActivePortal = saveActivePortal;
 +                              CurrentResourceOwner = saveResourceOwner;
 +                              PortalContext = savePortalContext;
 +
 +                              if (p->resowner)
 +                              {
 +                                      bool            isCommit = (p->status != PORTAL_FAILED);
 +
 +                                      ResourceOwnerRelease(p->resowner,
 +                                                                               RESOURCE_RELEASE_BEFORE_LOCKS,
 +                                                                               isCommit, false);
 +                                      ResourceOwnerRelease(p->resowner,
 +                                                                               RESOURCE_RELEASE_LOCKS,
 +                                                                               isCommit, false);
 +                                      ResourceOwnerRelease(p->resowner,
 +                                                                               RESOURCE_RELEASE_AFTER_LOCKS,
 +                                                                               isCommit, false);
 +                                      ResourceOwnerDelete(p->resowner);
 +                              }
 +                              p->resowner = NULL;
 +
 +                              /*
 +                               * Delete tuplestore if present.  We should do this even under error
 +                               * conditions; since the tuplestore would have been using cross-
 +                               * transaction storage, its temp files need to be explicitly deleted.
 +                               */
 +                              if (p->holdStore)
 +                              {
 +                                      MemoryContext oldcontext;
 +
 +                                      oldcontext = MemoryContextSwitchTo(p->holdContext);
 +                                      tuplestore_end(p->holdStore);
 +                                      MemoryContextSwitchTo(oldcontext);
 +                                      p->holdStore = NULL;
 +                              }
 +
 +                              /* delete tuplestore storage, if any */
 +                              if (p->holdContext)
 +                                      MemoryContextDelete(p->holdContext);
 +
 +                              /* release subsidiary storage */
 +                              MemoryContextDelete(PortalGetHeapMemory(p));
 +
 +                              /* release portal struct (it's in PortalMemory) */
 +                              pfree(p);
 +
 +                              MemoryContextSwitchTo(oldContext);
 +                      }
 +                      PG_CATCH();
 +                      {
 +                              /* Uncaught error while executing portal: mark it dead */
 +                              p->status = PORTAL_FAILED;
 +                              /*
 +                               * Reset producer to allow consumers to finish, so receiving node will
 +                               * handle the error.
 +                               */
 +                              if (squeue)
 +                                      SharedQueueReset(squeue, -1);
 +
 +                              /* Restore global vars and propagate error */
 +                              ActivePortal = saveActivePortal;
 +                              CurrentResourceOwner = saveResourceOwner;
 +                              PortalContext = savePortalContext;
 +
 +                              PG_RE_THROW();
 +                      }
 +                      PG_END_TRY();
 +
 +                      ActivePortal = saveActivePortal;
 +                      CurrentResourceOwner = saveResourceOwner;
 +                      PortalContext = savePortalContext;
 +              }
 +      }
 +}
 +#endif
index 7680a6451a28665369b2d8f8a8abea89565fc956,1e941fbd600276b9c99b81540b369f9ef058e642..632d51f3acae177edd6c08103bac5100560a4ee4
@@@ -5,10 -5,8 +5,10 @@@
   *      commands.  At one time acted as an interface between the Lisp and C
   *      systems.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *
   * IDENTIFICATION
@@@ -111,18 -78,10 +114,19 @@@ static void ProcessUtilitySlow(ParseSta
                                   const char *queryString,
                                   ProcessUtilityContext context,
                                   ParamListInfo params,
+                                  QueryEnvironment *queryEnv,
                                   DestReceiver *dest,
 +                                 bool sentToRemote,
                                   char *completionTag);
 +
 +#ifdef PGXC
 +static void ExecDropStmt(DropStmt *stmt,
 +                                      const char *queryString,
 +                                      bool sentToRemote,
 +                                      bool isTopLevel);
 +#else
  static void ExecDropStmt(DropStmt *stmt, bool isTopLevel);
 +#endif
  
  
  /*
@@@ -363,12 -336,12 +381,15 @@@ ProcessUtility(PlannedStmt *pstmt
                           const char *queryString,
                           ProcessUtilityContext context,
                           ParamListInfo params,
+                          QueryEnvironment *queryEnv,
                           DestReceiver *dest,
 +#ifdef PGXC
 +                         bool sentToRemote,
 +#endif
                           char *completionTag)
  {
+       Assert(IsA(pstmt, PlannedStmt));
+       Assert(pstmt->commandType == CMD_UTILITY);
        Assert(queryString != NULL);    /* required as of 8.4 */
  
        /*
         * call standard_ProcessUtility().
         */
        if (ProcessUtility_hook)
-               (*ProcessUtility_hook) (parsetree, queryString,
-                                                               context, params,
+               (*ProcessUtility_hook) (pstmt, queryString,
+                                                               context, params, queryEnv,
 -                                                              dest, completionTag);
 +                                                              dest,
- #ifdef PGXC
 +                                                              sentToRemote,
- #endif
 +                                                              completionTag);
        else
-               standard_ProcessUtility(parsetree, queryString,
-                                                               context, params,
+               standard_ProcessUtility(pstmt, queryString,
+                                                               context, params, queryEnv,
 -                                                              dest, completionTag);
 +                                                              dest,
- #ifdef PGXC
 +                                                              sentToRemote,
- #endif
 +                                                              completionTag);
  }
  
  /*
@@@ -410,45 -375,13 +427,48 @@@ standard_ProcessUtility(PlannedStmt *ps
                                                const char *queryString,
                                                ProcessUtilityContext context,
                                                ParamListInfo params,
+                                               QueryEnvironment *queryEnv,
                                                DestReceiver *dest,
 +#ifdef PGXC
 +                                              bool sentToRemote,
 +#endif
                                                char *completionTag)
  {
+       Node       *parsetree = pstmt->utilityStmt;
        bool            isTopLevel = (context == PROCESS_UTILITY_TOPLEVEL);
+       ParseState *pstate;
 +#ifdef PGXC
 +      /*
 +       * For more detail see comments in function pgxc_lock_for_backup.
 +       *
 +       * Consider the following scenario:
 +       * Imagine a two coordinator cluster CO1, CO2
 +       * Suppose a client connected to CO1 issues select pgxc_lock_for_backup()
 +       * Now assume that a client connected to CO2 issues a create table
 +       * select pgxc_lock_for_backup() would try to acquire the advisory lock
 +       * in exclusive mode, whereas create table would try to acquire the same
 +       * lock in shared mode. Both these requests will always try acquire the
 +       * lock in the same order i.e. they would both direct the request first to
 +       * CO1 and then to CO2. One of the two requests would therefore pass
 +       * and the other would fail.
 +       *
 +       * Consider another scenario:
 +       * Suppose we have a two coordinator cluster CO1 and CO2
 +       * Assume one client connected to each coordinator
 +       * Further assume one client starts a transaction
 +       * and issues a DDL. This is an unfinished transaction.
 +       * Now assume the second client issues
 +       * select pgxc_lock_for_backup()
 +       * This request would fail because the unfinished transaction
 +       * would already hold the advisory lock.
 +       */
 +      if (IS_PGXC_LOCAL_COORDINATOR && IsNormalProcessingMode())
 +      {
 +              /* Is the statement a prohibited one? */
 +              if (!IsStmtAllowedInLockedMode(parsetree, queryString))
 +                      pgxc_lock_for_utility_stmt(parsetree);
 +      }
 +#endif
  
        check_xact_readonly(parsetree);
  
  
                case T_CreatedbStmt:
                        /* no event triggers for global objects */
 +#ifdef PGXC
 +                      if (IS_PGXC_LOCAL_COORDINATOR)
 +#endif
                        PreventTransactionChain(isTopLevel, "CREATE DATABASE");
-                       createdb((CreatedbStmt *) parsetree);
+                       createdb(pstate, (CreatedbStmt *) parsetree);
 +#ifdef PGXC
 +                      if (IS_PGXC_LOCAL_COORDINATOR)
 +                              ExecUtilityWithMessage(queryString, sentToRemote, false);
 +#endif
                        break;
  
                case T_AlterDatabaseStmt:
                        /* no event triggers for global objects */
-                       AlterDatabase((AlterDatabaseStmt *) parsetree, isTopLevel);
+                       AlterDatabase(pstate, (AlterDatabaseStmt *) parsetree, isTopLevel);
 +#ifdef PGXC
 +                      if (IS_PGXC_LOCAL_COORDINATOR)
 +                      {
 +                              /*
 +                               * If this is not a SET TABLESPACE statement, just propagate the
 +                               * cmd as usual.
 +                               */
 +                              if (!IsSetTableSpace((AlterDatabaseStmt*) parsetree))
 +                                      ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
 +                              else
 +                                      ExecUtilityWithMessage(queryString, sentToRemote, false);
 +                      }
 +#endif
                        break;
  
                case T_AlterDatabaseSetStmt:
                         */
                case T_CreateRoleStmt:
                        /* no event triggers for global objects */
-                       CreateRole((CreateRoleStmt *) parsetree);
+                       CreateRole(pstate, (CreateRoleStmt *) parsetree);
 +#ifdef PGXC
 +                      if (IS_PGXC_LOCAL_COORDINATOR)
 +                              ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
 +#endif
                        break;
  
                case T_AlterRoleStmt:
                                GrantStmt  *stmt = (GrantStmt *) parsetree;
  
                                if (EventTriggerSupportsGrantObjectType(stmt->objtype))
-                                       ProcessUtilitySlow(parsetree, queryString,
-                                                                          context, params,
+                                       ProcessUtilitySlow(pstate, pstmt, queryString,
+                                                                          context, params, queryEnv,
 -                                                                         dest, completionTag);
 +                                                                         dest,
 +                                                                         sentToRemote,
 +                                                                         completionTag);
                                else
-                                       ExecuteGrantStmt((GrantStmt *) parsetree);
+                                       ExecuteGrantStmt(stmt);
 +#ifdef PGXC
 +                              if (IS_PGXC_LOCAL_COORDINATOR)
 +                              {
 +                                      RemoteQueryExecType remoteExecType = EXEC_ON_ALL_NODES;
 +                                      GrantStmt *stmt = (GrantStmt *) parsetree;
 +                                      bool is_temp = false;
 +
 +                                      /* Launch GRANT on Coordinator if object is a sequence */
 +                                      if ((stmt->objtype == ACL_OBJECT_RELATION &&
 +                                                              stmt->targtype == ACL_TARGET_OBJECT))
 +                                      {
 +                                              /*
 +                                               * In case object is a relation, differentiate the case
 +                                               * of a sequence, a view and a table
 +                                               */
 +                                              ListCell   *cell;
 +                                              /* Check the list of objects */
 +                                              bool            first = true;
 +                                              RemoteQueryExecType type_local = remoteExecType;
 +
 +                                              foreach (cell, stmt->objects)
 +                                              {
 +                                                      RangeVar   *relvar = (RangeVar *) lfirst(cell);
 +                                                      Oid                     relid = RangeVarGetRelid(relvar, NoLock, true);
 +
 +                                                      /* Skip if object does not exist */
 +                                                      if (!OidIsValid(relid))
 +                                                              continue;
 +
 +                                                      remoteExecType = ExecUtilityFindNodesRelkind(relid, &is_temp);
 +
 +                                                      /* Check if object node type corresponds to the first one */
 +                                                      if (first)
 +                                                      {
 +                                                              type_local = remoteExecType;
 +                                                              first = false;
 +                                                      }
 +                                                      else
 +                                                      {
 +                                                              if (type_local != remoteExecType)
 +                                                                      ereport(ERROR,
 +                                                                                      (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 +                                                                                       errmsg("PGXC does not support GRANT on multiple object types"),
 +                                                                                       errdetail("Grant VIEW/TABLE with separate queries")));
 +                                                      }
 +                                              }
 +                                      }
 +                                      ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, remoteExecType, is_temp);
 +                              }
 +#endif
                        }
                        break;
  
                                DropStmt   *stmt = (DropStmt *) parsetree;
  
                                if (EventTriggerSupportsObjectType(stmt->removeType))
-                                       ProcessUtilitySlow(parsetree, queryString,
-                                                                          context, params,
+                                       ProcessUtilitySlow(pstate, pstmt, queryString,
+                                                                          context, params, queryEnv,
 -                                                                         dest, completionTag);
 +                                                                         dest,
 +                                                                         sentToRemote,
 +                                                                         completionTag);
                                else
 -                                      ExecDropStmt(stmt, isTopLevel);
 +                                      ExecDropStmt(stmt, queryString, sentToRemote, isTopLevel);
                        }
                        break;
  
                                RenameStmt *stmt = (RenameStmt *) parsetree;
  
                                if (EventTriggerSupportsObjectType(stmt->renameType))
-                                       ProcessUtilitySlow(parsetree, queryString,
-                                                                          context, params,
+                                       ProcessUtilitySlow(pstate, pstmt, queryString,
+                                                                          context, params, queryEnv,
 -                                                                         dest, completionTag);
 +                                                                         dest,
 +                                                                         sentToRemote,
 +                                                                         completionTag);
                                else
                                        ExecRenameStmt(stmt);
                        }
                                AlterObjectDependsStmt *stmt = (AlterObjectDependsStmt *) parsetree;
  
                                if (EventTriggerSupportsObjectType(stmt->objectType))
-                                       ProcessUtilitySlow(parsetree, queryString,
-                                                                          context, params,
+                                       ProcessUtilitySlow(pstate, pstmt, queryString,
+                                                                          context, params, queryEnv,
 -                                                                         dest, completionTag);
 +                                                                         dest,
 +                                                                         sentToRemote,
 +                                                                         completionTag);
                                else
                                        ExecAlterObjectDependsStmt(stmt, NULL);
                        }
                                AlterObjectSchemaStmt *stmt = (AlterObjectSchemaStmt *) parsetree;
  
                                if (EventTriggerSupportsObjectType(stmt->objectType))
-                                       ProcessUtilitySlow(parsetree, queryString,
-                                                                          context, params,
+                                       ProcessUtilitySlow(pstate, pstmt, queryString,
+                                                                          context, params, queryEnv,
 -                                                                         dest, completionTag);
 +                                                                         dest,
 +                                                                         sentToRemote,
 +                                                                         completionTag);
                                else
                                        ExecAlterObjectSchemaStmt(stmt, NULL);
                        }
                                AlterOwnerStmt *stmt = (AlterOwnerStmt *) parsetree;
  
                                if (EventTriggerSupportsObjectType(stmt->objectType))
-                                       ProcessUtilitySlow(parsetree, queryString,
-                                                                          context, params,
+                                       ProcessUtilitySlow(pstate, pstmt, queryString,
+                                                                          context, params, queryEnv,
 -                                                                         dest, completionTag);
 +                                                                         dest,
 +                                                                         sentToRemote,
 +                                                                         completionTag);
                                else
                                        ExecAlterOwnerStmt(stmt);
 +
 +#ifdef PGXC
 +                              if (IS_PGXC_LOCAL_COORDINATOR)
 +                                      ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
 +#endif
                        }
                        break;
  
                                CommentStmt *stmt = (CommentStmt *) parsetree;
  
                                if (EventTriggerSupportsObjectType(stmt->objtype))
-                                       ProcessUtilitySlow(parsetree, queryString,
-                                                                          context, params,
+                                       ProcessUtilitySlow(pstate, pstmt, queryString,
+                                                                          context, params, queryEnv,
 -                                                                         dest, completionTag);
 +                                                                         dest,
 +                                                                         sentToRemote,
 +                                                                         completionTag);
                                else
-                                       CommentObject((CommentStmt *) parsetree);
+                                       CommentObject(stmt);
+                               break;
                        }
 +#ifdef PGXC
 +                      {
 +                              /* Comment objects depending on their object and temporary types */
 +                              if (IS_PGXC_LOCAL_COORDINATOR)
 +                              {
 +                                      bool is_temp = false;
 +                                      CommentStmt *stmt = (CommentStmt *) parsetree;
 +                                      RemoteQueryExecType exec_type = GetNodesForCommentUtility(stmt, &is_temp);
 +                                      ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, exec_type, is_temp);
 +                              }
 +                      }
 +#endif
 +                      break;
  
                case T_SecLabelStmt:
                        {
                                SecLabelStmt *stmt = (SecLabelStmt *) parsetree;
  
                                if (EventTriggerSupportsObjectType(stmt->objtype))
-                                       ProcessUtilitySlow(parsetree, queryString,
-                                                                          context, params,
+                                       ProcessUtilitySlow(pstate, pstmt, queryString,
+                                                                          context, params, queryEnv,
 -                                                                         dest, completionTag);
 +                                                                         dest,
 +                                                                         sentToRemote,
 +                                                                         completionTag);
                                else
                                        ExecSecLabelStmt(stmt);
                                break;
  
                default:
                        /* All other statement types have event trigger support */
-                       ProcessUtilitySlow(parsetree, queryString,
-                                                          context, params,
+                       ProcessUtilitySlow(pstate, pstmt, queryString,
+                                                          context, params, queryEnv,
 -                                                         dest, completionTag);
 +                                                         dest,
 +                                                         sentToRemote,
 +                                                         completionTag);
                        break;
        }
+       free_parsestate(pstate);
  }
  
  /*
@@@ -1476,10 -945,11 +1497,12 @@@ ProcessUtilitySlow(ParseState *pstate
                                   const char *queryString,
                                   ProcessUtilityContext context,
                                   ParamListInfo params,
+                                  QueryEnvironment *queryEnv,
                                   DestReceiver *dest,
 +                                 bool sentToRemote,
                                   char *completionTag)
  {
+       Node       *parsetree = pstmt->utilityStmt;
        bool            isTopLevel = (context == PROCESS_UTILITY_TOPLEVEL);
        bool            isCompleteQuery = (context <= PROCESS_UTILITY_QUERY);
        bool            needCleanup;
                                 * relation and attribute manipulation
                                 */
                        case T_CreateSchemaStmt:
- #ifdef PGXC
-                               CreateSchemaCommand((CreateSchemaStmt *) parsetree,
-                                                                       queryString, sentToRemote);
- #else                         
                                CreateSchemaCommand((CreateSchemaStmt *) parsetree,
-                                                                       queryString);
- #endif                                
 -                                                                      queryString,
++                                                                      queryString, sentToRemote,
+                                                                       pstmt->stmt_location,
+                                                                       pstmt->stmt_len);
  
                                /*
                                 * EventTriggerCollectSimpleCommand called by
                                                                                   queryString,
                                                                                   PROCESS_UTILITY_SUBCOMMAND,
                                                                                   params,
+                                                                                  NULL,
                                                                                   None_Receiver,
 +#ifdef PGXC
 +                                                                                 true,
 +#endif                                                                                
                                                                                   NULL);
                                                }
  
                                                                                           queryString,
                                                                                           PROCESS_UTILITY_SUBCOMMAND,
                                                                                           params,
+                                                                                          NULL,
                                                                                           None_Receiver,
 +#ifdef PGXC
 +                                                                                         true,
 +#endif /* PGXC */
                                                                                           NULL);
                                                                EventTriggerAlterTableStart(parsetree);
                                                                EventTriggerAlterTableRelid(relid);
                                break;
  
                        case T_CreateExtensionStmt:
-                               address = CreateExtension((CreateExtensionStmt *) parsetree);
+                               address = CreateExtension(pstate, (CreateExtensionStmt *) parsetree);
 +#ifdef PGXC
 +                              if (IS_PGXC_LOCAL_COORDINATOR)
 +                                      ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
 +#endif
                                break;
  
                        case T_AlterExtensionStmt:
-                               address = ExecAlterExtensionStmt((AlterExtensionStmt *) parsetree);
+                               address = ExecAlterExtensionStmt(pstate, (AlterExtensionStmt *) parsetree);
 +#ifdef PGXC
 +                              if (IS_PGXC_LOCAL_COORDINATOR)
 +                                      ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
 +#endif
                                break;
  
                        case T_AlterExtensionContentsStmt:
                                break;
  
                        case T_AlterEnumStmt:           /* ALTER TYPE (enum) */
-                               address = AlterEnum((AlterEnumStmt *) parsetree, isTopLevel);
+                               address = AlterEnum((AlterEnumStmt *) parsetree);
 +#ifdef PGXC
 +                              /*
 +                               * In this case force autocommit, this transaction cannot be launched
 +                               * inside a transaction block.
 +                               */
 +                              if (IS_PGXC_LOCAL_COORDINATOR)
 +                                      ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote,
 +                                                      true, EXEC_ON_ALL_NODES, false);
 +#endif
                                break;
  
                        case T_ViewStmt:        /* CREATE VIEW */
                                EventTriggerAlterTableStart(parsetree);
-                               address = DefineView((ViewStmt *) parsetree, queryString);
+                               address = DefineView((ViewStmt *) parsetree, queryString,
+                                                                        pstmt->stmt_location, pstmt->stmt_len);
 +#ifdef PGXC
 +                              if (IS_PGXC_LOCAL_COORDINATOR)
 +                              {
 +                                      ViewStmt *stmt = (ViewStmt *) parsetree;
 +
 +                                      if (stmt->view->relpersistence != RELPERSISTENCE_TEMP)
 +                                                      ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_COORDS, false);
 +                              }
 +#endif
                                EventTriggerCollectSimpleCommand(address, secondaryObject,
                                                                                                 parsetree);
                                /* stashed internally */
                                break;
  
                        case T_CreateFunctionStmt:      /* CREATE FUNCTION */
-                               address = CreateFunction((CreateFunctionStmt *) parsetree, queryString);
+                               address = CreateFunction(pstate, (CreateFunctionStmt *) parsetree);
 +#ifdef PGXC
 +                              if (IS_PGXC_LOCAL_COORDINATOR)
 +                                      ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
 +#endif
                                break;
  
                        case T_AlterFunctionStmt:       /* ALTER FUNCTION */
-                               address = AlterFunction((AlterFunctionStmt *) parsetree);
+                               address = AlterFunction(pstate, (AlterFunctionStmt *) parsetree);
 +#ifdef PGXC
 +                              if (IS_PGXC_LOCAL_COORDINATOR)
 +                                      ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
 +#endif
                                break;
  
                        case T_RuleStmt:        /* CREATE RULE */
                                break;
  
                        case T_CreateSeqStmt:
-                               address = DefineSequence((CreateSeqStmt *) parsetree);
+                               address = DefineSequence(pstate, (CreateSeqStmt *) parsetree);
 +#ifdef PGXC
 +                              if (IS_PGXC_LOCAL_COORDINATOR)
 +                              {
 +                                      CreateSeqStmt *stmt = (CreateSeqStmt *) parsetree;
 +
 +                                      /* In case this query is related to a SERIAL execution, just bypass */
 +                                      if (!stmt->is_serial)
 +                                      {
 +                                              bool is_temp = stmt->sequence->relpersistence == RELPERSISTENCE_TEMP;
 +                                              ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, is_temp);
 +                                      }
 +                              }
 +#endif
                                break;
  
                        case T_AlterSeqStmt:
-                               address = AlterSequence((AlterSeqStmt *) parsetree);
+                               address = AlterSequence(pstate, (AlterSeqStmt *) parsetree);
 +#ifdef PGXC
 +                              if (IS_PGXC_LOCAL_COORDINATOR)
 +                              {
 +                                      AlterSeqStmt *stmt = (AlterSeqStmt *) parsetree;
 +
 +                                      /* In case this query is related to a SERIAL execution, just bypass */
 +                                      if (!stmt->is_serial)
 +                                      {
 +                                              bool              is_temp;
 +                                              RemoteQueryExecType exec_type;
 +                                              Oid                                     relid = RangeVarGetRelid(stmt->sequence, NoLock, true);
 +
 +                                              if (!OidIsValid(relid))
 +                                                      break;
 +
 +                                              exec_type = ExecUtilityFindNodes(OBJECT_SEQUENCE,
 +                                                              relid,
 +                                                              &is_temp);
 +
 +                                              ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, exec_type, is_temp);
 +                                      }
 +                              }
 +#endif
                                break;
  
                        case T_CreateTableAsStmt:
                                address = ExecCreateTableAs((CreateTableAsStmt *) parsetree,
-                                                                                queryString, params, completionTag);
+                                                                                       queryString, params, queryEnv,
+                                                                                       completionTag);
 +#ifdef PGXC
 +                              if ((IS_PGXC_COORDINATOR) && !IsConnFromCoord())
 +                              {
 +                                      CreateTableAsStmt *stmt = (CreateTableAsStmt *) parsetree;
 +
 +                                      /*
 +                                       * CTAS for normal tables should have been rewritten as a
 +                                       * CREATE TABLE + SELECT INTO
 +                                       */
 +                                      Assert(stmt->relkind == OBJECT_MATVIEW);
 +                                      if (stmt->into->rel->relpersistence != RELPERSISTENCE_TEMP)
 +                                                      ExecUtilityStmtOnNodes(queryString, NULL,
 +                                                                      sentToRemote, false, EXEC_ON_COORDS, false);
 +                              }
 +#endif
                                break;
  
                        case T_RefreshMatViewStmt:
                                break;
  
                        case T_AlterTSConfigurationStmt:
-                               address = AlterTSConfiguration((AlterTSConfigurationStmt *) parsetree);
+                               AlterTSConfiguration((AlterTSConfigurationStmt *) parsetree);
 +#ifdef PGXC
 +                              if (IS_PGXC_LOCAL_COORDINATOR)
 +                                      ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
 +#endif
+                               /*
+                                * Commands are stashed in MakeConfigurationMapping and
+                                * DropConfigurationMapping, which are called from
+                                * AlterTSConfiguration
+                                */
+                               commandCollected = true;
                                break;
  
                        case T_AlterTableMoveAllStmt:
                                break;
  
                        case T_AlterDefaultPrivilegesStmt:
-                               ExecAlterDefaultPrivilegesStmt((AlterDefaultPrivilegesStmt *) parsetree);
+                               ExecAlterDefaultPrivilegesStmt(pstate, (AlterDefaultPrivilegesStmt *) parsetree);
 +
 +#ifdef PGXC
 +                              if (IS_PGXC_LOCAL_COORDINATOR)
 +                                      ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
 +#endif
                                EventTriggerCollectAlterDefPrivs((AlterDefaultPrivilegesStmt *) parsetree);
                                commandCollected = true;
                                break;
  
                        case T_CreateAmStmt:
                                address = CreateAccessMethod((CreateAmStmt *) parsetree);
 +#ifdef PGXC
 +                              if (IS_PGXC_LOCAL_COORDINATOR)
 +                                      ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
 +#endif
                                break;
  
+                       case T_CreatePublicationStmt:
+                               address = CreatePublication((CreatePublicationStmt *) parsetree);
+                               break;
+                       case T_AlterPublicationStmt:
+                               AlterPublication((AlterPublicationStmt *) parsetree);
+                               /*
+                                * AlterPublication calls EventTriggerCollectSimpleCommand
+                                * directly
+                                */
+                               commandCollected = true;
+                               break;
+                       case T_CreateSubscriptionStmt:
+                               address = CreateSubscription((CreateSubscriptionStmt *) parsetree,
+                                                                                        isTopLevel);
+                               break;
+                       case T_AlterSubscriptionStmt:
+                               address = AlterSubscription((AlterSubscriptionStmt *) parsetree);
+                               break;
+                       case T_DropSubscriptionStmt:
+                               DropSubscription((DropSubscriptionStmt *) parsetree, isTopLevel);
+                               /* no commands stashed for DROP */
+                               commandCollected = true;
+                               break;
+                       case T_CreateStatsStmt:
+                               address = CreateStatistics((CreateStatsStmt *) parsetree);
+                               break;
+                       case T_AlterCollationStmt:
+                               address = AlterCollation((AlterCollationStmt *) parsetree);
+                               break;
                        default:
                                elog(ERROR, "unrecognized node type: %d",
                                         (int) nodeTag(parsetree));
@@@ -4219,548 -3360,3 +4383,548 @@@ GetCommandLogLevel(Node *parsetree
  
        return lev;
  }
-       if (stmt->objtype == OBJECT_DATABASE && list_length(stmt->objname) == 1)
 +
 +#ifdef PGXC
 +
 +/*
 + * ExecUtilityWithMessage:
 + * Execute the query on remote nodes in a transaction block.
 + * If this fails on one of the nodes :
 + *            Add a context message containing the failed node names.
 + *            Rethrow the error with the message about the failed nodes.
 + * If all are successful, just return.
 + */
 +      static void
 +ExecUtilityWithMessage(const char *queryString, bool sentToRemote, bool is_temp)
 +{
 +      PG_TRY();
 +      {
 +              ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, is_temp);
 +      }
 +      PG_CATCH();
 +      {
 +
 +              /*
 +               * Some nodes failed.  Attach, as error context, the list of
 +               * nodes on which the query did not succeed, then re-throw the
 +               * original error.
 +               *
 +               * NOTE(review): errcontext() is normally meaningful only inside
 +               * an ereport() invocation or an error-context callback --
 +               * confirm this standalone call actually decorates the error
 +               * re-thrown by PG_RE_THROW().
 +               */
 +              ExecNodes *coord_success_nodes = NULL;
 +              ExecNodes *data_success_nodes = NULL;
 +              char *msg_failed_nodes;
 +
 +              pgxc_all_success_nodes(&data_success_nodes, &coord_success_nodes, &msg_failed_nodes);
 +              if (msg_failed_nodes)
 +                      errcontext("%s", msg_failed_nodes);
 +              PG_RE_THROW();
 +      }
 +      PG_END_TRY();
 +
 +
 +}
 +
 +/*
 + * Execute a Utility statement on nodes, including Coordinators
 + * If the DDL is received from a remote Coordinator,
 + * it is not possible to push down DDL to Datanodes
 + * as it is taken in charge by the remote Coordinator.
 + *
 + * queryString      - deparsed text of the statement to ship remotely
 + * nodes            - explicit target node list, or NULL to let exec_type decide
 + * sentToRemote     - statement was already shipped; this call becomes a no-op
 + * force_autocommit - run the remote statement in autocommit mode
 + * exec_type        - which node classes (Datanodes/Coordinators) to target
 + * is_temp          - statement involves a temporary object; not referenced in
 + *                    this body (presumably kept for caller symmetry -- TODO
 + *                    confirm)
 + */
 +      static void
 +ExecUtilityStmtOnNodes(const char *queryString, ExecNodes *nodes, bool sentToRemote,
 +              bool force_autocommit, RemoteQueryExecType exec_type, bool is_temp)
 +{
 +      /* Return if query is launched on no nodes */
 +      if (exec_type == EXEC_ON_NONE)
 +              return;
 +
 +      /* Nothing to be done if this statement has been sent to the nodes */
 +      if (sentToRemote)
 +              return;
 +
 +      /* If no Datanodes defined, the query cannot be launched */
 +      if (NumDataNodes == 0)
 +              ereport(ERROR,
 +                              (errcode(ERRCODE_UNDEFINED_OBJECT),
 +                               errmsg("No Datanode defined in cluster"),
 +                               errhint("You need to define at least 1 Datanode with "
 +                                       "CREATE NODE.")));
 +
 +      /* Only a connection coming straight from a client ships DDL onward */
 +      if (!IsConnFromCoord())
 +      {
 +              RemoteQuery *step = makeNode(RemoteQuery);
 +              step->combine_type = COMBINE_TYPE_SAME;
 +              step->exec_nodes = nodes;
 +              step->sql_statement = pstrdup(queryString);
 +              step->force_autocommit = force_autocommit;
 +              step->exec_type = exec_type;
 +              ExecRemoteUtility(step);
 +              pfree(step->sql_statement);
 +              pfree(step);
 +      }
 +}
 +
 +/*
 + * ExecUtilityFindNodes
 + *
 + * Determine the list of nodes to launch query on.
 + * This depends on temporary nature of object and object type.
 + * Return also a flag indicating if relation is temporary.
 + *
 + * If object is a RULE, the object id sent is that of the object to which the
 + * rule is applicable.
 + */
 +      static RemoteQueryExecType
 +ExecUtilityFindNodes(ObjectType object_type,
 +              Oid object_id,
 +              bool *is_temp)
 +{
 +      RemoteQueryExecType exec_type;
 +
 +      switch (object_type)
 +      {
 +              case OBJECT_SEQUENCE:
 +                      *is_temp = IsTempTable(object_id);
 +                      exec_type = EXEC_ON_ALL_NODES;
 +                      break;
 +
 +              case OBJECT_TABLE:
 +                      /* Do the check on relation kind */
 +                      exec_type = ExecUtilityFindNodesRelkind(object_id, is_temp);
 +                      break;
 +
 +                      /*
 +                       * Views and rules, both permanent or temporary are created
 +                       * on Coordinators only.
 +                       */
 +              case OBJECT_RULE:
 +              case OBJECT_VIEW:
 +              case OBJECT_MATVIEW:
 +                      /* Check if object is a temporary view */
 +                      if ((*is_temp = IsTempTable(object_id)))
 +                              exec_type = EXEC_ON_NONE;
 +                      else
 +                              exec_type = EXEC_ON_COORDS;
 +                      break;
 +
 +              case OBJECT_INDEX:
 +                      /* Check if given index uses temporary tables */
 +                      {
 +                              Relation        rel;
 +                              bool            is_matview;
 +
 +                              rel = relation_open(object_id, NoLock);
 +                              
 +                              *is_temp = (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP);
 +                              is_matview = (rel->rd_rel->relkind == RELKIND_MATVIEW);
 +                              
 +                              relation_close(rel, NoLock);
 +
 +                              /*
 +                               * Decision matrix: temp table index -> Datanodes only;
 +                               * temp matview index -> nowhere; permanent table index
 +                               * -> all nodes; permanent matview index -> Coordinators
 +                               * (matviews are Coordinator-only, per the
 +                               * OBJECT_MATVIEW branch above).
 +                               */
 +                              exec_type = EXEC_ON_NONE;
 +                              if (*is_temp)
 +                              {
 +                                      if (!is_matview)
 +                                              exec_type = EXEC_ON_DATANODES;
 +                              }
 +                              else
 +                              {
 +                                      if (!is_matview)
 +                                              exec_type = EXEC_ON_ALL_NODES;
 +                                      else
 +                                              exec_type = EXEC_ON_COORDS;
 +                              }
 +                      }
 +                      break;
 +
 +              default:
 +                      *is_temp = false;
 +                      exec_type = EXEC_ON_ALL_NODES;
 +                      break;
 +      }
 +
 +      return exec_type;
 +}
 +
 +/*
 + * ExecUtilityFindNodesRelkind
 + *
 + * Get node execution and temporary type
 + * for given relation depending on its relkind.
 + * Sets *is_temp on return; indexes delegate to the relkind of their
 + * parent table (recursively).
 + */
 +static RemoteQueryExecType
 +ExecUtilityFindNodesRelkind(Oid relid, bool *is_temp)
 +{
 +      char relkind_str = get_rel_relkind(relid);
 +      RemoteQueryExecType exec_type;
 +
 +      switch (relkind_str)
 +      {
 +              case RELKIND_SEQUENCE:
 +              case RELKIND_RELATION:
 +                      /* Backend-local temp objects never leave this node */
 +                      if ((*is_temp = IsTempTable(relid)))
 +                      {
 +                              if (IsLocalTempTable(relid))
 +                                      exec_type = EXEC_ON_NONE;
 +                              else
 +                                      exec_type = EXEC_ON_DATANODES;
 +                      }
 +                      else
 +                              exec_type = EXEC_ON_ALL_NODES;
 +                      break;
 +
 +              case RELKIND_INDEX:
 +                      {
 +                              HeapTuple   tuple;
 +                              Oid table_relid = InvalidOid;
 +
 +                              tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(relid));
 +                              if (HeapTupleIsValid(tuple))
 +                              {
 +                                      Form_pg_index index = (Form_pg_index) GETSTRUCT(tuple);
 +                                      table_relid = index->indrelid;
 +
 +                                      /* Release system cache BEFORE looking at the parent table */
 +                                      ReleaseSysCache(tuple);
 +                                      return ExecUtilityFindNodesRelkind(table_relid, is_temp);
 +                              }
 +                              else
 +                              {
 +                                      /* No pg_index entry: treat as nothing to ship */
 +                                      exec_type = EXEC_ON_NONE;
 +                                      *is_temp = false;
 +                              }
 +                      }
 +                      break;
 +
 +              case RELKIND_VIEW:
 +                      if ((*is_temp = IsTempTable(relid)))
 +                              exec_type = EXEC_ON_NONE;
 +                      else
 +                              exec_type = EXEC_ON_COORDS;
 +                      break;
 +
 +              case RELKIND_MATVIEW:
 +                      /* Check if object is a temporary view */
 +                      if ((*is_temp = IsTempTable(relid)))
 +                              exec_type = EXEC_ON_NONE;
 +                      else
 +                              exec_type = EXEC_ON_COORDS;
 +                      break;
 +
 +              default:
 +                      *is_temp = false;
 +                      exec_type = EXEC_ON_ALL_NODES;
 +                      break;
 +      }
 +
 +      return exec_type;
 +}
 +#endif
 +
 +#ifdef PGXC
 +/*
 + * IsStmtAllowedInLockedMode
 + *
 + * Allow/Disallow a utility command while cluster is locked
 + * A statement will be disallowed if it makes such changes
 + * in catalog that are backed up by pg_dump except
 + * CREATE NODE that has to be allowed because
 + * a new node has to be created while the cluster is still
 + * locked for backup
 + *
 + * NOTE(review): queryString is accepted but not referenced in this body.
 + */
 +static bool
 +IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString)
 +{
 +#define ALLOW         1
 +#define DISALLOW      0
 +
 +      switch (nodeTag(parsetree))
 +      {
 +              /* To allow creation of temp tables */
 +              case T_CreateStmt:                                      /* CREATE TABLE */
 +                      {
 +                              CreateStmt *stmt = (CreateStmt *) parsetree;
 +                              if (stmt->relation->relpersistence == RELPERSISTENCE_TEMP)
 +                                      return ALLOW;
 +                              return DISALLOW;
 +                      }
 +                      break;
 +
 +              case T_ExecuteStmt:                                     /*
 +                                                                                                                               * Prepared statements can only have
 +                                                                                                                               * SELECT, INSERT, UPDATE, DELETE,
 +                                                                                                                               * or VALUES statement, there is no
 +                                                                                                                               * point stopping EXECUTE.
 +                                                                                                                               */
 +              case T_CreateNodeStmt:                          /*
 +                                                                                                               * This has to be allowed so that the new node
 +                                                                                                               * can be created, while the cluster is still
 +                                                                                                               * locked for backup
 +                                                                                                               */
 +              case T_DropNodeStmt:                            /*
 +                                                                                                               * This has to be allowed so that DROP NODE
 +                                                                                                               * can be issued to drop a node that has crashed.
 +                                                                                                               * Otherwise system would try to acquire a shared
 +                                                                                                               * advisory lock on the crashed node.
 +                                                                                                               */
 +
 +              case T_AlterNodeStmt:                                                   /*
 +                                                                                                               * This has to be
 +                                                                                                               * allowed so that
 +                                                                                                               * ALTER NODE can be
 +                                                                                                               * issued in case of a
 +                                                                                                               * datanode or
 +                                                                                                               * coordinator failover
 +                                                                                                               */
 +              case T_TransactionStmt:
 +              case T_PlannedStmt:
 +              case T_ClosePortalStmt:
 +              case T_FetchStmt:
 +              case T_TruncateStmt:
 +              case T_CopyStmt:
 +              case T_PrepareStmt:                                     /*
 +                                                                                                                               * Prepared statements can only have
 +                                                                                                                               * SELECT, INSERT, UPDATE, DELETE,
 +                                                                                                                               * or VALUES statement, there is no
 +                                                                                                                               * point stopping PREPARE.
 +                                                                                                                               */
 +              case T_DeallocateStmt:                          /*
 +                                                                                                               * If prepare is allowed the deallocate should
 +                                                                                                               * be allowed also
 +                                                                                                               */
 +              case T_DoStmt:
 +              case T_NotifyStmt:
 +              case T_ListenStmt:
 +              case T_UnlistenStmt:
 +              case T_LoadStmt:
 +              case T_ClusterStmt:
 +              case T_VacuumStmt:
 +              case T_ExplainStmt:
 +              case T_VariableSetStmt:
 +              case T_VariableShowStmt:
 +              case T_DiscardStmt:
 +              case T_LockStmt:
 +              case T_ConstraintsSetStmt:
 +              case T_CheckPointStmt:
 +              case T_BarrierStmt:
 +              case T_ReindexStmt:
 +              case T_RemoteQuery:
 +              case T_CleanConnStmt:
 +#ifdef XCP
 +              case T_PauseClusterStmt:
 +#endif
 +                      return ALLOW;
 +
 +              default:
 +                      return DISALLOW;
 +      }
 +      return DISALLOW;
 +}
 +
 +/*
 + * GetCommentObjectId
 + * TODO Change to return the nodes to execute the utility on
 + *
 + * Return Object ID of object commented
 + * Note: This function uses portions of the code of CommentObject,
 + * even if this code is duplicated this is done like this to facilitate
 + * merges with PostgreSQL head.
 + */
 +static RemoteQueryExecType
 +GetNodesForCommentUtility(CommentStmt *stmt, bool *is_temp)
 +{
 +      ObjectAddress           address;
 +      Relation                        relation;
 +      RemoteQueryExecType     exec_type = EXEC_ON_ALL_NODES;  /* By default execute on all nodes */
 +      Oid                                     object_id;
 +
-               char       *database = strVal(linitial(stmt->objname));
++      if (stmt->objtype == OBJECT_DATABASE)
 +      {
-       address = get_object_address(stmt->objtype, stmt->objname, stmt->objargs,
++              char       *database = strVal((Value *) stmt->object);
 +              if (!OidIsValid(get_database_oid(database, true)))
 +                      ereport(WARNING,
 +                                      (errcode(ERRCODE_UNDEFINED_DATABASE),
 +                                       errmsg("database \"%s\" does not exist", database)));
 +              /* No clue, return the default one */
 +              return exec_type;
 +      }
 +
-                       char *rulename = strVal(llast(stmt->objname));
++      address = get_object_address(stmt->objtype, stmt->object,
 +                      &relation, ShareUpdateExclusiveLock, false);
 +      object_id = address.objectId;
 +
 +      /*
 +       * If the object being commented is a rule, the nodes are decided by the
 +       * object to which rule is applicable, so get the that object's oid
 +       */
 +      if (stmt->objtype == OBJECT_RULE)
 +      {
 +              if (!relation && !OidIsValid(relation->rd_id))
 +              {
 +                      /* This should not happen, but prepare for the worst */
-                                               objname, NIL,
++                      char *rulename = strVal(llast(castNode(List, stmt->object)));
 +                      ereport(WARNING,
 +                                      (errcode(ERRCODE_UNDEFINED_OBJECT),
 +                                       errmsg("can not find relation for rule \"%s\" does not exist", rulename)));
 +                      object_id = InvalidOid;
 +              }
 +              else
 +                      object_id = RelationGetRelid(relation);
 +      }
 +
 +      if (relation != NULL)
 +              relation_close(relation, NoLock);
 +
 +      /* Commented object may not have a valid object ID, so move to default */
 +      if (OidIsValid(object_id))
 +              exec_type = ExecUtilityFindNodes(stmt->objtype,
 +                              object_id,
 +                              is_temp);
 +      return exec_type;
 +}
 +
 +/*
 + * GetNodesForRulesUtility
 + * Get the nodes to execute this RULE related utility statement.
 + * A rule is expanded on Coordinator itself, and does not need any
 + * existence on Datanode. In fact, if it were to exist on Datanode,
 + * there is a possibility that it would expand again.
 + *
 + * On return *is_temp reports whether the relation the rule is attached
 + * to is temporary (set by ExecUtilityFindNodes).
 + */
 +static RemoteQueryExecType
 +GetNodesForRulesUtility(RangeVar *relation, bool *is_temp)
 +{
 +      /* missing_ok = true: yields InvalidOid instead of erroring out */
 +      Oid relid = RangeVarGetRelid(relation, NoLock, true);
 +      RemoteQueryExecType exec_type;
 +
 +      /* Skip if this Oid does not exist */
 +      if (!OidIsValid(relid))
 +              return EXEC_ON_NONE;
 +
 +      /*
 +       * PGXCTODO: See if it's a temporary object, do we really need
 +       * to care about temporary objects here? What about the
 +       * temporary objects defined inside the rule?
 +       */
 +      exec_type = ExecUtilityFindNodes(OBJECT_RULE, relid, is_temp);
 +      return exec_type;
 +}
 +
 +/*
 + * DropStmtPreTreatment
 + * Do a pre-treatment of Drop statement on a local Coordinator.
 + * Computes the target node set (*exec_type) and temporary-ness
 + * (*is_temp) of the objects being dropped, erroring out if TEMP and
 + * non-TEMP objects are mixed in one DROP.
 + *
 + * NOTE(review): queryString and sentToRemote are not referenced in this
 + * body.  On the early-return path (Datanode, or connection from another
 + * Coordinator) *is_temp and *exec_type are left untouched -- callers
 + * presumably pre-initialize them; confirm.
 + */
 +static void
 +DropStmtPreTreatment(DropStmt *stmt, const char *queryString, bool sentToRemote,
 +              bool *is_temp, RemoteQueryExecType *exec_type)
 +{
 +      bool            res_is_temp = false;
 +      RemoteQueryExecType res_exec_type = EXEC_ON_ALL_NODES;
 +
 +      /* Nothing to do if not local Coordinator */
 +      if (IS_PGXC_DATANODE || IsConnFromCoord())
 +              return;
 +
 +      switch (stmt->removeType)
 +      {
 +              case OBJECT_TABLE:
 +              case OBJECT_SEQUENCE:
 +              case OBJECT_VIEW:
 +              case OBJECT_INDEX:
 +              case OBJECT_MATVIEW:
 +                      {
 +                              /*
 +                               * Check the list of objects going to be dropped.
 +                               * XC does not allow yet to mix drop of temporary and
 +                               * non-temporary objects because this involves to rewrite
 +                               * query to process for tables.
 +                               */
 +                              ListCell   *cell;
 +                              bool            is_first = true;
 +
 +                              foreach(cell, stmt->objects)
 +                              {
 +                                      RangeVar   *rel = makeRangeVarFromNameList((List *) lfirst(cell));
 +                                      Oid         relid;
 +
 +                                      /*
 +                                       * Do not print result at all, error is thrown
 +                                       * after if necessary
 +                                       */
 +                                      relid = RangeVarGetRelid(rel, NoLock, true);
 +
 +                                      /*
 +                                       * In case this relation ID is incorrect throw
 +                                       * a correct DROP error.
 +                                       */
 +                                      if (!OidIsValid(relid) && !stmt->missing_ok)
 +                                              DropTableThrowErrorExternal(rel,
 +                                                              stmt->removeType,
 +                                                              stmt->missing_ok);
 +
 +                                      /* In case of DROP ... IF EXISTS bypass */
 +                                      if (!OidIsValid(relid) && stmt->missing_ok)
 +                                              continue;
 +
 +                                      /* First resolvable object fixes the expected node set */
 +                                      if (is_first)
 +                                      {
 +                                              res_exec_type = ExecUtilityFindNodes(stmt->removeType,
 +                                                              relid,
 +                                                              &res_is_temp);
 +                                              is_first = false;
 +                                      }
 +                                      else
 +                                      {
 +                                              /* Every later object must agree with the first one */
 +                                              RemoteQueryExecType exec_type_loc;
 +                                              bool is_temp_loc;
 +                                              exec_type_loc = ExecUtilityFindNodes(stmt->removeType,
 +                                                              relid,
 +                                                              &is_temp_loc);
 +                                              if (exec_type_loc != res_exec_type ||
 +                                                              is_temp_loc != res_is_temp)
 +                                                      ereport(ERROR,
 +                                                                      (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 +                                                                       errmsg("DROP not supported for TEMP and non-TEMP objects"),
 +                                                                       errdetail("You should separate TEMP and non-TEMP objects")));
 +                                      }
 +                              }
 +                      }
 +                      break;
 +
 +              case OBJECT_RULE:
 +                      {
 +                              /*
 +                               * In the case of a rule we need to find the object on
 +                               * which the rule is dependent and define if this rule
 +                               * has a dependency with a temporary object or not.
 +                               */
 +                              List *objname = linitial(stmt->objects);
 +                              Relation    relation = NULL;
 +
 +                              get_object_address(OBJECT_RULE,
++                                              objname, /* XXX PG10MERGE: check if this is ok */
 +                                              &relation,
 +                                              AccessExclusiveLock,
 +                                              stmt->missing_ok);
 +
 +                              /* Do nothing if no relation */
 +                              if (relation && OidIsValid(relation->rd_id))
 +                                      res_exec_type = ExecUtilityFindNodes(OBJECT_RULE,
 +                                                      relation->rd_id,
 +                                                      &res_is_temp);
 +                              else
 +                                      res_exec_type = EXEC_ON_NONE;
 +
 +                              /* Close relation if necessary */
 +                              if (relation)
 +                                      relation_close(relation, NoLock);
 +                      }
 +                      break;
 +
 +              default:
 +                      res_is_temp = false;
 +                      res_exec_type = EXEC_ON_ALL_NODES;
 +                      break;
 +      }
 +
 +      /* Save results */
 +      *is_temp = res_is_temp;
 +      *exec_type = res_exec_type;
 +}
 +#endif
index 8ee878e1128ec35afd0c51f66e9fc49da443f6a5,d9c8aa569c9198c09a769fc7bf30c79d3b5b68a0..2c21a43d73f784ff02365eb1997893e65c54b751
@@@ -3,8 -3,7 +3,8 @@@
   * arrayfuncs.c
   *      Support functions for arrays.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
index ff01b5f702efb1ba9c0549f6fa98c7e8532f2c12,76ab9496e2ebe3700563679856a1f06e93062853..baf2957c1a0ccba7ac2c84ceb56e8ae6bc5f669f
@@@ -3,8 -3,7 +3,8 @@@
   * date.c
   *      implements DATE and TIME data types specified in SQL standard
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994-5, Regents of the University of California
   *
   *
index 770198fdb460d3aa501f22210a681ebe7e1700c5,f0725860b4bfada10d898d20f8621efd39914265..c1446bb2a31c97b9e90906224804cc62fc7be494
@@@ -2,8 -2,7 +2,8 @@@
   * dbsize.c
   *            Database object size functions, and related inquiries
   *
-  * Copyright (c) 2002-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Copyright (c) 2002-2017, PostgreSQL Global Development Group
   *
   * IDENTIFICATION
   *      src/backend/utils/adt/dbsize.c
@@@ -19,7 -17,7 +18,8 @@@
  #include "access/htup_details.h"
  #include "catalog/catalog.h"
  #include "catalog/namespace.h"
+ #include "catalog/pg_authid.h"
 +#include "catalog/pg_namespace.h"
  #include "catalog/pg_tablespace.h"
  #include "commands/dbcommands.h"
  #include "commands/tablespace.h"
@@@ -150,14 -124,8 +156,14 @@@ calculate_database_size(Oid dbOid
                        strcmp(direntry->d_name, "..") == 0)
                        continue;
  
-               snprintf(pathname, MAXPGPATH, "pg_tblspc/%s/%s_%s/%u",
 +#ifdef PGXC
 +              /* Postgres-XC tablespaces include node name in path */
-               snprintf(pathname, MAXPGPATH, "pg_tblspc/%s/%s/%u",
++              snprintf(pathname, sizeof(pathname), "pg_tblspc/%s/%s_%s/%u",
 +                               direntry->d_name, TABLESPACE_VERSION_DIRECTORY, PGXCNodeName, dbOid);
 +#else
+               snprintf(pathname, sizeof(pathname), "pg_tblspc/%s/%s/%u",
                                 direntry->d_name, TABLESPACE_VERSION_DIRECTORY, dbOid);
 +#endif
                totalsize += db_dir_size(pathname);
        }
  
Simple merge
Simple merge
index 72d0e7ee4f4db8110e305ffd3e6fc98750d90873,9cc0b08e969b9950154243fdd22bdf9efa722cf0..af2fa19521f1b6da0bcc20a7b7d2c4fb5c292c63
  #include "utils/acl.h"
  #include "utils/builtins.h"
  #include "utils/timestamp.h"
 +#ifdef PGXC
 +#include "pgxc/pgxc.h"
 +#endif
  
- #define atooid(x)  ((Oid) strtoul((x), NULL, 10))
  
  /*
   * Common subroutine for num_nulls() and num_nonnulls().
index 94ee7e2d037b48217c756ce4e295191620f99dc1,be793539a3c84e60ff60e21bf07280403fa94f80..85fdfc9fb3471d6ed35c9b961a72dcf6c5e40a0c
@@@ -11,8 -11,7 +11,8 @@@
   * we do better?)
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@@ -124,18 -89,11 +96,18 @@@ cstring_send(PG_FUNCTION_ARGS
  Datum
  anyarray_in(PG_FUNCTION_ARGS)
  {
 +#ifdef XCP
 +      /*
 +       * XCP version of array_in() understands prefix describing element type
 +       */
 +      return array_in(fcinfo);
 +#else
        ereport(ERROR,
                        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                        errmsg("cannot accept a value of type anyarray")));
+                        errmsg("cannot accept a value of type %s", "anyarray")));
  
        PG_RETURN_VOID();                       /* keep compiler quiet */
 +#endif
  }
  
  /*
index eeec525fdaa03c24e8876d58216fde091d186f9c,37139f9647b465fa412bb734ecbe800a3c2a87c5..ee2c56bb2e0443389189406b0340b91f7452f629
@@@ -13,8 -13,7 +13,8 @@@
   *    plan --- consider improving this someday.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   *
   * src/backend/utils/adt/ri_triggers.c
   *
Simple merge
index 66e7553e5146f7a8a96dd72e5afc100b2589cb9e,824d7572faf43139b12e68fd2c8981290226d75b..2820dbe46513dbd77a44fecd3c12412cab0f55c6
@@@ -4,8 -4,7 +4,8 @@@
   *      Functions to convert stored expressions/querytrees back to
   *      source text
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@@ -6312,181 -6353,6 +6689,180 @@@ get_utility_query_def(Query *query, dep
                        simple_quote_literal(buf, stmt->payload);
                }
        }
-                               Type type;
 +#ifdef PGXC
 +      else if (query->utilityStmt && IsA(query->utilityStmt, CreateStmt))
 +      {
 +              CreateStmt *stmt = (CreateStmt *) query->utilityStmt;
 +              ListCell   *column;
 +              const char *delimiter = "";
 +              RangeVar   *relation = stmt->relation;
 +              bool            istemp = (relation->relpersistence == RELPERSISTENCE_TEMP);
 +              bool            isunlogged = (relation->relpersistence == RELPERSISTENCE_UNLOGGED);
 +
 +              appendStringInfo(buf, "CREATE %s %s %s TABLE %s ",
 +                              stmt->islocal ? "LOCAL" : "",
 +                              istemp ? "TEMP" : "",
 +                              isunlogged ? "UNLOGGED" : "",
 +                              stmt->if_not_exists ? "IF NOT EXISTS " : "");
 +
 +              if (!istemp && relation->schemaname && relation->schemaname[0])
 +                      appendStringInfo(buf, "%s.", quote_identifier(relation->schemaname));
 +              appendStringInfo(buf, "%s", quote_identifier(relation->relname));
 +
 +              appendStringInfo(buf, "(");
 +              foreach(column, stmt->tableElts)
 +              {
 +                      Node *node = (Node *) lfirst(column);
 +
 +                      appendStringInfo(buf, "%s", delimiter);
 +                      delimiter = ", ";
 +
 +                      if (IsA(node, ColumnDef))
 +                      {
 +                              ColumnDef *coldef = (ColumnDef *) node;
 +                              TypeName *typename = coldef->typeName;
 +#ifdef XCP
 +                              appendStringInfo(buf, "%s %s",
 +                                                               quote_identifier(coldef->colname),
 +                                                               format_type_with_typemod(typename->typeOid,
 +                                                                                                                typename->typemod));
 +#else
 +
 +                              /* error out if we have no recourse at all */
 +                              if (!OidIsValid(typename->typeOid))
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                       errmsg("improper type oid: \"%u\"", typename->typeOid)));
 +
 +                              /* get typename from the oid */
 +                              type = typeidType(typename->typeOid);
 +
 +                              if (!HeapTupleIsValid(type))
 +                                      ereport(ERROR,
 +                                                      (errcode(ERRCODE_UNDEFINED_OBJECT),
 +                                                       errmsg("type \"%u\" does not exist",
 +                                                               typename->typeOid)));
 +                              appendStringInfo(buf, "%s %s", quote_identifier(coldef->colname),
 +                                              typeTypeName(type));
 +                              ReleaseSysCache(type);
 +#endif
 +                      }
 +                      else
 +                              elog(ERROR, "Invalid table column definition.");
 +              }
 +              appendStringInfo(buf, ")");
 +
 +              /* Append storage parameters, like for instance WITH (OIDS) */
 +              if (list_length(stmt->options) > 0)
 +              {
 +                      Datum        reloptions;
 +                      static char *validnsps[] = HEAP_RELOPT_NAMESPACES;
 +
 +                      reloptions = transformRelOptions((Datum) 0, stmt->options, NULL, validnsps,
 +                                                                               false, false);
 +
 +                      if (reloptions)
 +                      {
 +                              Datum   sep, txt;
 +                              /* Below is inspired from flatten_reloptions() */
 +                              sep = CStringGetTextDatum(", ");
 +                              txt = OidFunctionCall2(F_ARRAY_TO_TEXT, reloptions, sep);
 +                              appendStringInfo(buf, " WITH (%s)", TextDatumGetCString(txt));
 +                      }
 +              }
 +
 +              /* add the on commit clauses for temporary tables */
 +              switch (stmt->oncommit)
 +              {
 +                      case ONCOMMIT_NOOP:
 +                              /* do nothing */
 +                              break;
 +
 +                      case ONCOMMIT_PRESERVE_ROWS:
 +                              appendStringInfo(buf, " ON COMMIT PRESERVE ROWS");
 +                              break;
 +
 +                      case ONCOMMIT_DELETE_ROWS:
 +                              appendStringInfo(buf, " ON COMMIT DELETE ROWS");
 +                              break;
 +
 +                      case ONCOMMIT_DROP:
 +                              appendStringInfo(buf, " ON COMMIT DROP");
 +                              break;
 +              }
 +
 +              if (stmt->distributeby)
 +              {
 +                      /* add the on commit clauses for temporary tables */
 +                      switch (stmt->distributeby->disttype)
 +                      {
 +                              case DISTTYPE_REPLICATION:
 +                                      appendStringInfo(buf, " DISTRIBUTE BY REPLICATION");
 +                                      break;
 +
 +                              case DISTTYPE_HASH:
 +                                      appendStringInfo(buf, " DISTRIBUTE BY HASH(%s)", stmt->distributeby->colname);
 +                                      break;
 +
 +                              case DISTTYPE_ROUNDROBIN:
 +                                      appendStringInfo(buf, " DISTRIBUTE BY ROUNDROBIN");
 +                                      break;
 +
 +                              case DISTTYPE_MODULO:
 +                                      appendStringInfo(buf, " DISTRIBUTE BY MODULO(%s)",
 +                                                      quote_identifier(stmt->distributeby->colname));
 +                                      break;
 +
 +                              default:
 +                                      ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
 +                                                              errmsg("Invalid distribution type")));
 +
 +                      }
 +              }
 +
 +              if (stmt->subcluster)
 +              {
 +                      ListCell   *cell;
 +
 +                      switch (stmt->subcluster->clustertype)
 +                      {
 +                              case SUBCLUSTER_NODE:
 +                                      appendStringInfo(buf, " TO NODE (");
 +
 +                                      /* Add node members */
 +                                      Assert(stmt->subcluster->members);
 +                                      foreach(cell, stmt->subcluster->members)
 +                                      {
 +                                              appendStringInfo(buf, " %s",
 +                                                              quote_identifier(strVal(lfirst(cell))));
 +                                              if (cell->next)
 +                                                      appendStringInfo(buf, ",");
 +                                      }
 +                                      appendStringInfo(buf, ")");
 +                                      break;
 +
 +                              case SUBCLUSTER_GROUP:
 +                                      appendStringInfo(buf, " TO GROUP");
 +
 +                                      /* Add group members */
 +                                      Assert(stmt->subcluster->members);
 +                                      foreach(cell, stmt->subcluster->members)
 +                                      {
 +                                              appendStringInfo(buf, " %s",
 +                                                              quote_identifier(strVal(lfirst(cell))));
 +                                              if (cell->next)
 +                                                      appendStringInfo(buf, ",");
 +                                      }
 +                                      break;
 +
 +                              case SUBCLUSTER_NONE:
 +                              default:
 +                                      /* Nothing to do */
 +                                      break;
 +                      }
 +              }
 +      }
 +#endif
        else
        {
                /* Currently only NOTIFY utility commands can appear in rules */
Simple merge
Simple merge
Simple merge
index 2095a9dfe2c14ade02c3d857c5108c8970980071,5bdc8fad43307b9a1aa1f16cdf5381787ee089ba..180032877ad28671c84d0045cedc5865ee608698
@@@ -3,8 -3,7 +3,8 @@@
   * version.c
   *     Returns the PostgreSQL version string
   *
-  * Copyright (c) 1998-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Copyright (c) 1998-2017, PostgreSQL Global Development Group
   *
   * IDENTIFICATION
   *
index 99b6deb9cf8013336e4daec22d5a5f9f709ad6fb,819121638ea3e28e3a91e73e828c935ab48fe615..9daaf75d888420fc8b0df7831551d958441aed7b
@@@ -85,8 -85,7 +85,8 @@@
   *    problems can be overcome cheaply.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
index d675081ed5bde06095f6d2c8a8b7d57553c12bb8,4def73ddfbe7ee0c7f407361896cb2a53c7bde48..7384b1971a4c74ea75b6055bcf92f5fe045241ae
@@@ -3,8 -3,7 +3,8 @@@
   * lsyscache.c
   *      Convenience routines for common queries in the system catalog cache.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
@@@ -2362,275 -2316,6 +2394,293 @@@ getBaseTypeAndTypmod(Oid typid, int32 *
        return typid;
  }
  
 +#ifdef PGXC
 +/*
 + * get_typename
 + *            Get type name for given type ID
 + */
 +char *
 +get_typename(Oid typid)
 +{
 +      HeapTuple               tuple;
 +      Form_pg_type    typeForm;
 +      char               *result;
 +
 +      tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typid));
 +
 +      if (!HeapTupleIsValid(tuple))
 +                      elog(ERROR, "cache lookup failed for type %u", typid);
 +
 +      typeForm = (Form_pg_type) GETSTRUCT(tuple);
 +      result = pstrdup(NameStr(typeForm->typname));
 +      ReleaseSysCache(tuple);
 +
 +      return result;
 +}
 +
 +/*
 + * get_pgxc_nodeoid
 + *            Obtain PGXC Node Oid for given node name
 + *            Return Invalid Oid if object does not exist
 + */
 +Oid
 +get_pgxc_nodeoid(const char *nodename)
 +{
 +      return GetSysCacheOid1(PGXCNODENAME,
 +                                                 PointerGetDatum(nodename));
 +}
 +
 +/*
 + * get_pgxc_nodename
 + *            Get node name for given Oid
 + */
 +char *
 +get_pgxc_nodename(Oid nodeid)
 +{
 +      HeapTuple               tuple;
 +      Form_pgxc_node  nodeForm;
 +      char               *result;
 +
 +      tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
 +
 +      if (!HeapTupleIsValid(tuple))
 +                      elog(ERROR, "cache lookup failed for node %u", nodeid);
 +
 +      nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
 +      result = pstrdup(NameStr(nodeForm->node_name));
 +      ReleaseSysCache(tuple);
 +
 +      return result;
 +}
 +
 + /*
 + * get_pgxc_node_id
 + *            Get node identifier for a given Oid
 + */
 +uint32
 +get_pgxc_node_id(Oid nodeid)
 +{
 +      HeapTuple       tuple;
 +      Form_pgxc_node  nodeForm;
 +      uint32          result;
 +
 +      if (nodeid == InvalidOid)
 +              return 0;
 +
 +      tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
 +
 +      if (!HeapTupleIsValid(tuple))
 +                      elog(ERROR, "cache lookup failed for node %u", nodeid);
 +
 +      nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
 +      result = nodeForm->node_id;
 +      ReleaseSysCache(tuple);
 +
 +      return result;
 +}
 +
 +/*
 + * get_pgxc_nodetype
 + *            Get node type for given Oid
 + */
 +char
 +get_pgxc_nodetype(Oid nodeid)
 +{
 +      HeapTuple               tuple;
 +      Form_pgxc_node  nodeForm;
 +      char                    result;
 +
 +      tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
 +
 +      if (!HeapTupleIsValid(tuple))
 +                      elog(ERROR, "cache lookup failed for node %u", nodeid);
 +
 +      nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
 +      result = nodeForm->node_type;
 +      ReleaseSysCache(tuple);
 +
 +      return result;
 +}
 +
 +/*
 + * get_pgxc_nodeport
 + *            Get node port for given Oid
 + */
 +int
 +get_pgxc_nodeport(Oid nodeid)
 +{
 +      HeapTuple               tuple;
 +      Form_pgxc_node  nodeForm;
 +      int                             result;
 +
 +      tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
 +
 +      if (!HeapTupleIsValid(tuple))
 +                      elog(ERROR, "cache lookup failed for node %u", nodeid);
 +
 +      nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
 +      result = nodeForm->node_port;
 +      ReleaseSysCache(tuple);
 +
 +      return result;
 +}
 +
 +/*
 + * get_pgxc_nodehost
 + *            Get node host for given Oid
 + */
 +char *
 +get_pgxc_nodehost(Oid nodeid)
 +{
 +      HeapTuple               tuple;
 +      Form_pgxc_node  nodeForm;
 +      char               *result;
 +
 +      tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
 +
 +      if (!HeapTupleIsValid(tuple))
 +                      elog(ERROR, "cache lookup failed for node %u", nodeid);
 +
 +      nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
 +      result = pstrdup(NameStr(nodeForm->node_host));
 +      ReleaseSysCache(tuple);
 +
 +      return result;
 +}
 +
 +/*
 + * is_pgxc_nodepreferred
 + *            Determine if node is a preferred one
 + */
 +bool
 +is_pgxc_nodepreferred(Oid nodeid)
 +{
 +      HeapTuple               tuple;
 +      Form_pgxc_node  nodeForm;
 +      bool                    result;
 +
 +      tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
 +
 +      if (!HeapTupleIsValid(tuple))
 +                      elog(ERROR, "cache lookup failed for node %u", nodeid);
 +
 +      nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
 +      result = nodeForm->nodeis_preferred;
 +      ReleaseSysCache(tuple);
 +
 +      return result;
 +}
 +
 +/*
 + * is_pgxc_nodeprimary
 + *            Determine if node is a primary one
 + */
 +bool
 +is_pgxc_nodeprimary(Oid nodeid)
 +{
 +      HeapTuple               tuple;
 +      Form_pgxc_node  nodeForm;
 +      bool                    result;
 +
 +      tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
 +
 +      if (!HeapTupleIsValid(tuple))
 +                      elog(ERROR, "cache lookup failed for node %u", nodeid);
 +
 +      nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
 +      result = nodeForm->nodeis_primary;
 +      ReleaseSysCache(tuple);
 +
 +      return result;
 +}
 +
 +/*
 + * get_pgxc_groupoid
 + *            Obtain PGXC Group Oid for given group name
 + *            Return Invalid Oid if group does not exist
 + */
 +Oid
 +get_pgxc_groupoid(const char *groupname)
 +{
 +      return GetSysCacheOid1(PGXCGROUPNAME,
 +                                                 PointerGetDatum(groupname));
 +}
 +
 +/*
 + * get_pgxc_groupmembers
 + *            Obtain PGXC Group members for given group Oid
 + *            Return number of members and their list
 + *
 + * Member list is returned as a palloc'd array
 + */
 +int
 +get_pgxc_groupmembers(Oid groupid, Oid **members)
 +{
 +      HeapTuple               tuple;
 +      Form_pgxc_group         groupForm;
 +      int                     nmembers;
 +
 +      tuple = SearchSysCache1(PGXCGROUPOID, ObjectIdGetDatum(groupid));
 +
 +      if (!HeapTupleIsValid(tuple))
 +                      elog(ERROR, "cache lookup failed for group %u", groupid);
 +
 +      groupForm = (Form_pgxc_group) GETSTRUCT(tuple);
 +      nmembers = (int) groupForm->group_members.dim1;
 +      *members = (Oid *) palloc(nmembers * sizeof(Oid));
 +      memcpy(*members, groupForm->group_members.values, nmembers * sizeof(Oid));
 +
 +      ReleaseSysCache(tuple);
 +      return nmembers;
 +}
 +
++char *
++get_pgxc_groupname(Oid groupid)
++{
++      HeapTuple       tuple;
++      Form_pgxc_group     groupForm;
++      char            *result;
++
++      tuple = SearchSysCache1(PGXCGROUPOID,
++                      ObjectIdGetDatum(groupid));
++
++      if (!HeapTupleIsValid(tuple))
++              elog(ERROR, "cache lookup failed for group %u", groupid);
++
++      groupForm = (Form_pgxc_group) GETSTRUCT(tuple);
++      result = pstrdup(NameStr(groupForm->group_name));
++      ReleaseSysCache(tuple);
++      return result;
++}
 +/*
 + * get_pgxc_classnodes
 + *            Obtain PGXC class Datanode list for given relation Oid
 + *            Return number of Datanodes and their list
 + *
 + * Node list is returned as a palloc'd array
 + */
 +int
 +get_pgxc_classnodes(Oid tableid, Oid **nodes)
 +{
 +      HeapTuple               tuple;
 +      Form_pgxc_class         classForm;
 +      int                     numnodes;
 +
 +      tuple = SearchSysCache1(PGXCCLASSRELID, ObjectIdGetDatum(tableid));
 +
 +      if (!HeapTupleIsValid(tuple))
 +                      elog(ERROR, "cache lookup failed for relation %u", tableid);
 +
 +      classForm = (Form_pgxc_class) GETSTRUCT(tuple);
 +      numnodes = (int) classForm->nodeoids.dim1;
 +      *nodes = (Oid *) palloc(numnodes * sizeof(Oid));
 +      memcpy(*nodes, classForm->nodeoids.values, numnodes * sizeof(Oid));
 +
 +      ReleaseSysCache(tuple);
 +      return numnodes;
 +}
 +#endif
 +
  /*
   * get_typavgwidth
   *
index 652cdf188b0f36c6e36cd528be3cab50a11a8c30,4b5f8107ef096fc5bb0c7bdc82426df57aaa9306..61e3da9306031c67b445be750b961ba79cd190b8
@@@ -38,8 -38,7 +38,8 @@@
   * be infrequent enough that more-detailed tracking is not worth the effort.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
@@@ -155,11 -149,8 +158,11 @@@ InitPlanCache(void
   * commandTag: compile-time-constant tag for query, or NULL if empty query
   */
  CachedPlanSource *
- CreateCachedPlan(Node *raw_parse_tree,
+ CreateCachedPlan(RawStmt *raw_parse_tree,
                                 const char *query_string,
 +#ifdef PGXC
 +                               const char *stmt_name,
 +#endif
                                 const char *commandTag)
  {
        CachedPlanSource *plansource;
@@@ -1749,13 -1674,9 +1754,12 @@@ PlanCacheComputeResultDesc(List *stmt_l
  
        switch (ChoosePortalStrategy(stmt_list))
        {
 +#ifdef XCP
 +              case PORTAL_DISTRIBUTED:
 +#endif
                case PORTAL_ONE_SELECT:
                case PORTAL_ONE_MOD_WITH:
-                       query = (Query *) linitial(stmt_list);
-                       Assert(IsA(query, Query));
+                       query = linitial_node(Query, stmt_list);
                        return ExecCleanTypeFromTL(query->targetList, false);
  
                case PORTAL_ONE_RETURNING:
index 9d3e19617603adfea6a87cbce43369f8a245f659,c2e8361f2f4413a20345d7a3584084fb1a8e65a9..02df47433965334e14664558a8f64e56be7123da
@@@ -3,10 -3,8 +3,10 @@@
   * relcache.c
   *      POSTGRES relation descriptor cache code
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *
   * IDENTIFICATION
index 1f32c421078f83efd85363034414f0c96929683c,922718c9d17378d4fb230c775a5c7bf1c86e27c4..f18dbb31b0e35d2cbf29bc03020946b21ea40981
@@@ -3,9 -3,8 +3,9 @@@
   * syscache.c
   *      System cache management routines
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   *
   * IDENTIFICATION
@@@ -574,74 -573,17 +579,85 @@@ static const struct cachedesc cacheinfo
                },
                8
        },
 +#ifdef PGXC
 +      {PgxcClassRelationId,   /* PGXCCLASSRELID */
 +              PgxcClassPgxcRelIdIndexId,
 +              1,
 +              {
 +                      Anum_pgxc_class_pcrelid,
 +                      0,
 +                      0,
 +                      0
 +              },
 +              1024
 +      },
 +      {PgxcGroupRelationId,   /* PGXCGROUPNAME */
 +              PgxcGroupGroupNameIndexId,
 +              1,
 +              {
 +                      Anum_pgxc_group_name,
 +                      0,
 +                      0,
 +                      0
 +              },
 +              256
 +      },
 +      {PgxcGroupRelationId,   /* PGXCGROUPOID */
 +              PgxcGroupOidIndexId,
 +              1,
 +              {
 +                      ObjectIdAttributeNumber,
 +                      0,
 +                      0,
 +                      0
 +              },
 +              256
 +      },
 +      {PgxcNodeRelationId,    /* PGXCNODENAME */
 +              PgxcNodeNodeNameIndexId,
 +              1,
 +              {
 +                      Anum_pgxc_node_name,
 +                      0,
 +                      0,
 +                      0
 +              },
 +              256
 +      },
 +      {PgxcNodeRelationId,    /* PGXCNODEOID */
 +              PgxcNodeOidIndexId,
 +              1,
 +              {
 +                      ObjectIdAttributeNumber,
 +                      0,
 +                      0,
 +                      0
 +              },
 +              256
 +      },
 +      {PgxcNodeRelationId,    /* PGXCNODEIDENTIFIER */
 +              PgxcNodeNodeIdIndexId,
 +              1,
 +              {
 +                      Anum_pgxc_node_id,
 +                      0,
 +                      0,
 +                      0
 +              },
 +              256
 +      },
 +#endif
+       {PartitionedRelationId,         /* PARTRELID */
+               PartitionedRelidIndexId,
+               1,
+               {
+                       Anum_pg_partitioned_table_partrelid,
+                       0,
+                       0,
+                       0
+               },
+               32
+       },
        {ProcedureRelationId,           /* PROCNAMEARGSNSP */
                ProcedureNameArgsNspIndexId,
                3,
Simple merge
index 2da98efa33080725ebd28042beb52d3ba5e02133,22004cb81920ada33db7ab47e9dbea6776f94f80..35ed690931da39bfbe61afc8632ad9438556aac5
  #include "utils/guc.h"
  #include "utils/memutils.h"
  #include "utils/ps_status.h"
 +#ifdef PGXC
 +#include "pgxc/pgxc.h"
 +#include "pgxc/execRemote.h"
 +#endif
  
  
+ /* In this module, access gettext() via err_gettext() */
  #undef _
  #define _(x) err_gettext(x)
  
- static const char *err_gettext(const char *str) pg_attribute_format_arg(1);
- static void set_errdata_field(MemoryContextData *cxt, char **ptr, const char *str);
 +#ifdef USE_MODULE_MSGIDS
 +static void AtProcExit_MsgModule(int code, Datum arg);
 +static bool pg_msgmodule_enable_disable(int32 pid, bool enable);
 +#endif
  
  /* Global variables */
  ErrorContextCallback *error_context_stack = NULL;
@@@ -188,41 -182,8 +192,37 @@@ static const char *useful_strerror(int 
  static const char *get_errno_symbol(int errnum);
  static const char *error_severity(int elevel);
  static void append_with_tabs(StringInfo buf, const char *str);
 -static bool is_log_level_output(int elevel, int log_min_level);
 +static bool is_log_level_output(int elevel,
 +#ifdef USE_MODULE_MSGIDS
 +              int moduleid,
 +              int fileid,
 +              int msgid,
 +#endif
 +              int log_min_level);
- static void write_pipe_chunks(char *data, int len, int dest);
- static void write_csvlog(ErrorData *edata);
- static void setup_formatted_log_time(void);
- static void setup_formatted_start_time(void);
  
 +#ifdef USE_MODULE_MSGIDS
 +typedef struct MsgModuleCtlStruct
 +{
 +      bool    mm_enabled;
 +      bool    mm_persistent;
 +      char    mm_flags[FLEXIBLE_ARRAY_MEMBER];
 +} MsgModuleCtlStruct;
 +
 +#define StartOfBackendFlags   \
 +      ( \
 +        PGXL_MSG_MAX_MODULES * \
 +        PGXL_MSG_MAX_FILEIDS_PER_MODULE * \
 +        PGXL_MSG_MAX_MSGIDS_PER_FILE \
 +      )
 +
 +#define SizeOfMsgModuleCtlStruct      \
 +      ( \
 +        offsetof(MsgModuleCtlStruct, mm_flags) + \
 +        StartOfBackendFlags + \
 +        MaxBackends \
 +      )
 +static MsgModuleCtlStruct *MsgModuleCtl;
 +#endif
  
  /*
   * in_error_recursion_trouble --- are we at risk of infinite error recursion?
@@@ -1674,18 -1615,15 +1677,19 @@@ ThrowErrorData(ErrorData *edata
        MemoryContext oldcontext;
  
        if (!errstart(edata->elevel, edata->filename, edata->lineno,
 +#ifdef USE_MODULE_MSGIDS
 +                              edata->moduleid,
 +                              edata->fileid, edata->msgid,
 +#endif
                                  edata->funcname, NULL))
-               return;
+               return;                                 /* error is not to be reported at all */
  
        newedata = &errordata[errordata_stack_depth];
-       oldcontext = MemoryContextSwitchTo(edata->assoc_context);
+       recursion_depth++;
+       oldcontext = MemoryContextSwitchTo(newedata->assoc_context);
  
-       /* Copy the supplied fields to the error stack. */
-       if (edata->sqlerrcode > 0)
+       /* Copy the supplied fields to the error stack entry. */
+       if (edata->sqlerrcode != 0)
                newedata->sqlerrcode = edata->sqlerrcode;
        if (edata->message)
                newedata->message = pstrdup(edata->message);
index 5cb9a138a544f5b7fcc74eef1d202c34f5d0fdb0,08b6030a649621d35b7b581a46233e94115002d8..b0ec4a2d279e33d4bb4bf66aea51405ad39a9208
@@@ -3,8 -3,7 +3,8 @@@
   * globals.c
   *      global variable declarations
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
index 1f59d7acf8bedb294c820a8e8778d895afb1d1b9,8d149bf2728cb5c9ee35d90e9481242facbbddda..d987dac8f508df248143ce467dcb974557d21d26
@@@ -3,8 -3,7 +3,8 @@@
   * miscinit.c
   *      miscellaneous initialization support stuff
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
  #include "libpq/libpq.h"
  #include "mb/pg_wchar.h"
  #include "miscadmin.h"
 +#ifdef XCP
 +#include "pgxc/execRemote.h"
 +#endif
+ #include "pgstat.h"
  #include "postmaster/autovacuum.h"
  #include "postmaster/postmaster.h"
  #include "storage/fd.h"
  #include "utils/builtins.h"
  #include "utils/guc.h"
  #include "utils/memutils.h"
 +#ifdef XCP
 +#include "utils/snapmgr.h"
 +#endif
  #include "utils/syscache.h"
 +#include "utils/lsyscache.h"
+ #include "utils/varlena.h"
  
  
  #define DIRECTORY_LOCK_FILE           "postmaster.pid"
index 2355321549e8b96b37ee7dae9c372cc40a07cb8e,b8b4a06350c50858c66a41f6d70b46e74303c0a4..778e7fc47274e6f91a68968d57b79088b5aef277
@@@ -3,8 -3,7 +3,8 @@@
   * postinit.c
   *      postgres initialization utilities
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@@ -356,11 -349,8 +354,11 @@@ CheckMyDatabase(const char *name, bool 
                 * just document that the connection limit is approximate.
                 */
                if (dbform->datconnlimit >= 0 &&
 +#ifdef XCP
 +                      IS_PGXC_COORDINATOR &&
 +#endif
                        !am_superuser &&
-                       CountDBBackends(MyDatabaseId) > dbform->datconnlimit)
+                       CountDBConnections(MyDatabaseId) > dbform->datconnlimit)
                        ereport(FATAL,
                                        (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
                                         errmsg("too many connections for database \"%s\"",
@@@ -674,8 -664,13 +672,13 @@@ InitPostgres(const char *in_dbname, Oi
        before_shmem_exit(ShutdownPostgres, 0);
  
        /* The autovacuum launcher is done here */
 -      if (IsAutoVacuumLauncherProcess())
 +      if (IsAutoVacuumLauncherProcess() || IsClusterMonitorProcess())
+       {
+               /* report this backend in the PgBackendStatus array */
+               pgstat_bestart();
                return;
+       }
  
        /*
         * Start a new transaction here before first access to db, and get a
index cf1a41fa71e201b260ca2a4f59f17b7e2e30fdbe,92e1d63b2f5ec710b03639b3bbecd2f51d7bcc35..f7391cc6b8f4cb3c846164444eac57bc2ad120d4
@@@ -6,8 -6,7 +6,8 @@@
   * See src/backend/utils/misc/README for more information.
   *
   *
-  * Copyright (c) 2000-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Copyright (c) 2000-2017, PostgreSQL Global Development Group
   * Written by Peter Eisentraut <[email protected]>.
   *
   * IDENTIFICATION
  
  #include "access/commit_ts.h"
  #include "access/gin.h"
 +#ifdef PGXC
 +#include "access/gtm.h"
 +#include "pgxc/pgxc.h"
 +#endif
+ #include "access/rmgr.h"
  #include "access/transam.h"
  #include "access/twophase.h"
  #include "access/xact.h"
  #include "parser/parser.h"
  #include "parser/scansup.h"
  #include "pgstat.h"
 +#ifdef PGXC
 +#include "commands/tablecmds.h"
 +#include "commands/trigger.h"
 +#include "nodes/nodes.h"
 +#include "pgxc/execRemote.h"
 +#include "pgxc/locator.h"
 +#include "pgxc/planner.h"
 +#include "pgxc/poolmgr.h"
 +#include "pgxc/nodemgr.h"
 +#include "pgxc/xc_maintenance_mode.h"
 +#include "storage/procarray.h"
 +#endif
 +#ifdef XCP
 +#include "commands/sequence.h"
 +#include "parser/parse_utilcmd.h"
 +#include "pgxc/nodemgr.h"
 +#include "pgxc/squeue.h"
 +#include "utils/snapmgr.h"
 +#endif
  #include "postmaster/autovacuum.h"
- #include "postmaster/bgworker.h"
+ #include "postmaster/bgworker_internals.h"
  #include "postmaster/bgwriter.h"
  #include "postmaster/postmaster.h"
  #include "postmaster/syslogger.h"
@@@ -956,47 -901,16 +980,57 @@@ static struct config_bool ConfigureName
                true,
                NULL, NULL, NULL
        },
 +#ifdef PGXC
 +      {
 +              {"enable_fast_query_shipping", PGC_USERSET, QUERY_TUNING_METHOD,
 +                      gettext_noop("Enables the planner's use of fast query shipping to ship query directly to datanode."),
 +                      NULL
 +              },
 +              &enable_fast_query_shipping,
 +              true,
 +              NULL, NULL, NULL
 +      },
 +      {
 +              {"loose_constraints", PGC_USERSET, COORDINATORS,
 +                      gettext_noop("Relax enforcing of constraints"),
 +                      gettext_noop("If enabled then constraints like foreign keys "
 +                                               "are not enforced. It's the users responsibility "
 +                                               "to maintain referential integrity at the application "
 +                                               "level")
 +              },
 +              &loose_constraints,
 +              false,
 +              NULL, NULL, NULL
 +      },
 +      {
 +              {"gtm_backup_barrier", PGC_SUSET, QUERY_TUNING_METHOD,
 +                      gettext_noop("Enables coordinator to report barrier id to GTM for backup."),
 +                      NULL
 +              },
 +              &gtm_backup_barrier,
 +              false,
 +              NULL, NULL, NULL
 +      },
 +      {
 +              {"enable_datanode_row_triggers", PGC_POSTMASTER, DEVELOPER_OPTIONS,
 +                      gettext_noop("Enables datanode-only ROW triggers"),
 +                      NULL
 +              },
 +              &enable_datanode_row_triggers,
 +              false,
 +              NULL, NULL, NULL
 +      },
 +#endif
+       {
+               {"enable_gathermerge", PGC_USERSET, QUERY_TUNING_METHOD,
+                       gettext_noop("Enables the planner's use of gather merge plans."),
+                       NULL
+               },
+               &enable_gathermerge,
+               true,
+               NULL, NULL, NULL
+       },
        {
                {"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
                        gettext_noop("Enables genetic query optimization."),
@@@ -3866,41 -3576,8 +3945,41 @@@ static struct config_string ConfigureNa
                check_TSCurrentConfig, assign_TSCurrentConfig, NULL
        },
  
 +#ifdef PGXC
 +      {
 +              {"gtm_host", PGC_POSTMASTER, GTM,
 +                      gettext_noop("Host name or address of GTM"),
 +                      NULL
 +              },
 +              &GtmHost,
 +              "localhost",
 +              NULL, NULL, NULL
 +      },
 +
 +      {
 +              {"pgxc_node_name", PGC_POSTMASTER, GTM,
 +                      gettext_noop("The Coordinator or Datanode name."),
 +                      NULL,
 +                      GUC_NO_RESET_ALL | GUC_IS_NAME
 +              },
 +              &PGXCNodeName,
 +              "",
 +              NULL, NULL, NULL
 +      },
 +#endif
 +#ifdef XCP
 +      {
 +              {"parentnode", PGC_BACKEND, CONN_AUTH,
 +                      gettext_noop("Sets the name of the parent data node"),
 +                      NULL
 +              },
 +              &parentPGXCNode,
 +              NULL,
 +              NULL, NULL, NULL
 +      },
 +#endif /* XCP */
        {
-               {"ssl_ciphers", PGC_POSTMASTER, CONN_AUTH_SECURITY,
+               {"ssl_ciphers", PGC_SIGHUP, CONN_AUTH_SECURITY,
                        gettext_noop("Sets the list of allowed SSL ciphers."),
                        NULL,
                        GUC_SUPERUSER_ONLY
index 2163979637cdaffd5fd60341de4fdd53cec32f63,fceef14c78ae565c658f95688b1d3e2a6b1ae910..cd07343fd4490de02f0da52f3bf380e403ea78ff
  
  #effective_io_concurrency = 1         # 1-1000; 0 disables prefetching
  #max_worker_processes = 8             # (change requires restart)
- #max_parallel_workers_per_gather = 2  # taken from max_worker_processes
+ #max_parallel_workers_per_gather = 2  # taken from max_parallel_workers
+ #max_parallel_workers = 8         # maximum number of max_worker_processes that
+                                       # can be used in parallel queries
  #old_snapshot_threshold = -1          # 1min-60d; -1 disables; 0 is immediate
-                                                                       # (change requires restart)
- #backend_flush_after = 0              # 0 disables, default is 0
+                                       # (change requires restart)
+ #backend_flush_after = 0              # measured in pages, 0 disables
  
 +# - Shared queues -
 +
 +#shared_queues = 64                   # min 16   
 +#shared_queue_size = 64KB             # min 16KB
  
  #------------------------------------------------------------------------------
  # WRITE AHEAD LOG
  #cpu_tuple_cost = 0.01                        # same scale as above
  #cpu_index_tuple_cost = 0.005         # same scale as above
  #cpu_operator_cost = 0.0025           # same scale as above
 +#network_byte_cost = 0.001            # same scale as above
 +#remote_query_cost = 100.0            # same scale as above
  #parallel_tuple_cost = 0.1            # same scale as above
  #parallel_setup_cost = 1000.0 # same scale as above
- #min_parallel_relation_size = 8MB
+ #min_parallel_table_scan_size = 8MB
+ #min_parallel_index_scan_size = 512kB
  #effective_cache_size = 4GB
  
  # - Genetic Query Optimizer -
index 6b3f3dc7d9147ccf9ae659658a2cef86c92eefd3,6668bf135e9a81435c9a74bdec9898c0d5d6f67a..73c9fba2b1dbb0ecf1bd1da7117f2fd83cbc8a2e
@@@ -1189,26 -1094,16 +1098,40 @@@ pnstrdup(const char *in, Size len
        return out;
  }
  
 +#ifdef PGXC
 +#include "gen_alloc.h"
 +
 +void *current_memcontext(void);
 +
 +void *current_memcontext()
 +{
 +      return((void *)CurrentMemoryContext);
 +}
 +
 +void *allocTopCxt(size_t s)
 +{
 +      return MemoryContextAlloc(TopMemoryContext, (Size)s);
 +}
 +
 +Gen_Alloc genAlloc_class = {(void *)MemoryContextAlloc,
 +                                                      (void *)MemoryContextAllocZero,
 +                                                      (void *)repalloc,
 +                                                      (void *)pfree,
 +                                                      (void *)current_memcontext,
 +                                                      (void *)allocTopCxt};
 +
 +#endif
++
+ /*
+  * Make copy of string with all trailing newline characters removed.
+  */
+ char *
+ pchomp(const char *in)
+ {
+       size_t          n;
+       n = strlen(in);
+       while (n > 0 && in[n - 1] == '\n')
+               n--;
+       return pnstrdup(in, n);
+ }
index 776d2ae893b129664b965d165ff8707684d17b22,5983aedb121278dd13825b925c206643c2192ae4..62d96a01978e316f5823dc7a86da0e8b6dd1cb91
@@@ -8,8 -8,7 +8,8 @@@
   * doesn't actually run the executor for them.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
index 73801c180b591adf36925e66744733b8ca141934,af46d781253e2834e0c330e7fc83776710e53847..4c3654f8096ed64e62c0a84f8be0b1b513bb296b
@@@ -679,19 -668,6 +679,17 @@@ ResourceOwnerReleaseInternal(ResourceOw
                                PrintFileLeakWarning(res);
                        FileClose(res);
                }
-               /* Clean up index scans too */
-               ReleaseResources_hash();
 +
 +              /* Ditto for prepared statements */
 +              while (ResourceArrayGetAny(&(owner->prepstmts), &foundres))
 +              {
 +                      char *stmt = (char *) DatumGetPointer(foundres);
 +
 +                      if (isCommit)
 +                              PrintPreparedStmtLeakWarning(stmt);
 +                      DropPreparedStatement(stmt, false);
 +              }
 +
        }
  
        /* Let add-on modules get a chance too */
index d2ba7d968c1eda99b813ea4880d1543ea8691e94,8a8db0fd337b874069054cfc841fe11048752eeb..e5bc08fff924a7b550789824b02767025d6ba2e3
   * code we determine the number of tapes M on the basis of workMem: we want
   * workMem/M to be large enough that we read a fair amount of data each time
   * we preread from a tape, so as to maintain the locality of access described
-  * above.  Nonetheless, with large workMem we can have many tapes.
+  * above.  Nonetheless, with large workMem we can have many tapes (but not
+  * too many -- see the comments in tuplesort_merge_order).
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
@@@ -317,22 -324,6 +335,14 @@@ struct Tuplesortstat
        void            (*readtup) (Tuplesortstate *state, SortTuple *stup,
                                                                                int tapenum, unsigned int len);
  
-       /*
-        * Function to move a caller tuple.  This is usually implemented as a
-        * memmove() shim, but function may also perform additional fix-up of
-        * caller tuple where needed.  Batch memory support requires the movement
-        * of caller tuples from one location in memory to another.
-        */
-       void            (*movetup) (void *dest, void *src, unsigned int len);
 +#ifdef PGXC
 +      /*
 +       * Function to read length of next stored tuple.
 +       * Used as 'len' parameter for readtup function.
 +       */
 +      unsigned int (*getlen) (Tuplesortstate *state, int tapenum, bool eofOK);
 +#endif
 +
        /*
         * This array holds the tuples now in sort memory.  If we are in state
         * INITIAL, the tuples are in no particular order; if we are in state
  #define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
  #define WRITETUP(state,tape,stup)     ((*(state)->writetup) (state, tape, stup))
  #define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
- #define MOVETUP(dest,src,len) ((*(state)->movetup) (dest, src, len))
- #define LACKMEM(state)                ((state)->availMem < 0 && !(state)->batchUsed)
 +#ifdef PGXC
 +#define GETLEN(state,tape,eofOK) ((*(state)->getlen) (state, tape, eofOK))
 +#endif
+ #define LACKMEM(state)                ((state)->availMem < 0 && !(state)->slabAllocatorUsed)
  #define USEMEM(state,amt)     ((state)->availMem -= (amt))
  #define FREEMEM(state,amt)    ((state)->availMem += (amt))
  
@@@ -604,13 -612,6 +634,12 @@@ static void writetup_heap(Tuplesortstat
                          SortTuple *stup);
  static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
                         int tapenum, unsigned int len);
- static void movetup_heap(void *dest, void *src, unsigned int len);
 +#ifdef PGXC
 +static unsigned int getlen_datanode(Tuplesortstate *state, int tapenum,
 +                              bool eofOK);
 +static void readtup_datanode(Tuplesortstate *state, SortTuple *stup,
 +                               int tapenum, unsigned int len);
 +#endif
  static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
                                   Tuplesortstate *state);
  static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
@@@ -792,10 -786,6 +814,9 @@@ tuplesort_begin_heap(TupleDesc tupDesc
        state->copytup = copytup_heap;
        state->writetup = writetup_heap;
        state->readtup = readtup_heap;
-       state->movetup = movetup_heap;
 +#ifdef PGXC
 +      state->getlen = getlen;
 +#endif
  
        state->tupDesc = tupDesc;       /* assume we need not copy tupDesc */
        state->abbrevNext = 10;
@@@ -868,10 -858,6 +889,9 @@@ tuplesort_begin_cluster(TupleDesc tupDe
        state->copytup = copytup_cluster;
        state->writetup = writetup_cluster;
        state->readtup = readtup_cluster;
-       state->movetup = movetup_cluster;
 +#ifdef PGXC
 +      state->getlen = getlen;
 +#endif
        state->abbrevNext = 10;
  
        state->indexInfo = BuildIndexInfo(indexRel);
@@@ -963,10 -949,6 +983,9 @@@ tuplesort_begin_index_btree(Relation he
        state->copytup = copytup_index;
        state->writetup = writetup_index;
        state->readtup = readtup_index;
-       state->movetup = movetup_index;
 +#ifdef PGXC
 +      state->getlen = getlen;
 +#endif
        state->abbrevNext = 10;
  
        state->heapRel = heapRel;
@@@ -1034,10 -1021,6 +1058,9 @@@ tuplesort_begin_index_hash(Relation hea
        state->copytup = copytup_index;
        state->writetup = writetup_index;
        state->readtup = readtup_index;
-       state->movetup = movetup_index;
 +#ifdef PGXC
 +      state->getlen = getlen;
 +#endif
  
        state->heapRel = heapRel;
        state->indexRel = indexRel;
@@@ -1080,10 -1065,6 +1105,9 @@@ tuplesort_begin_datum(Oid datumType, Oi
        state->copytup = copytup_datum;
        state->writetup = writetup_datum;
        state->readtup = readtup_datum;
-       state->movetup = movetup_datum;
 +#ifdef PGXC
 +      state->getlen = getlen;
 +#endif
        state->abbrevNext = 10;
  
        state->datumType = datumType;
        return state;
  }
  
-       state->batchUsed = false;
 +#ifdef PGXC
 +/*
 + * Tuples are coming from source where they are already sorted.
 + * It is pretty much like sorting heap tuples but no need to load sorter.
 + * Sorter initial status is final merge, and correct readtup and getlen
 + * callbacks should be passed in.
 + * Usage pattern of the merge sorter
 + * tuplesort_begin_merge
 + * while (tuple = tuplesort_gettuple())
 + * {
 + *     // process
 + * }
 + * tuplesort_end_merge
 + */
 +Tuplesortstate *
 +tuplesort_begin_merge(TupleDesc tupDesc,
 +                                       int nkeys, AttrNumber *attNums,
 +                                       Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags,
 +                                       ResponseCombiner *combiner,
 +                                       int workMem)
 +{
 +      Tuplesortstate *state = tuplesort_begin_common(workMem, false);
 +      MemoryContext oldcontext;
 +      int                     i;
 +
 +      oldcontext = MemoryContextSwitchTo(state->sortcontext);
 +
 +      AssertArg(nkeys > 0);
 +      AssertArg(combiner);
 +
 +#ifdef TRACE_SORT
 +      if (trace_sort)
 +              elog(LOG,
 +                       "begin merge sort: nkeys = %d, workMem = %d", nkeys, workMem);
 +#endif
 +
 +      state->nKeys = nkeys;
 +
 +      TRACE_POSTGRESQL_SORT_START(MERGE_SORT,
 +                                                              false,  /* no unique check */
 +                                                              nkeys,
 +                                                              workMem,
 +                                                              false);
 +
 +      state->combiner = combiner;
 +      state->comparetup = comparetup_heap;
 +      state->copytup = NULL;
 +      state->writetup = NULL;
 +      state->readtup = readtup_datanode;
 +      state->getlen = getlen_datanode;
 +
 +      state->tuples = false;
-       /*
-        * logical tape in this case is a sorted stream
-        */
-       state->maxTapes = combiner->conn_count;
-       state->tapeRange = combiner->conn_count;
-       state->mergeactive = (bool *) palloc0(combiner->conn_count * sizeof(bool));
-       state->mergenext = (int *) palloc0(combiner->conn_count * sizeof(int));
-       state->mergelast = (int *) palloc0(combiner->conn_count * sizeof(int));
-       state->mergeavailslots = (int *) palloc0(combiner->conn_count * sizeof(int));
-       state->mergeavailmem = (int64 *) palloc0(combiner->conn_count * sizeof(int64));
-       state->mergetuples = (char **) palloc0(combiner->conn_count * sizeof(char *));
-       state->mergecurrent = (char **) palloc0(combiner->conn_count * sizeof(char *));
-       state->mergetail = (char **) palloc0(combiner->conn_count * sizeof(char *));
-       state->mergeoverflow = (char **) palloc0(combiner->conn_count * sizeof(char *));
 +
 +      state->tupDesc = tupDesc;       /* assume we need not copy tupDesc */
 +      state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
 +
 +      for (i = 0; i < nkeys; i++)
 +      {
 +              SortSupport sortKey = state->sortKeys + i;
 +
 +              AssertArg(attNums[i] != 0);
 +              AssertArg(sortOperators[i] != 0);
 +
 +              sortKey->ssup_cxt = CurrentMemoryContext;
 +              sortKey->ssup_collation = sortCollations[i];
 +              sortKey->ssup_nulls_first = nullsFirstFlags[i];
 +              sortKey->ssup_attno = attNums[i];
 +
 +              PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
 +      }
 +
-       beginmerge(state, state->tuples);
 +      state->tp_runs = (int *) palloc0(combiner->conn_count * sizeof(int));
 +      state->tp_dummy = (int *) palloc0(combiner->conn_count * sizeof(int));
 +      state->tp_tapenum = (int *) palloc0(combiner->conn_count * sizeof(int));
 +      /* mark each stream (tape) has one run */
 +      for (i = 0; i < combiner->conn_count; i++)
 +      {
 +              state->tp_runs[i] = 1;
 +              state->tp_tapenum[i] = i;
 +      }
++      beginmerge(state);
 +      state->status = TSS_FINALMERGE;
 +
 +      MemoryContextSwitchTo(oldcontext);
 +
 +      return state;
 +}
 +#endif
 +
  /*
   * tuplesort_set_bound
   *
@@@ -3243,169 -2898,27 +3030,31 @@@ beginmerge(Tuplesortstate *state
  }
  
  /*
-  * mergeprereadone - load tuples from one merge input tape
+  * mergereadnext - read next tuple from one merge input tape
   *
-  * Read tuples from the specified tape until it has used up its free memory
-  * or array slots; but ensure that we have at least one tuple, if any are
-  * to be had.
+  * Returns false on EOF.
   */
- static void
- mergeprereadone(Tuplesortstate *state, int srcTape)
+ static bool
+ mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup)
  {
        unsigned int tuplen;
-       SortTuple       stup;
-       int                     tupIndex;
-       int64           priorAvail,
-                               spaceUsed;
  
        if (!state->mergeactive[srcTape])
-               return;                                 /* tape's run is already exhausted */
-       /*
-        * Manage per-tape availMem.  Only actually matters when batch memory not
-        * in use.
-        */
-       priorAvail = state->availMem;
-       state->availMem = state->mergeavailmem[srcTape];
+               return false;                   /* tape's run is already exhausted */
  
-       /*
-        * When batch memory is used if final on-the-fly merge, only mergeoverflow
-        * test is relevant; otherwise, only LACKMEM() test is relevant.
-        */
-       while ((state->mergeavailslots[srcTape] > 0 &&
-                       state->mergeoverflow[srcTape] == NULL && !LACKMEM(state)) ||
-                  state->mergenext[srcTape] == 0)
-       {
-               /* read next tuple, if any */
+       /* read next tuple, if any */
 +#ifdef PGXC
 +              if ((tuplen = GETLEN(state, srcTape, true)) == 0)
 +#else
-               if ((tuplen = getlen(state, srcTape, true)) == 0)
+       if ((tuplen = getlen(state, srcTape, true)) == 0)
 +#endif
-               {
-                       state->mergeactive[srcTape] = false;
-                       break;
-               }
-               READTUP(state, &stup, srcTape, tuplen);
-               /* find a free slot in memtuples[] for it */
-               tupIndex = state->mergefreelist;
-               if (tupIndex)
-                       state->mergefreelist = state->memtuples[tupIndex].tupindex;
-               else
-               {
-                       tupIndex = state->mergefirstfree++;
-                       Assert(tupIndex < state->memtupsize);
-               }
-               state->mergeavailslots[srcTape]--;
-               /* store tuple, append to list for its tape */
-               stup.tupindex = 0;
-               state->memtuples[tupIndex] = stup;
-               if (state->mergelast[srcTape])
-                       state->memtuples[state->mergelast[srcTape]].tupindex = tupIndex;
-               else
-                       state->mergenext[srcTape] = tupIndex;
-               state->mergelast[srcTape] = tupIndex;
+       {
+               state->mergeactive[srcTape] = false;
+               return false;
        }
-       /* update per-tape and global availmem counts */
-       spaceUsed = state->mergeavailmem[srcTape] - state->availMem;
-       state->mergeavailmem[srcTape] = state->availMem;
-       state->availMem = priorAvail - spaceUsed;
+       READTUP(state, stup, srcTape, tuplen);
+       return true;
  }
  
  /*
@@@ -4263,60 -3789,6 +3925,54 @@@ readtup_heap(Tuplesortstate *state, Sor
                                                                &stup->isnull1);
  }
  
- static void
- movetup_heap(void *dest, void *src, unsigned int len)
- {
-       memmove(dest, src, len);
- }
 +#ifdef PGXC
 +static unsigned int
 +getlen_datanode(Tuplesortstate *state, int tapenum, bool eofOK)
 +{
 +      ResponseCombiner *combiner = state->combiner;
 +      TupleTableSlot   *dstslot = combiner->ss.ps.ps_ResultTupleSlot;
 +      TupleTableSlot   *slot;
 +
 +      combiner->current_conn = tapenum;
 +      slot = FetchTuple(combiner);
 +      if (TupIsNull(slot))
 +      {
 +              if (eofOK)
 +                      return 0;
 +              else
 +                      elog(ERROR, "unexpected end of data");
 +      }
 +
 +      if (slot != dstslot)
 +              ExecCopySlot(dstslot, slot);
 +
 +      return 1;
 +}
 +
 +static void
 +readtup_datanode(Tuplesortstate *state, SortTuple *stup,
 +                               int tapenum, unsigned int len)
 +{
 +      TupleTableSlot *slot = state->combiner->ss.ps.ps_ResultTupleSlot;
 +      MinimalTuple tuple;
 +      HeapTupleData htup;
 +
 +      Assert(!TupIsNull(slot));
 +
 +      /* copy the tuple into sort storage */
 +      tuple = ExecCopySlotMinimalTuple(slot);
 +      stup->tuple = (void *) tuple;
 +      USEMEM(state, GetMemoryChunkSpace(tuple));
 +      /* set up first-column key value */
 +      htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
 +      htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
 +      stup->datum1 = heap_getattr(&htup,
 +                                                              state->sortKeys[0].ssup_attno,
 +                                                              state->tupDesc,
 +                                                              &stup->isnull1);
 +}
 +#endif /* PGXC */
 +
  /*
   * Routines specialized for the CLUSTER case (HeapTuple data, with
   * comparisons per a btree index definition)
index 24b51bf28b346e94551d14455e0e0d4f30790acf,b3f6be74573807308bf6022c762238266c66e454..9cbce9e59894e35a39cab610e3d42168211afdc9
@@@ -43,8 -43,7 +43,8 @@@
   * before switching to the other state or activating a different read pointer.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
@@@ -134,11 -109,9 +134,12 @@@ struct Tuplestorestat
        bool            truncated;              /* tuplestore_trim has removed tuples? */
        int64           availMem;               /* remaining memory available, in bytes */
        int64           allowedMem;             /* total memory allowed, in bytes */
+       int64           tuples;                 /* number of tuples added */
        BufFile    *myfile;                     /* underlying file, or NULL if none */
        MemoryContext context;          /* memory context for holding tuples */
 +#ifdef XCP
 +      MemoryContext tmpcxt;           /* memory context for holding temporary data */
 +#endif
        ResourceOwner resowner;         /* resowner for holding temp files */
  
        /*
@@@ -842,8 -768,7 +857,9 @@@ tuplestore_puttuple_common(Tuplestorest
        int                     i;
        ResourceOwner oldowner;
  
 +      if (state->stat_name)
 +              state->stat_write_count++;
+       state->tuples++;
  
        switch (state->status)
        {
index 6923149fab380bb1f906029e9ff62b9d65049691,baff998641a64b274636b54c8d41b50a93c6ac67..e72547e8794c1804d89e9f165575449b36eaaf60
@@@ -30,8 -30,7 +30,8 @@@
   * destroyed at the end of each transaction.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
index ff7362cb49cc921ad45fc01abea146ecd7848e2a,b3d4fe3ae2a9f682883fabace1821e8537b45a9e..f89d635162254025b77196ed68799cd90d1769ed
   * transaction).
   *
   * These arrangements let us reset MyPgXact->xmin when there are no snapshots
-  * referenced by this transaction.  (One possible improvement would be to be
-  * able to advance Xmin when the snapshot with the earliest Xmin is no longer
-  * referenced.  That's a bit harder though, it requires more locking, and
-  * anyway it should be rather uncommon to keep temporary snapshots referenced
-  * for too long.)
+  * referenced by this transaction, and advance it when the one with oldest
+  * Xmin is no longer referenced.  For simplicity however, only registered
+  * snapshots not active snapshots participate in tracking which one is oldest;
+  * we don't try to change MyPgXact->xmin except when the active-snapshot
+  * stack is empty.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
@@@ -348,48 -349,19 +352,45 @@@ GetTransactionSnapshot(void
                        pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node);
                }
                else
 -                      CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
 +                      CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData, false);
  
-               /* Don't allow catalog snapshot to be older than xact snapshot. */
-               CatalogSnapshotStale = true;
                FirstSnapshotSet = true;
                return CurrentSnapshot;
        }
  
        if (IsolationUsesXactSnapshot())
 +      {
 +#ifdef PGXC
 +              /*
 +               * Consider this test case taken from portals.sql
 +               *
 +               * CREATE TABLE cursor (a int, b int) distribute by replication;
 +               * INSERT INTO cursor VALUES (10);
 +               * BEGIN;
 +               * SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;
 +               * DECLARE c1 NO SCROLL CURSOR FOR SELECT * FROM cursor FOR UPDATE;
 +               * INSERT INTO cursor VALUES (2);
 +               * FETCH ALL FROM c1;
 +               * would result in
 +               * ERROR:  attempted to lock invisible tuple
 +               * because FETCH would be sent as a select to the remote nodes
 +               * with command id 0, whereas the command id would be 2
 +               * in the current snapshot.
 +               * (1 sent by Coordinator due to declare cursor &
 +               *  2 because of the insert inside the transaction)
 +               * The command id should therefore be updated in the
 +               * current snapshot.
 +               */
 +              if (IsConnFromCoord() || IsConnFromDatanode())
 +                      SnapshotSetCommandId(GetCurrentCommandId(false));
 +#endif
                return CurrentSnapshot;
 +      }
  
        /* Don't allow catalog snapshot to be older than xact snapshot. */
-       CatalogSnapshotStale = true;
+       InvalidateCatalogSnapshot();
  
 -      CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
 +      CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData, false);
  
        return CurrentSnapshot;
  }
@@@ -492,20 -464,29 +493,29 @@@ GetNonHistoricCatalogSnapshot(Oid relid
         * scan a relation for which neither catcache nor snapshot invalidations
         * are sent, we must refresh the snapshot every time.
         */
-       if (!CatalogSnapshotStale && !RelationInvalidatesSnapshotsOnly(relid) &&
+       if (CatalogSnapshot &&
+               !RelationInvalidatesSnapshotsOnly(relid) &&
                !RelationHasSysCache(relid))
-               CatalogSnapshotStale = true;
+               InvalidateCatalogSnapshot();
  
-       if (CatalogSnapshotStale)
+       if (CatalogSnapshot == NULL)
        {
                /* Get new snapshot. */
 -              CatalogSnapshot = GetSnapshotData(&CatalogSnapshotData);
 +              CatalogSnapshot = GetSnapshotData(&CatalogSnapshotData, true);
  
                /*
-                * Mark new snapshost as valid.  We must do this last, in case an
-                * ERROR occurs inside GetSnapshotData().
+                * Make sure the catalog snapshot will be accounted for in decisions
+                * about advancing PGXACT->xmin.  We could apply RegisterSnapshot, but
+                * that would result in making a physical copy, which is overkill; and
+                * it would also create a dependency on some resource owner, which we
+                * do not want for reasons explained at the head of this file. Instead
+                * just shove the CatalogSnapshot into the pairing heap manually. This
+                * has to be reversed in InvalidateCatalogSnapshot, of course.
+                *
+                * NB: it had better be impossible for this to throw error, since the
+                * CatalogSnapshot pointer is already valid.
                 */
-               CatalogSnapshotStale = false;
+               pairingheap_add(&RegisteredSnapshots, &CatalogSnapshot->ph_node);
        }
  
        return CatalogSnapshot;
Simple merge
Simple merge
index df3561cc66af40ba2178a598c1c4ea6fa2ba33e4,cd2f4b66d00c0c8d76358b3822afb346f32af7e2..6399d92f64b9c778014123c8d2f2717ed21b3c15
@@@ -38,8 -38,7 +38,8 @@@
   *
   * This code is released under the terms of the PostgreSQL License.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/bin/initdb/initdb.c
@@@ -265,12 -245,9 +256,12 @@@ static void test_config_settings(void)
  static void setup_config(void);
  static void bootstrap_template1(void);
  static void setup_auth(FILE *cmdfd);
- static void get_set_pwd(FILE *cmdfd);
+ static void get_su_pwd(void);
  static void setup_depend(FILE *cmdfd);
  static void setup_sysviews(FILE *cmdfd);
 +#ifdef PGXC
 +static void setup_nodeself(FILE *cmdfd);
 +#endif
  static void setup_description(FILE *cmdfd);
  static void setup_collation(FILE *cmdfd);
  static void setup_conversion(FILE *cmdfd);
@@@ -2057,14 -1701,15 +1742,18 @@@ setup_privileges(FILE *cmdfd
                "  SET relacl = (SELECT array_agg(a.acl) FROM "
                " (SELECT E'=r/\"$POSTGRES_SUPERUSERNAME\"' as acl "
                "  UNION SELECT unnest(pg_catalog.acldefault("
-               "    CASE WHEN relkind = 'S' THEN 's' ELSE 'r' END::\"char\",10::oid))"
+               "    CASE WHEN relkind = " CppAsString2(RELKIND_SEQUENCE) " THEN 's' "
+               "         ELSE 'r' END::\"char\"," CppAsString2(BOOTSTRAP_SUPERUSERID) "::oid))"
                " ) as a) "
-               "  WHERE relkind IN ('r', 'v', 'm', 'S') AND relacl IS NULL;\n\n",
+               "  WHERE relkind IN (" CppAsString2(RELKIND_RELATION) ", "
+               CppAsString2(RELKIND_VIEW) ", " CppAsString2(RELKIND_MATVIEW) ", "
+               CppAsString2(RELKIND_SEQUENCE) ")"
+               "  AND relacl IS NULL;\n\n",
                "GRANT USAGE ON SCHEMA pg_catalog TO PUBLIC;\n\n",
                "GRANT CREATE, USAGE ON SCHEMA public TO PUBLIC;\n\n",
 +#ifdef XCP
 +        "GRANT USAGE ON SCHEMA storm_catalog TO PUBLIC;\n",
 +#endif
                "REVOKE ALL ON pg_largeobject FROM PUBLIC;\n\n",
                "INSERT INTO pg_init_privs "
                "  (objoid, classoid, objsubid, initprivs, privtype)"
@@@ -3437,14 -2959,13 +3058,16 @@@ main(int argc, char *argv[]
                {"version", no_argument, NULL, 'V'},
                {"debug", no_argument, NULL, 'd'},
                {"show", no_argument, NULL, 's'},
-               {"noclean", no_argument, NULL, 'n'},
-               {"nosync", no_argument, NULL, 'N'},
+               {"noclean", no_argument, NULL, 'n'},    /* for backwards compatibility */
+               {"no-clean", no_argument, NULL, 'n'},
+               {"nosync", no_argument, NULL, 'N'},             /* for backwards compatibility */
+               {"no-sync", no_argument, NULL, 'N'},
                {"sync-only", no_argument, NULL, 'S'},
-               {"xlogdir", required_argument, NULL, 'X'},
+               {"waldir", required_argument, NULL, 'X'},
                {"data-checksums", no_argument, NULL, 'k'},
 +#ifdef PGXC
 +              {"nodename", required_argument, NULL, 12},
 +#endif
                {NULL, 0, NULL, 0}
        };
  
        if (authwarning != NULL)
                fprintf(stderr, "%s", authwarning);
  
-       /* Get directory specification used to start this executable */
-       strlcpy(bin_dir, argv[0], sizeof(bin_dir));
-       get_parent_directory(bin_dir);
+       /*
+        * Build up a shell command to tell the user how to start the server
+        */
+       start_db_cmd = createPQExpBuffer();
+       /* Get directory specification used to start initdb ... */
+       strlcpy(pg_ctl_path, argv[0], sizeof(pg_ctl_path));
+       canonicalize_path(pg_ctl_path);
+       get_parent_directory(pg_ctl_path);
+       /* ... and tag on pg_ctl instead */
+       join_path_components(pg_ctl_path, pg_ctl_path, "pg_ctl");
+       /* path to pg_ctl, properly quoted */
+       appendShellString(start_db_cmd, pg_ctl_path);
+       /* add -D switch, with properly quoted data directory */
+       appendPQExpBufferStr(start_db_cmd, " -D ");
+       appendShellString(start_db_cmd, pgdata_native);
+       /* add suggested -l switch and "start" command */
+       /* translator: This is a placeholder in a shell command. */
+       appendPQExpBuffer(start_db_cmd, " -l %s start", _("logfile"));
  
-                       printf(_("You can now start the database server of the Postgres-XL coordinator using:\n\n"
-                                               "    %s%s%spostgres%s --coordinator -D %s%s%s\n"
 +
 +#ifdef PGXC
 +      printf(_("\nSuccess.\n"));
 +      {
 +              char *pgxc_ctl_silent = getenv("PGXC_CTL_SILENT");
 +              if (!pgxc_ctl_silent || !strlen(pgxc_ctl_silent))
 +              {
-                                               "    %s%s%spg_ctl%s start -D %s%s%s -Z coordinator -l logfile\n\n"
++                      printf(_("\nSuccess. You can now start the database server of the Postgres-XL coordinator using:\n\n"
++                                              "    %s -Z coordinator\n\n"
 +                                              "or\n"
-                                               "    %s%s%spostgres%s --datanode -D %s%s%s\n"
-                                               "or \n"
-                                               "    %s%s%spg_ctl%s start -D %s%s%s -Z datanode -l logfile\n\n"),
-                                       QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
-                                       QUOTE_PATH, pgdata_native, QUOTE_PATH,
-                                       QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
-                                       QUOTE_PATH, pgdata_native, QUOTE_PATH,
-                                       QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
-                                       QUOTE_PATH, pgdata_native, QUOTE_PATH,
-                                       QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
-                                       QUOTE_PATH, pgdata_native, QUOTE_PATH);
 +                                              " You can now start the database server of the Postgres-XL datanode using:\n\n"
++                                              "    %s -Z datanode\n\n"),
++                                      start_db_cmd->data,
++                                      start_db_cmd->data);
 +              }
 +      }
 +#else
        printf(_("\nSuccess. You can now start the database server using:\n\n"
-                        "    %s%s%spg_ctl%s -D %s%s%s -l logfile start\n\n"),
-          QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
-                  QUOTE_PATH, pgdata_native, QUOTE_PATH);
+                        "    %s\n\n"),
+                  start_db_cmd->data);
 +#endif
  
+       destroyPQExpBuffer(start_db_cmd);
        return 0;
  }
index b15994b246c1fd9fbe9d6dd01f2afd1d30b839ec,8387a0b08056872a619dd3229c36a10e66c4699a..8043d326b39324aae3214e3f4368b647d0d961f1
@@@ -1922,24 -1932,19 +1946,19 @@@ do_help(void
  {
        printf(_("%s is a utility to initialize, start, stop, or control a PostgreSQL server.\n\n"), progname);
        printf(_("Usage:\n"));
-       printf(_("  %s init[db]               [-D DATADIR] [-s] [-o \"OPTIONS\"]\n"), progname);
- #ifdef PGXC
-       printf(_("  %s start   [-w] [-t SECS] [-Z NODE-TYPE] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname);
-       printf(_("  %s restart [-w] [-t SECS] [-Z NODE-TYPE] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"
-                        "                 [-o \"OPTIONS\"]\n"), progname);
- #else
-       printf(_("  %s start   [-w] [-t SECS] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname);
-       printf(_("  %s restart [-w] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"
-                        "                 [-o \"OPTIONS\"]\n"), progname);
- #endif
-       printf(_("  %s stop    [-W] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"), progname);
-       printf(_("  %s reload  [-D DATADIR] [-s]\n"), progname);
-       printf(_("  %s status  [-D DATADIR]\n"), progname);
-       printf(_("  %s promote [-D DATADIR] [-s]\n"), progname);
-       printf(_("  %s kill    SIGNALNAME PID\n"), progname);
+       printf(_("  %s init[db] [-D DATADIR] [-s] [-o OPTIONS]\n"), progname);
 -      printf(_("  %s start    [-D DATADIR] [-l FILENAME] [-W] [-t SECS] [-s]\n"
++      printf(_("  %s start    [-D DATADIR] [-Z NODE-TYPE] [-l FILENAME] [-W] [-t SECS] [-s]\n"
+                        "                  [-o OPTIONS] [-p PATH] [-c]\n"), progname);
+       printf(_("  %s stop     [-D DATADIR] [-m SHUTDOWN-MODE] [-W] [-t SECS] [-s]\n"), progname);
 -      printf(_("  %s restart  [-D DATADIR] [-m SHUTDOWN-MODE] [-W] [-t SECS] [-s]\n"
++      printf(_("  %s restart  [-D DATADIR] [-Z NODE-TYPE] [-m SHUTDOWN-MODE] [-W] [-t SECS] [-s]\n"
+                        "                  [-o OPTIONS] [-c]\n"), progname);
+       printf(_("  %s reload   [-D DATADIR] [-s]\n"), progname);
+       printf(_("  %s status   [-D DATADIR]\n"), progname);
+       printf(_("  %s promote  [-D DATADIR] [-W] [-t SECS] [-s]\n"), progname);
+       printf(_("  %s kill     SIGNALNAME PID\n"), progname);
  #ifdef WIN32
-       printf(_("  %s register   [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n"
-                        "                    [-S START-TYPE] [-w] [-t SECS] [-o \"OPTIONS\"]\n"), progname);
+       printf(_("  %s register [-D DATADIR] [-N SERVICENAME] [-U USERNAME] [-P PASSWORD]\n"
+                        "                  [-S START-TYPE] [-e SOURCE] [-W] [-t SECS] [-s] [-o OPTIONS]\n"), progname);
        printf(_("  %s unregister [-N SERVICENAME]\n"), progname);
  #endif
  
        printf(_("  -s, --silent           only print errors, no informational messages\n"));
        printf(_("  -t, --timeout=SECS     seconds to wait when using -w option\n"));
        printf(_("  -V, --version          output version information, then exit\n"));
-       printf(_("  -w                     wait until operation completes\n"));
-       printf(_("  -W                     do not wait until operation completes\n"));
- #ifdef PGXC
 +      printf(_("  -Z NODE-TYPE           can be \"coordinator\" or \"datanode\" (Postgres-XL)\n"));
- #endif
+       printf(_("  -w, --wait             wait until operation completes (default)\n"));
+       printf(_("  -W, --no-wait          do not wait until operation completes\n"));
        printf(_("  -?, --help             show this help, then exit\n"));
-       printf(_("(The default is to wait for shutdown, but not for start or restart.)\n\n"));
        printf(_("If the -D option is omitted, the environment variable PGDATA is used.\n"));
  
        printf(_("\nOptions for start or restart:\n"));
@@@ -2223,11 -2242,8 +2261,8 @@@ main(int argc, char **argv
        /* process command-line options */
        while (optind < argc)
        {
- #ifdef PGXC
-               while ((c = getopt_long(argc, argv, "cD:e:l:m:N:o:p:P:sS:t:U:wWZ:", long_options, &option_index)) != -1)
- #else
-               while ((c = getopt_long(argc, argv, "cD:e:l:m:N:o:p:P:sS:t:U:wW", long_options, &option_index)) != -1)
- #endif
 -              while ((c = getopt_long(argc, argv, "cD:e:l:m:N:o:p:P:sS:t:U:wW",
++              while ((c = getopt_long(argc, argv, "cD:e:l:m:N:o:p:P:sS:t:U:wWZ:",
+                                                               long_options, &option_index)) != -1)
                {
                        switch (c)
                        {
Simple merge
Simple merge
index a527bfca5d94cb5c2a668e4caa5aae30e1f52307,9941111cda06444b7f6207d0357b417d8be893b8..a95a2f5fb35f13af7d875f79cc4d49305a2811e2
@@@ -4,8 -4,7 +4,8 @@@
   *      pg_dump is a utility for dumping out a postgres database
   *      into a script file.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *    pg_dump will read the system catalogs in a database and dump out a
@@@ -354,10 -355,9 +360,9 @@@ main(int argc, char **argv
                {"no-security-labels", no_argument, &dopt.no_security_labels, 1},
                {"no-synchronized-snapshots", no_argument, &dopt.no_synchronized_snapshots, 1},
                {"no-unlogged-table-data", no_argument, &dopt.no_unlogged_table_data, 1},
- #ifdef PGXC
+               {"no-subscriptions", no_argument, &dopt.no_subscriptions, 1},
+               {"no-sync", no_argument, NULL, 7},
 -
 +              {"include-nodes", no_argument, &include_nodes, 1},
- #endif
                {NULL, 0, NULL, 0}
        };
  
@@@ -1078,22 -1101,14 +1113,18 @@@ setup_connection(Archive *AH, const cha
                else
                        ExecuteSqlStatement(AH,
                                                                "SET TRANSACTION ISOLATION LEVEL "
 -                                                              "REPEATABLE READ, READ ONLY");
 +                                                              "REPEATABLE READ"
 +#ifndef XCP
 +                                                              ", READ ONLY"
 +#endif
 +                                                              );
        }
-       else if (AH->remoteVersion >= 70400)
+       else
        {
-               /* note: comma was not accepted in SET TRANSACTION before 8.0 */
                ExecuteSqlStatement(AH,
                                                        "SET TRANSACTION ISOLATION LEVEL "
-                                                       "SERIALIZABLE READ ONLY");
+                                                       "SERIALIZABLE, READ ONLY");
        }
-       else
-               ExecuteSqlStatement(AH,
-                                                       "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE");
  
        /*
         * If user specified a snapshot to use, select that.  In a parallel dump
@@@ -1383,12 -1391,13 +1407,16 @@@ selectDumpableNamespace(NamespaceInfo *
                 * initdb time, see pg_init_privs).
                 */
                nsinfo->dobj.dump_contains = nsinfo->dobj.dump = DUMP_COMPONENT_ACL;
+       }
        else if (strncmp(nsinfo->dobj.name, "pg_", 3) == 0 ||
 +#ifdef XCP
 +                       strncmp(nsinfo->dobj.name, "storm_", 6) == 0 ||
 +#endif
                         strcmp(nsinfo->dobj.name, "information_schema") == 0)
+       {
+               /* Other system schemas don't get dumped */
                nsinfo->dobj.dump_contains = nsinfo->dobj.dump = DUMP_COMPONENT_NONE;
+       }
        else
                nsinfo->dobj.dump_contains = nsinfo->dobj.dump = DUMP_COMPONENT_ALL;
  
@@@ -5329,11 -5658,7 +5685,12 @@@ getTables(Archive *fout, int *numTables
                                                  initacl_subquery->data,
                                                  initracl_subquery->data,
                                                  username_subquery,
 +                                                fout->isPostgresXL
 +                                                        ?  "(SELECT pclocatortype from pgxc_class v where v.pcrelid = c.oid) AS pgxclocatortype,"
 +                                                        "(SELECT pcattnum from pgxc_class v where v.pcrelid = c.oid) AS pgxcattnum,"
 +                                                        "(SELECT string_agg(node_name,',') AS pgxc_node_names from pgxc_node n where n.oid in (select unnest(nodeoids) from pgxc_class v where v.pcrelid=c.oid) ) , "
 +                                                        : "",
+                                                 RELKIND_SEQUENCE,
                                                  attacl_subquery->data,
                                                  attracl_subquery->data,
                                                  attinitacl_subquery->data,
                                                  "d.refobjid AS owning_tab, "
                                                  "d.refobjsubid AS owning_col, "
                                                  "(SELECT spcname FROM pg_tablespace t WHERE t.oid = c.reltablespace) AS reltablespace, "
 +#ifdef PGXC
 +                                                "%s"
 +#endif
                                                  "c.reloptions AS reloptions, "
                                                  "tc.reloptions AS toast_reloptions, "
-                                                 "NULL AS changed_acl "
+                                                 "NULL AS changed_acl, "
+                                                 "NULL AS partkeydef, "
+                                                 "false AS ispartition, "
+                                                 "NULL AS partbound "
                                                  "FROM pg_class c "
                                                  "LEFT JOIN pg_depend d ON "
                                                  "(c.relkind = '%c' AND "
index a6bc86ff311e9f2de0a4e8e4414b0c065d578e91,4afffc0690daf22b09d5a10fcdc75fb979922070..75aa065e5d295398c8845b829d04a3a9b765338b
@@@ -286,14 -292,10 +292,16 @@@ typedef struct _tableInf
        int                     relpages;               /* table's size in pages (from pg_class) */
  
        bool            interesting;    /* true if need to collect more data */
+       bool            dummy_view;             /* view's real definition must be postponed */
        bool            postponed_def;  /* matview must be postponed into post-data */
+       bool            ispartition;    /* is table a partition? */
  
 +#ifdef PGXC
 +      /* PGXC table locator Data */
 +      char            pgxclocatortype;        /* Type of PGXC table locator */
 +      int                     pgxcattnum;             /* Number of the attribute the table is partitioned with */
 +      char            *pgxc_node_names;       /* List of node names where this table is distributed */
 +#endif
        /*
         * These fields are computed only if we decide the table is interesting
         * (it's either a table to dump, or a direct parent of a dumpable table).
index 60c7ba5e8ed4468486752772d3f88af68baabf67,68003c35331b40a655d71b6f2ef5e59400e76c6e..9534134e61856b58360fb1178d554b4c2776be4d
@@@ -134,12 -130,13 +139,16 @@@ main(int argc, char *argv[]
                {"quote-all-identifiers", no_argument, &quote_all_identifiers, 1},
                {"role", required_argument, NULL, 3},
                {"use-set-session-authorization", no_argument, &use_setsessauth, 1},
+               {"no-publications", no_argument, &no_publications, 1},
                {"no-security-labels", no_argument, &no_security_labels, 1},
+               {"no-subscriptions", no_argument, &no_subscriptions, 1},
+               {"no-sync", no_argument, NULL, 4},
                {"no-unlogged-table-data", no_argument, &no_unlogged_table_data, 1},
 -
+               {"no-role-passwords", no_argument, &no_role_passwords, 1},
 +#ifdef PGXC
 +              {"dump-nodes", no_argument, &dump_nodes, 1},
 +              {"include-nodes", no_argument, &include_nodes, 1},
 +#endif
                {NULL, 0, NULL, 0}
        };
  
Simple merge
index 3272d999eff89a34d305662a26528bae30f7097c,852d8ca4b1c6cbdeee4601b84fda27dfed396796..08f3aff908f5f738b3537aff89c101d74d39a8c9
  #include "storage/standbydefs.h"
  #include "utils/relmapper.h"
  
- #define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \
 +#ifdef XCP
 +#include "pgxc/barrier.h"
 +#endif
 +
+ #define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
        { name, desc, identify},
  
  const RmgrDescData RmgrDescTable[RM_MAX_ID + 1] = {
Simple merge
index 9300a56dad2c8648acbb5ab364bd6406b660cd77,b3263a9570afc56dc96fc4885e19d8083b626496..c376d96989aebfbc02ce59f9b49c2029e55a4ec8
@@@ -2037,17 -3247,16 +3247,20 @@@ connection_warnings(bool in_startup
                }
                /* For version match, only print psql banner on startup. */
                else if (in_startup)
 +#ifdef PGXC
 +                      printf("%s (PGXL %s, based on PG %s)\n", pset.progname, PGXC_VERSION, PG_VERSION);
 +#else
                        printf("%s (%s)\n", pset.progname, PG_VERSION);
 +#endif
  
                if (pset.sversion / 100 > client_ver / 100)
-                       printf(_("WARNING: %s major version %d.%d, server major version %d.%d.\n"
+                       printf(_("WARNING: %s major version %s, server major version %s.\n"
                                         "         Some psql features might not work.\n"),
-                                pset.progname, client_ver / 10000, (client_ver / 100) % 100,
-                                  pset.sversion / 10000, (pset.sversion / 100) % 100);
+                                  pset.progname,
+                                  formatPGVersionNumber(client_ver, false,
+                                                                                cverbuf, sizeof(cverbuf)),
+                                  formatPGVersionNumber(pset.sversion, false,
+                                                                                sverbuf, sizeof(sverbuf)));
  
  #ifdef WIN32
                checkWin32Codepage();
Simple merge
Simple merge
index faa3bffc9461dd47ffd475a4a2e8ba8664526330,2abd08758df51b93210d76e11e290130df7fc42c..04e6a21bb3cc0b28bb3fd488b145760f3de7d2dc
@@@ -889,12 -997,12 +1012,13 @@@ typedef struc
  
  #define THING_NO_CREATE               (1 << 0)        /* should not show up after CREATE */
  #define THING_NO_DROP         (1 << 1)        /* should not show up after DROP */
- #define THING_NO_SHOW         (THING_NO_CREATE | THING_NO_DROP)
+ #define THING_NO_ALTER                (1 << 2)        /* should not show up after ALTER */
+ #define THING_NO_SHOW         (THING_NO_CREATE | THING_NO_DROP | THING_NO_ALTER)
  
  static const pgsql_thing_t words_after_create[] = {
-       {"ACCESS METHOD", NULL, NULL},
+       {"ACCESS METHOD", NULL, NULL, THING_NO_ALTER},
        {"AGGREGATE", NULL, &Query_for_list_of_aggregates},
 +      {"BARRIER", NULL, NULL},        /* Comes barrier name next, so skip it */
        {"CAST", NULL, NULL},           /* Casts have complex structures for names, so
                                                                 * skip it */
        {"COLLATION", "SELECT pg_catalog.quote_ident(collname) FROM pg_catalog.pg_collation WHERE collencoding IN (-1, pg_catalog.pg_char_to_encoding(pg_catalog.getdatabaseencoding())) AND substring(pg_catalog.quote_ident(collname),1,%d)='%s'"},
        {"DOMAIN", NULL, &Query_for_list_of_domains},
        {"EVENT TRIGGER", NULL, NULL},
        {"EXTENSION", Query_for_list_of_extensions},
 -      {"FOREIGN DATA WRAPPER", NULL, NULL},
 -      {"FOREIGN TABLE", NULL, NULL},
        {"FUNCTION", NULL, &Query_for_list_of_functions},
        {"GROUP", Query_for_list_of_roles},
-       {"LANGUAGE", Query_for_list_of_languages},
        {"INDEX", NULL, &Query_for_list_of_indexes},
+       {"LANGUAGE", Query_for_list_of_languages},
+       {"LARGE OBJECT", NULL, NULL, THING_NO_CREATE | THING_NO_DROP},
 +      {"NODE", Query_for_list_of_available_nodenames},
 +      {"NODE GROUP", Query_for_list_of_available_nodegroup_names},
        {"MATERIALIZED VIEW", NULL, &Query_for_list_of_matviews},
        {"OPERATOR", NULL, NULL},       /* Querying for this is probably not such a
                                                                 * good idea. */
        {"RULE", "SELECT pg_catalog.quote_ident(rulename) FROM pg_catalog.pg_rules WHERE substring(pg_catalog.quote_ident(rulename),1,%d)='%s'"},
        {"SCHEMA", Query_for_list_of_schemas},
        {"SEQUENCE", NULL, &Query_for_list_of_sequences},
+       {"SERVER", Query_for_list_of_servers},
+       {"STATISTICS", NULL, &Query_for_list_of_statistics},
+       {"SUBSCRIPTION", Query_for_list_of_subscriptions},
+       {"SYSTEM", NULL, NULL, THING_NO_CREATE | THING_NO_DROP},
        {"TABLE", NULL, &Query_for_list_of_tables},
        {"TABLESPACE", Query_for_list_of_tablespaces},
-       {"TEMP", NULL, NULL, THING_NO_DROP},            /* for CREATE TEMP TABLE ... */
+       {"TEMP", NULL, NULL, THING_NO_DROP | THING_NO_ALTER},           /* for CREATE TEMP TABLE
+                                                                                                                                * ... */
        {"TEMPLATE", Query_for_list_of_ts_templates, NULL, THING_NO_SHOW},
+       {"TEMPORARY", NULL, NULL, THING_NO_DROP | THING_NO_ALTER},      /* for CREATE TEMPORARY
+                                                                                                                                * TABLE ... */
        {"TEXT SEARCH", NULL, NULL},
+       {"TRANSFORM", NULL, NULL},
+       {"TRIGGER", "SELECT pg_catalog.quote_ident(tgname) FROM pg_catalog.pg_trigger WHERE substring(pg_catalog.quote_ident(tgname),1,%d)='%s' AND NOT tgisinternal"},
        {"TYPE", NULL, &Query_for_list_of_datatypes},
-       {"UNIQUE", NULL, NULL, THING_NO_DROP},          /* for CREATE UNIQUE INDEX ... */
-       {"UNLOGGED", NULL, NULL, THING_NO_DROP},        /* for CREATE UNLOGGED TABLE
-                                                                                                * ... */
+       {"UNIQUE", NULL, NULL, THING_NO_DROP | THING_NO_ALTER},         /* for CREATE UNIQUE
+                                                                                                                                * INDEX ... */
+       {"UNLOGGED", NULL, NULL, THING_NO_DROP | THING_NO_ALTER},       /* for CREATE UNLOGGED
+                                                                                                                                * TABLE ... */
        {"USER", Query_for_list_of_roles},
 -      {"USER MAPPING FOR", NULL, NULL},
        {"VIEW", NULL, &Query_for_list_of_views},
        {NULL}                                          /* end of list */
  };
@@@ -1392,18 -1517,54 +1532,66 @@@ psql_completion(const char *text, int s
                else
                        COMPLETE_WITH_FUNCTION_ARG(prev2_wd);
        }
 +
 +	/* ALTER NODE <name> */
 +	else if (Matches2("ALTER", "NODE"))
 +		COMPLETE_WITH_QUERY(Query_for_list_of_available_nodenames);
 +	else if (Matches3("ALTER", "NODE", MatchAny))
 +		COMPLETE_WITH_CONST("WITH");
 +	else if (Matches4("ALTER", "NODE", MatchAny, "WITH"))
 +		COMPLETE_WITH_CONST("(");
 +	else if (Matches5("ALTER", "NODE", MatchAny, "WITH", "("))
 +		COMPLETE_WITH_LIST5("TYPE", "HOST", "PORT", "PRIMARY", "PREFERRED");
 +
+       /* ALTER PUBLICATION <name> */
+       else if (Matches3("ALTER", "PUBLICATION", MatchAny))
+       {
+               COMPLETE_WITH_LIST5("ADD TABLE", "DROP TABLE", "OWNER TO", "RENAME TO", "SET");
+       }
+       /* ALTER PUBLICATION <name> SET */
+       else if (Matches4("ALTER", "PUBLICATION", MatchAny, "SET"))
+       {
+               COMPLETE_WITH_LIST2("(", "TABLE");
+       }
+       /* ALTER PUBLICATION <name> SET ( */
+       else if (HeadMatches3("ALTER", "PUBLICATION", MatchAny) && TailMatches2("SET", "("))
+       {
+               COMPLETE_WITH_CONST("publish");
+       }
+       /* ALTER SUBSCRIPTION <name> */
+       else if (Matches3("ALTER", "SUBSCRIPTION", MatchAny))
+       {
+               COMPLETE_WITH_LIST7("CONNECTION", "ENABLE", "DISABLE", "OWNER TO",
+                                                       "RENAME TO", "REFRESH PUBLICATION", "SET");
+       }
+       /* ALTER SUBSCRIPTION <name> REFRESH PUBLICATION */
+       else if (HeadMatches3("ALTER", "SUBSCRIPTION", MatchAny) &&
+                        TailMatches2("REFRESH", "PUBLICATION"))
+       {
+               COMPLETE_WITH_CONST("WITH (");
+       }
+       /* ALTER SUBSCRIPTION <name> REFRESH PUBLICATION WITH ( */
+       else if (HeadMatches3("ALTER", "SUBSCRIPTION", MatchAny) &&
+                        TailMatches4("REFRESH", "PUBLICATION", "WITH", "("))
+       {
+               COMPLETE_WITH_CONST("copy_data");
+       }
+       /* ALTER SUBSCRIPTION <name> SET */
+       else if (Matches4("ALTER", "SUBSCRIPTION", MatchAny, "SET"))
+       {
+               COMPLETE_WITH_LIST2("(", "PUBLICATION");
+       }
+       /* ALTER SUBSCRIPTION <name> SET ( */
+       else if (HeadMatches3("ALTER", "SUBSCRIPTION", MatchAny) && TailMatches2("SET", "("))
+       {
+               COMPLETE_WITH_LIST2("slot_name", "synchronous_commit");
+       }
+       /* ALTER SUBSCRIPTION <name> SET PUBLICATION */
+       else if (HeadMatches3("ALTER", "SUBSCRIPTION", MatchAny) && TailMatches2("SET", "PUBLICATION"))
+       {
+               /* complete with nothing here as this refers to remote publications */
+       }
        /* ALTER SCHEMA <name> */
        else if (Matches3("ALTER", "SCHEMA", MatchAny))
                COMPLETE_WITH_LIST2("OWNER TO", "RENAME TO");
        else if (Matches3("DROP", "OWNED", "BY"))
                COMPLETE_WITH_QUERY(Query_for_list_of_roles);
  
 +      /* DROP NODE */
 +      else if (Matches2("DROP", "NODE"))
 +		COMPLETE_WITH_QUERY(Query_for_list_of_available_nodenames);	/* Should test this code if completion is not confused with DROP NODE GROUP */
 +
 +      /* DROP NODE GROUP */
 +      else if (Matches3("DROP", "NODE", "GROUP"))
 +              COMPLETE_WITH_QUERY(Query_for_list_of_available_nodegroup_names);
 +
 +      /* EXECUTE DIRECT */
 +      else if (Matches2("EXECUTE", "DIRECT"))
 +              COMPLETE_WITH_CONST("ON");
 +      else if (Matches3("EXECUTE", "DIRECT", "ON"))
 +              COMPLETE_WITH_QUERY(Query_for_list_of_available_nodenames);
 +
+       /* DROP TEXT SEARCH */
        else if (Matches3("DROP", "TEXT", "SEARCH"))
                COMPLETE_WITH_LIST4("CONFIGURATION", "DICTIONARY", "PARSER", "TEMPLATE");
  
Simple merge
Simple merge
Simple merge
index ce6a70687e35e702f53177b9a6ec6cca0fe793bd,3a210a876b0cffa73a78760090c078fd29697566..c608b03bb070e4bf06115fa3bdbcb6a2deddb20e
@@@ -364,15 -408,19 +408,24 @@@ extern bool _hash_convert_tuple(Relatio
                                        Datum *index_values, bool *index_isnull);
  extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
  extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
+ extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
+ extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
+ extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
+                                                                  uint32 lowmask, uint32 maxbucket);
+ extern void _hash_kill_items(IndexScanDesc scan);
  
  /* hash.c */
- extern void hash_redo(XLogReaderState *record);
- extern void hash_desc(StringInfo buf, XLogReaderState *record);
- extern const char *hash_identify(uint8 info);
+ extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
+                                 Buffer bucket_buf, BlockNumber bucket_blkno,
+                                 BufferAccessStrategy bstrategy,
+                                 uint32 maxbucket, uint32 highmask, uint32 lowmask,
+                                 double *tuples_removed, double *num_index_tuples,
+                                 bool bucket_has_garbage,
+                                 IndexBulkDeleteCallback callback, void *callback_state);
  
 +#ifdef PGXC
 +extern Datum compute_hash(Oid type, Datum value, char locator);
 +extern char *get_compute_hash_function(Oid type, char locator);
 +#endif
 +
  #endif   /* HASH_H */
index 01d5a6f92619f67289456560b0df27318ec9b88c,870adf4f77bd6df55ca09ad50e38dcb9af5d11dc..1d31b5f1c2d887f90d8b23f01e46c01ae41070b6
@@@ -4,8 -4,7 +4,8 @@@
   *      POSTGRES heap tuple definitions.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/access/htup.h
index 77920395c15941002ff5594dc64223d67af63d4e,2f43c199d37189a967df337108951e2cc0b73fd4..0988cb410387fcb9eebad3af4dcca9a4d5453a95
   */
  
  /* symbol name, textual name, redo, desc, identify, startup, cleanup */
- PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL)
- PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL)
- PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL)
- PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL)
- PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL)
- PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL)
- PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL)
- PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL)
- PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL)
- PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL)
- PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL)
- PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL)
- PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL)
- PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup)
- PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup)
- PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL)
- PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup)
- PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL)
- PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL)
- PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL)
+ PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
+ PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
+ PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask)
+ PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask)
+ PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
+ PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)
+ PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask)
+ PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask)
+ PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask)
+ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL)
 +#ifdef PGXC
- PG_RMGR(RM_BARRIER_ID, "Barrier", barrier_redo, barrier_desc, NULL, NULL, NULL) 
++PG_RMGR(RM_BARRIER_ID, "Barrier", barrier_redo, barrier_desc, barrier_identify, NULL, NULL, NULL) 
 +#endif
- PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL)
- PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL)
+ PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask)
+ PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL)
Simple merge
index 395953a6f13b5247b90c9e28b082ee03ca814f33,d25a2dd2073f406f1ba471d564fbe5d14b2e31bd..e357d5dea874c4315aa14c3b08886931b664850c
@@@ -4,10 -4,8 +4,10 @@@
   *      postgres transaction access method support code
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/access/transam.h
   *
Simple merge
index 063c8c2af3a7f4c59d882420903b59e3d4f77c96,7eb85b72df2fd2e3caad17c4a756e4b1e95117e0..2186e706a63c3b762cb6a3ad0af962b9232c27de
@@@ -4,10 -4,8 +4,10 @@@
   *      postgres transaction system definitions
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/access/xact.h
   *
index 0f5a0a3720023262373461819e7e1440c83d8929,e00ab12d2ee64e3a91aad425d4f423eddf41e1e8..4a633a7fad96a0bc39a5b12aa7d701ec8d5d1bd6
@@@ -83,9 -83,7 +83,10 @@@ typedef enu
        RECOVERY_TARGET_XID,
        RECOVERY_TARGET_TIME,
        RECOVERY_TARGET_NAME,
 +#ifdef PGXC
 +      RECOVERY_TARGET_BARRIER,
 +#endif
+       RECOVERY_TARGET_LSN,
        RECOVERY_TARGET_IMMEDIATE
  } RecoveryTargetType;
  
index 0daf681a22a72b5b6146f2ca6cbc612b7cbb2fe6,cb123e4d6469851dce072fb16c949bc324a52cd9..51a0ba925fbefd8ac1f706217ff02ecfd5a655bd
@@@ -4,9 -4,8 +4,9 @@@
   *      include file for the bootstrapping code
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/bootstrap/bootstrap.h
   *
diff --cc src/include/c.h
Simple merge
index b90ffa1c83fa0c2cc1ba834b23e5575d364f9703,d0a199afde7d1e2de269fde0f81560fca4227f48..7062d7ed2fbabd14c376f0cb07a2b5f0cbcd44b9
@@@ -4,8 -4,7 +4,8 @@@
   *      prototypes for functions in backend/catalog/catalog.c
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/catalog/catalog.h
index 2aff4968815b8580279836f7ccca72337066f91f,8586b9d7a1f0a1122b607a13cf49875e7d2e3c39..c4d0c694f466c8964005a462bffbea389ec293e8
@@@ -4,9 -4,8 +4,9 @@@
   *      Routines to support inter-object dependencies.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/catalog/dependency.h
   *
@@@ -184,15 -186,6 +192,11 @@@ extern void performDeletion(const Objec
  extern void performMultipleDeletions(const ObjectAddresses *objects,
                                                 DropBehavior behavior, int flags);
  
- extern void deleteWhatDependsOn(const ObjectAddress *object,
-                                       bool showNotices);
 +#ifdef PGXC
 +extern void performRename(const ObjectAddress *object,
 +                                                const char *oldname,
 +                                                const char *newname);
 +#endif
  extern void recordDependencyOnExpr(const ObjectAddress *depender,
                                           Node *expr, List *rtable,
                                           DependencyType behavior);
index a5f053fc525ee78983b65df3d502744e821db5c4,aa494528364367c72c36b97c8344d69d54050671..12ad62532be642d26e979d70926218548bcfd111
@@@ -4,9 -4,8 +4,9 @@@
   *      prototypes for functions in backend/catalog/heap.c
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/catalog/heap.h
   *
@@@ -135,24 -134,16 +135,35 @@@ extern void CheckAttributeType(const ch
                                   List *containing_rowtypes,
                                   bool allow_system_table_mods);
  
 +#ifdef PGXC
 +/* Functions related to distribution data of relations */
 +extern void AddRelationDistribution(Oid relid,
 +                              DistributeBy *distributeby,
 +                              PGXCSubCluster *subcluster,
 +                              List             *parentOids,
 +                              TupleDesc        descriptor);
 +extern void GetRelationDistributionItems(Oid relid,
 +                                                                               DistributeBy *distributeby,
 +                                                                               TupleDesc descriptor,
 +                                                                               char *locatortype,
 +                                                                               int *hashalgorithm,
 +                                                                               int *hashbuckets,
 +                                                                               AttrNumber *attnum);
 +extern Oid *GetRelationDistributionNodes(PGXCSubCluster *subcluster,
 +                                                                               int *numnodes);
 +extern Oid *BuildRelationDistributionNodes(List *nodes, int *numnodes);
 +extern Oid *SortRelationDistributionNodes(Oid *nodeoids, int numnodes);
 +#endif
+ /* pg_partitioned_table catalog manipulation functions */
+ extern void StorePartitionKey(Relation rel,
+                                 char strategy,
+                                 int16 partnatts,
+                                 AttrNumber *partattrs,
+                                 List *partexprs,
+                                 Oid *partopclass,
+                                 Oid *partcollation);
+ extern void RemovePartitionKeyByRelId(Oid relid);
+ extern void StorePartitionBound(Relation rel, Relation parent,
+                                       PartitionBoundSpec *bound);
  
  #endif   /* HEAP_H */
index f2d8be7856253ab2093d6df9846a4246306d128f,07300f8a2bd9e50a881f65754766b3e21923cf99..35f50b69a5af20d707c0158ad9505abecde1a908
@@@ -5,9 -5,8 +5,9 @@@
   *      on system catalogs
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/catalog/indexing.h
   *
index 3c31dafa9d402b2f4ea8487adaea20c99811415d,5294a52984989e683aaddcfff09b8ebe742ee775..14df88290aec40e4fe3a91d834815a85781a9b8b
@@@ -4,8 -4,7 +4,8 @@@
   *      prototypes for functions in backend/catalog/namespace.c
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/catalog/namespace.h
Simple merge
index 308af498122ac5f97fcae0b0cbb426ccff374f8c,cb42abf5f8d2550867e1fc43f6448b5b1db061af..010190e35c235d1fda8b5a8caed4f4321e2ba7fc
@@@ -5,8 -5,7 +5,8 @@@
   *      along with the relation's initial contents.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/catalog/pg_namespace.h
index 7fccccf2a4fe7f4eba9e45ca3640f44b9437f2b5,460cdb9ed816f8e49e68f0578d80d6bd13d4cbf1..e06ed6cc775ab9fb9ebcdaa5a879a8feddcd65cf
@@@ -4,8 -4,7 +4,8 @@@
   *      definition of the system "procedure" relation (pg_proc)
   *      along with the relation's initial contents.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/catalog/pg_proc.h
@@@ -5319,20 -5433,9 +5449,23 @@@ DESCR("get an individual replication or
  DATA(insert OID = 6014 ( pg_show_replication_origin_status PGNSP PGUID 12 1 100 0 0 f f f f f t v r 0 0 2249 "" "{26,25,3220,3220}" "{o,o,o,o}" "{local_id, external_id, remote_lsn, local_lsn}" _null_ _null_ pg_show_replication_origin_status _null_ _null_ _null_ ));
  DESCR("get progress for all replication origins");
  
 +#ifdef USE_MODULE_MSGIDS
 +DATA(insert OID = 6015 ( pg_msgmodule_set PGNSP PGUID 12 1 1 0 0 f f f f t t i s 4 0 16 "20 20 20 25" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_set _null_ _null_ _null_ ));
 +DESCR("set debugging level for module/file/msg");
 +DATA(insert OID = 6016 ( pg_msgmodule_change PGNSP PGUID 12 1 1 0 0 f f f f t t i s 4 0 16 "20 20 20 20" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_change _null_ _null_ _null_ ));
 +DESCR("change debugging level for module/file/msg");
 +DATA(insert OID = 6017 ( pg_msgmodule_enable PGNSP PGUID 12 1 1 0 0 f f f f t t i s 1 0 16 "20" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_enable _null_ _null_ _null_ ));
 +DESCR("pid to honour overridden log levels");
 +DATA(insert OID = 6018 ( pg_msgmodule_disable PGNSP PGUID 12 1 1 0 0 f f f f t t i s 1 0 16 "20" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_disable _null_ _null_ _null_ ));
 +DESCR("pid to ignore overridden log levels");
 +DATA(insert OID = 6019 ( pg_msgmodule_enable_all PGNSP PGUID 12 1 1 0 0 f f f f t t i s 1 0 16 "16" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_enable_all _null_ _null_ _null_ ));
 +DESCR("all current/future processes to honour overridden log levels");
 +DATA(insert OID = 6020 ( pg_msgmodule_disable_all PGNSP PGUID 12 1 1 0 0 f f f f t t i s 0 0 16 "" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_disable_all _null_ _null_ _null_ ));
 +DESCR("all processes to ignore overridden log levels");
 +#endif
+ /* publications */
+ DATA(insert OID = 6119 ( pg_get_publication_tables    PGNSP PGUID 12 1 1000 0 0 f f t f t t s s 1 0 26 "25" "{25,26}" "{i,o}" "{pubname,relid}" _null_ _null_ pg_get_publication_tables _null_ _null_ _null_ ));
+ DESCR("get OIDs of tables in a publication");
  
  /* rls */
  DATA(insert OID = 3298 (  row_security_active    PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 16 "26" _null_ _null_ _null_ _null_ _null_        row_security_active _null_ _null_ _null_ ));
index 439a22605e8dc3247f90c8360f06d1d086fce480,345e9164060b728fb25e5a1c6f98d007ca1bca6f..8dfbc8a15fbaa9d727bfce4a57ca71c5f855df57
@@@ -5,8 -5,7 +5,8 @@@
   *      along with the relation's initial contents.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/catalog/pg_type.h
Simple merge
Simple merge
index 072ce7858093fb963dcb365fdac758cce491fc8a,c60e6f30b88ddfed2976b5763e7f2e8a497fcb15..147f22b8701d9640760d6ec72edcb1c306809d8c
@@@ -36,18 -33,10 +36,19 @@@ typedef struc
        TimestampTz prepare_time;       /* the time when the stmt was prepared */
  } PreparedStatement;
  
 +#ifdef PGXC
 +typedef struct
 +{
 +      /* dynahash.c requires key to be first field */
 +      char            stmt_name[NAMEDATALEN];
 +      int             number_of_nodes;        /* number of nodes where statement is active */
 +      int             dns_node_indices[0];            /* node ids where statement is active */
 +} DatanodeStatement;
 +#endif
  
  /* Utility statements PREPARE, EXECUTE, DEALLOCATE, EXPLAIN EXECUTE */
- extern void PrepareQuery(PrepareStmt *stmt, const char *queryString);
+ extern void PrepareQuery(PrepareStmt *stmt, const char *queryString,
+                        int stmt_location, int stmt_len);
  extern void ExecuteQuery(ExecuteStmt *stmt, IntoClause *intoClause,
                         const char *queryString, ParamListInfo params,
                         DestReceiver *dest, char *completionTag);
index b87a10dd405de9fe89e83be24b47382e7462fc55,f07a389c7f6981f8a24dabe016f60db63c8e05dd..7079bfbf5a4e9ff6b4858cc492aff5e7443081ac
  #include "catalog/objectaddress.h"
  #include "nodes/parsenodes.h"
  
- #ifdef PGXC
  extern Oid CreateSchemaCommand(CreateSchemaStmt *parsetree,
-                                       const char *queryString, bool is_top_level);
- #else
- extern Oid CreateSchemaCommand(CreateSchemaStmt *parsetree,
-                                       const char *queryString);
- #endif
+                                       const char *queryString,
++                                      bool is_top_level,
+                                       int stmt_location, int stmt_len);
  extern void RemoveSchemaById(Oid schemaOid);
  
  extern ObjectAddress RenameSchema(const char *oldname, const char *newname);
index 0f82def1f13ad9819c85ef04c19be979bafb9b7b,304586e48e813eee4c936fa51c8fa498f311dd87..0e9533cc2de7f0e7159dffdaf0f9418fbad7f9fd
@@@ -3,8 -3,7 +3,8 @@@
   * sequence.h
   *      prototypes for sequence.c.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/commands/sequence.h
  #include "fmgr.h"
  #include "lib/stringinfo.h"
  #include "nodes/parsenodes.h"
+ #include "parser/parse_node.h"
  #include "storage/relfilenode.h"
  
 +#ifdef PGXC
 +#include "utils/relcache.h"
 +#include "gtm/gtm_c.h"
 +#include "access/xact.h"
 +#endif
  
- typedef struct FormData_pg_sequence
+ typedef struct FormData_pg_sequence_data
  {
-       NameData        sequence_name;
        int64           last_value;
-       int64           start_value;
-       int64           increment_by;
-       int64           max_value;
-       int64           min_value;
-       int64           cache_value;
        int64           log_cnt;
-       bool            is_cycled;
        bool            is_called;
- } FormData_pg_sequence;
+ } FormData_pg_sequence_data;
  
- typedef FormData_pg_sequence *Form_pg_sequence;
+ typedef FormData_pg_sequence_data *Form_pg_sequence_data;
  
  /*
   * Columns of a sequence relation
@@@ -87,26 -64,6 +70,27 @@@ extern void ResetSequenceCaches(void)
  extern void seq_redo(XLogReaderState *rptr);
  extern void seq_desc(StringInfo buf, XLogReaderState *rptr);
  extern const char *seq_identify(uint8 info);
+ extern void seq_mask(char *pagedata, BlockNumber blkno);
  
 +#ifdef XCP
 +#define DEFAULT_CACHEVAL      1
 +extern int SequenceRangeVal;
 +#endif
 +#ifdef PGXC
 +/*
 + * List of actions that registered the callback.
 + * This is listed here and not in sequence.c because callback can also
 + * be registered in dependency.c and tablecmds.c as sequences can be dropped
 + * or renamed in cascade.
 + */
 +typedef enum
 +{
 +      GTM_CREATE_SEQ,
 +      GTM_DROP_SEQ
 +} GTM_SequenceDropType;
 +
 +extern bool IsTempSequence(Oid relid);
 +extern char *GetGlobalSeqName(Relation rel, const char *new_seqname, const char *new_schemaname);
 +#endif
 +
  #endif   /* SEQUENCE_H */
Simple merge
Simple merge
index b87bf2ace97e000aec0b39a5744d26798eadb5f0,541c2fa3cf2f09fb3e2b31bd8c9719118e38ec6d..fd2dc860dd02c2b5fef426c9b43a4be439429910
@@@ -4,8 -4,7 +4,8 @@@
   *      header file for postgres vacuum cleaner and statistics analyzer
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/commands/vacuum.h
index 4997e1e166ce4a67e26225c6cbf84b0a11827e9d,247423c6fbe5e23b64306326dca9ea45ed62174c..e0fb3332dfb3fc374233344d380eefdd5b0ac29e
@@@ -2,8 -2,7 +2,8 @@@
   * variable.h
   *            Routines for handling specialized SET variables.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/commands/variable.h
Simple merge
index ca9edf539fd15170a80b890363b6e7f0e58017d2,37de6f2011332c80e7f30738da9cc3b14af02b14..62a2f2e477bbaff18a2ee491d35c61c0197f1058
@@@ -5,8 -5,7 +5,8 @@@
   *      and related modules.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/executor/execdesc.h
@@@ -51,13 -48,8 +51,15 @@@ typedef struct QueryDes
        EState     *estate;                     /* executor's query-wide state */
        PlanState  *planstate;          /* tree of per-plan-node state */
  
 +#ifdef XCP
 +	SharedQueue squeue;		/* the shared memory queue to send data to other
 +							 * nodes */
 +      int             myindex;                /* -1 if locally executed subplan is producing
 +                                                               * data and distribute via squeue. Otherwise
 +                                                               * get local data from squeue */
 +#endif
+       /* This field is set by ExecutorRun */
+       bool            already_executed;               /* true if previously executed */
  
        /* This is always set NULL by the core system, but plugins can change it */
        struct Instrumentation *totaltime;      /* total time spent in ExecutorRun */
index 7b5cf2f1f769613c3d4673742d45eacc084e147c,8cc5f3a413f89d7cd06e6a0afc8f9410d240dee3..fdf7c15b70c662368f6a60a6ce5b30e715a85c3a
@@@ -4,8 -4,7 +4,8 @@@
   *      support for the POSTGRES executor module
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/executor/executor.h
  #define EXEC_FLAG_WITH_OIDS           0x0020  /* force OIDs in returned tuples */
  #define EXEC_FLAG_WITHOUT_OIDS        0x0040  /* force no OIDs in returned tuples */
  #define EXEC_FLAG_WITH_NO_DATA        0x0080  /* rel scannability doesn't matter */
 +#ifdef XCP
 +/* distributed executor may never execute the plan on this node  */
 +#define EXEC_FLAG_SUBPLAN             0x0100
 +#endif
  
  
- /*
-  * ExecEvalExpr was formerly a function containing a switch statement;
-  * now it's just a macro invoking the function pointed to by an ExprState
-  * node.  Beware of double evaluation of the ExprState argument!
-  */
- #define ExecEvalExpr(expr, econtext, isNull, isDone) \
-       ((*(expr)->evalfunc) (expr, econtext, isNull, isDone))
  /* Hook for plugins to get control in ExecutorStart() */
  typedef void (*ExecutorStart_hook_type) (QueryDesc *queryDesc, int eflags);
  extern PGDLLIMPORT ExecutorStart_hook_type ExecutorStart_hook;
Simple merge
index bfcca219e0c88f57ac4c03380e5cb88ea9e631fc,32489ef9bde05519f3defbb334d7ff6b14e52819..efdb6fee5a3874b1024b52871678b4f4cc2bec78
@@@ -4,8 -4,7 +4,8 @@@
   *      tuple table support stuff
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/executor/tuptable.h
Simple merge
index bb0d7d1dac6634190733f1678fb08a1864102907,4c607b299c2d3a2c31a99e9a67d15a2adb4117ee..343cbd9692bcc7e17aa10bba42a3254e2e5c3e38
@@@ -10,8 -10,7 +10,8 @@@
   *      Over time, this has also become the preferred place for widely known
   *      resource-limitation stuff, such as work_mem and check_stack_depth().
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/miscadmin.h
Simple merge
index 411d969b3b1908f868c8e72491379a069442e4f7,d33392f3b55341d7d85f91ad2f3835add95f0af4..2bc126dabe02d1f47bf9c538244dd9229789da72
@@@ -4,8 -4,7 +4,8 @@@
   *      definitions for executor state nodes
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/nodes/execnodes.h
  #include "utils/reltrigger.h"
  #include "utils/sortsupport.h"
  #include "utils/tuplestore.h"
 +#include "pgxc/squeue.h"
  #include "utils/tuplesort.h"
+ #include "nodes/tidbitmap.h"
+ #include "storage/condition_variable.h"
+ /* ----------------
+  *            ExprState node
+  *
+  * ExprState is the top-level node for expression evaluation.
+  * It contains instructions (in ->steps) to evaluate the expression.
+  * ----------------
+  */
+ struct ExprState;                             /* forward references in this file */
+ struct ExprContext;
+ struct ExprEvalStep;                  /* avoid including execExpr.h everywhere */
+ typedef Datum (*ExprStateEvalFunc) (struct ExprState *expression,
+                                                                                               struct ExprContext *econtext,
+                                                                                               bool *isNull);
+ /* Bits in ExprState->flags (see also execExpr.h for private flag bits): */
+ /* expression is for use with ExecQual() */
+ #define EEO_FLAG_IS_QUAL                                      (1 << 0)
+ typedef struct ExprState
+ {
+       Node            tag;
+       uint8           flags;                  /* bitmask of EEO_FLAG_* bits, see above */
+       /*
+        * Storage for result value of a scalar expression, or for individual
+        * column results within expressions built by ExecBuildProjectionInfo().
+        */
+       bool            resnull;
+       Datum           resvalue;
+       /*
+        * If projecting a tuple result, this slot holds the result; else NULL.
+        */
+       TupleTableSlot *resultslot;
+       /*
+        * Instructions to compute expression's return value.
+        */
+       struct ExprEvalStep *steps;
+       /*
+        * Function that actually evaluates the expression.  This can be set to
+        * different values depending on the complexity of the expression.
+        */
+       ExprStateEvalFunc evalfunc;
+       /* original expression tree, for debugging only */
+       Expr       *expr;
+       /*
+        * XXX: following only needed during "compilation", could be thrown away.
+        */
+       int                     steps_len;              /* number of steps currently */
+       int                     steps_alloc;    /* allocated length of steps array */
+       Datum      *innermost_caseval;
+       bool       *innermost_casenull;
+       Datum      *innermost_domainval;
+       bool       *innermost_domainnull;
+ } ExprState;
  
  
  /* ----------------
@@@ -373,12 -421,17 +423,22 @@@ typedef struct EStat
        ResultRelInfo *es_result_relations; /* array of ResultRelInfos */
        int                     es_num_result_relations;                /* length of array */
        ResultRelInfo *es_result_relation_info;         /* currently active array elt */
 +#ifdef PGXC
 +#ifndef PGXC
 +      struct PlanState        *es_result_remoterel;                   /* currently active remote rel */
 +#endif
 +#endif
  
+       /*
+        * Info about the target partitioned target table root(s) for
+        * update/delete queries.  They required only to fire any per-statement
+        * triggers defined on the table.  It exists separately from
+        * es_result_relations, because partitioned tables don't appear in the
+        * plan tree for the update/delete cases.
+        */
+       ResultRelInfo *es_root_result_relations;        /* array of ResultRelInfos */
+       int                     es_num_root_result_relations;   /* length of the array */
        /* Stuff used for firing triggers: */
        List       *es_trig_target_relations;           /* trigger-only ResultRelInfos */
        TupleTableSlot *es_trig_tuple_slot; /* for trigger output tuples */
index 88d615d6fd8812cdc95b0ad4d251306b1868a9bf,15de93635573affce3919fdab596bcf464368d78..df93faed901106821e68cc6c7478b0fe9bbd0687
@@@ -4,10 -4,8 +4,10 @@@
   *      Definitions for tagged nodes.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/nodes/nodes.h
   *
@@@ -209,10 -190,7 +219,11 @@@ typedef enum NodeTa
        T_FromExpr,
        T_OnConflictExpr,
        T_IntoClause,
 +#ifdef PGXC
 +      T_DistributeBy,
 +      T_PGXCSubCluster,
 +#endif
+       T_NextValueExpr,
  
        /*
         * TAGS FOR EXPRESSION STATE NODES (execnodes.h)
        T_PlaceHolderInfo,
        T_MinMaxAggInfo,
        T_PlannerParamItem,
 +#ifdef XCP
 +      T_RemoteSubPath,
 +#endif
+       T_RollupData,
+       T_GroupingSetData,
+       T_StatisticExtInfo,
        /*
         * TAGS FOR MEMORY NODES (memnodes.h)
         */
@@@ -594,13 -587,9 +632,12 @@@ castNodeImpl(NodeTag type, void *ptr
  /*
   * nodes/{outfuncs.c,print.c}
   */
- extern char *nodeToString(const void *obj);
 +#ifdef XCP
 +extern void set_portable_output(bool value);
 +#endif
  struct Bitmapset;                             /* not to include bitmapset.h here */
  struct StringInfoData;                        /* not to include stringinfo.h here */
  extern void outNode(struct StringInfoData *str, const void *obj);
  extern void outToken(struct StringInfoData *str, const char *s);
  extern void outBitmapset(struct StringInfoData *str,
index 79b310647b709a7cf17ee823a83407793eed2251,e19ac24582803f7300ee2236722a11434a7b9a0a..d9a48191f0dc9a2ed1a23472cc10756b9de1c9b0
@@@ -4,8 -4,7 +4,8 @@@
   *      Support for finding the values associated with Param nodes.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/nodes/params.h
index 8c78e3eb2b86c409b88d0efff796b6ac9d1c8377,8720e713c42cd83abd762398be7884496b5e1e98..8d4e58ca89cf9158d49473d6f50541a5d14a8285
@@@ -7,13 -7,13 +7,15 @@@
   * This is a byte (not character) offset in the original source text, to be
   * used for positioning an error cursor when there is an error related to
   * the node.  Access to the original source text is needed to make use of
-  * the location.
+  * the location.  At the topmost (statement) level, we also provide a
+  * statement length, likewise measured in bytes, for convenience in
+  * identifying statement boundaries in multi-statement source strings.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/nodes/parsenodes.h
   *
  #include "nodes/lockoptions.h"
  #include "nodes/primnodes.h"
  #include "nodes/value.h"
 +#ifdef PGXC
 +#include "access/tupdesc.h"
 +#include "pgxc/locator.h"
 +#endif
  
+ typedef enum OverridingKind
+ {
+       OVERRIDING_NOT_SET = 0,
+       OVERRIDING_USER_VALUE,
+       OVERRIDING_SYSTEM_VALUE
+ } OverridingKind;
  /* Possible sources of a Query */
  typedef enum QuerySource
  {
@@@ -787,11 -924,10 +930,13 @@@ typedef enum RTEKin
        RTE_SUBQUERY,                           /* subquery in FROM */
        RTE_JOIN,                                       /* join */
        RTE_FUNCTION,                           /* function in FROM */
+       RTE_TABLEFUNC,                          /* TableFunc(.., column list) */
        RTE_VALUES,                                     /* VALUES (<exprlist>), (<exprlist>), ... */
-       RTE_CTE                                         /* common table expr (WITH list element) */
 +#ifdef PGXC
-       ,RTE_REMOTE_DUMMY                       /* RTEs created by remote plan reduction */
++      RTE_REMOTE_DUMMY,                       /* RTEs created by remote plan reduction */
 +#endif /* PGXC */
+       RTE_CTE,                                        /* common table expr (WITH list element) */
+       RTE_NAMEDTUPLESTORE                     /* tuplestore, e.g. for AFTER triggers */
  } RTEKind;
  
  typedef struct RangeTblEntry
@@@ -2233,9 -2442,7 +2467,10 @@@ typedef struct CreateSeqStm
        RangeVar   *sequence;           /* the sequence to create */
        List       *options;
        Oid                     ownerId;                /* ID of owner, or InvalidOid for default */
 +#ifdef PGXC
 +      bool            is_serial;              /* Indicates if this sequence is part of SERIAL process */
 +#endif
+       bool            for_identity;
        bool            if_not_exists;  /* just do nothing if it already exists? */
  } CreateSeqStmt;
  
@@@ -2244,10 -2451,8 +2479,11 @@@ typedef struct AlterSeqStm
        NodeTag         type;
        RangeVar   *sequence;           /* the sequence to alter */
        List       *options;
+       bool            for_identity;
        bool            missing_ok;             /* skip error if a role is missing? */
 +#ifdef PGXC
 +      bool            is_serial;              /* Indicates if this sequence is part of SERIAL process */
 +#endif
  } AlterSeqStmt;
  
  /* ----------------------
@@@ -3215,29 -3344,65 +3458,89 @@@ typedef struct AlterTSConfigurationStm
        bool            missing_ok;             /* for DROP - skip error if missing? */
  } AlterTSConfigurationStmt;
  
 +/* PGXC_BEGIN */
 +/*
 + * EXECUTE DIRECT statement
 + */
 +typedef struct ExecDirectStmt
 +{
 +      NodeTag         type;
 +      List            *node_names;
 +      char            *query;
 +} ExecDirectStmt;
 +
 +/*
 + * CLEAN CONNECTION statement
 + */
 +typedef struct CleanConnStmt
 +{
 +      NodeTag         type;
 +      List            *nodes;         /* list of nodes dropped */
 +      char            *dbname;        /* name of database to drop connections */
 +      char            *username;      /* name of user whose connections are dropped */
 +      bool            is_coord;       /* type of connections dropped */
 +      bool            is_force;       /* option force  */
 +} CleanConnStmt;
 +/* PGXC_END */
  
+ typedef struct CreatePublicationStmt
+ {
+       NodeTag         type;
+       char       *pubname;            /* Name of of the publication */
+       List       *options;            /* List of DefElem nodes */
+       List       *tables;                     /* Optional list of tables to add */
+       bool            for_all_tables; /* Special publication for all tables in db */
+ } CreatePublicationStmt;
+ typedef struct AlterPublicationStmt
+ {
+       NodeTag         type;
+       char       *pubname;            /* Name of of the publication */
+       /* parameters used for ALTER PUBLICATION ... WITH */
+       List       *options;            /* List of DefElem nodes */
+       /* parameters used for ALTER PUBLICATION ... ADD/DROP TABLE */
+       List       *tables;                     /* List of tables to add/drop */
+       bool            for_all_tables; /* Special publication for all tables in db */
+       DefElemAction tableAction;      /* What action to perform with the tables */
+ } AlterPublicationStmt;
+ typedef struct CreateSubscriptionStmt
+ {
+       NodeTag         type;
+       char       *subname;            /* Name of of the subscription */
+       char       *conninfo;           /* Connection string to publisher */
+       List       *publication;        /* One or more publication to subscribe to */
+       List       *options;            /* List of DefElem nodes */
+ } CreateSubscriptionStmt;
+ typedef enum AlterSubscriptionType
+ {
+       ALTER_SUBSCRIPTION_OPTIONS,
+       ALTER_SUBSCRIPTION_CONNECTION,
+       ALTER_SUBSCRIPTION_PUBLICATION,
+       ALTER_SUBSCRIPTION_PUBLICATION_REFRESH,
+       ALTER_SUBSCRIPTION_REFRESH,
+       ALTER_SUBSCRIPTION_ENABLED
+ } AlterSubscriptionType;
+ typedef struct AlterSubscriptionStmt
+ {
+       NodeTag         type;
+       AlterSubscriptionType kind; /* ALTER_SUBSCRIPTION_OPTIONS, etc */
+       char       *subname;            /* Name of of the subscription */
+       char       *conninfo;           /* Connection string to publisher */
+       List       *publication;        /* One or more publication to subscribe to */
+       List       *options;            /* List of DefElem nodes */
+ } AlterSubscriptionStmt;
+ typedef struct DropSubscriptionStmt
+ {
+       NodeTag         type;
+       char       *subname;            /* Name of of the subscription */
+       bool            missing_ok;             /* Skip error if missing? */
+       DropBehavior behavior;          /* RESTRICT or CASCADE behavior */
+ } DropSubscriptionStmt;
  #endif   /* PARSENODES_H */
Simple merge
index d811d09cca61cd32f7bbf0e8f6fddcc120b6612f,d84372da386911ff4c989dbed72530a394b2379e..a4b9f18aa5da368c7e9826641794280120430b55
@@@ -4,8 -4,7 +4,8 @@@
   *      definitions for query plan nodes
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/nodes/plannodes.h
@@@ -74,19 -90,12 +91,25 @@@ typedef struct PlannedStm
        List       *invalItems;         /* other dependencies, as PlanInvalItems */
  
        int                     nParamExec;             /* number of PARAM_EXEC Params used */
 +#ifdef XCP
 +      int                     nParamRemote;   /* number of params sent from the master mode */
 +
 +      struct RemoteParam *remoteparams;/* parameter descriptors */
 +
 +      const char *pname;                      /* the portal name */
 +
 +      /* Parameters to filter out result rows */
 +      char            distributionType;
 +      AttrNumber  distributionKey;
 +      List       *distributionNodes;
 +      List       *distributionRestrict;
 +#endif        
+       Node       *utilityStmt;        /* non-null if this is utility stmt */
+       /* statement location in source string (copied from Query) */
+       int                     stmt_location;  /* start location, or -1 if unknown */
+       int                     stmt_len;               /* length in bytes; 0 means "rest of string" */
  } PlannedStmt;
  
  /* macro for fetching the Plan associated with a SubPlan node */
index f22afd38033702917a0d01e268ec3d6e3081f8a0,86ec82eaaae8637918b3d1a1df84ed20978b8192..66dd6b50e4c27fd54f1c96909ec46101a3669c77
@@@ -7,10 -7,8 +7,10 @@@
   *      and join trees.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/nodes/primnodes.h
   *
index d78197e8a7ab3cd0f214443deb89148f7919c24e,902e9faf12a327d7bafecc61c50cbd42f1e36dfb..1e7e6942d54125a688383c2f3d3f7716812dd888
@@@ -4,8 -4,7 +4,8 @@@
   *      Definitions for planner's internal data structures.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/nodes/relation.h
index 0a70840dd3cfbf8ffb9d95a6da7def3217c18322,3cf681e91b1762d7e2795ae54c8a829e4d9ad9f6..2701500a4a2ffd0902eff401965180e0a605a0c5
@@@ -4,8 -4,7 +4,8 @@@
   *      prototypes for costsize.c and clausesel.c.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/optimizer/cost.h
@@@ -75,10 -66,7 +75,8 @@@ extern bool enable_nestloop
  extern bool enable_material;
  extern bool enable_mergejoin;
  extern bool enable_hashjoin;
- #ifdef PGXC
 +extern bool enable_fast_query_shipping;
- #endif
+ extern bool enable_gathermerge;
  extern int    constraint_exclusion;
  
  extern double clamp_row_est(double nrows);
@@@ -102,13 -90,16 +100,19 @@@ extern void cost_subqueryscan(SubqueryS
                                  RelOptInfo *baserel, ParamPathInfo *param_info);
  extern void cost_functionscan(Path *path, PlannerInfo *root,
                                  RelOptInfo *baserel, ParamPathInfo *param_info);
+ extern void cost_tableexprscan(Path *path, PlannerInfo *root,
+                                  RelOptInfo *baserel, ParamPathInfo *param_info);
  extern void cost_valuesscan(Path *path, PlannerInfo *root,
                                RelOptInfo *baserel, ParamPathInfo *param_info);
 +#ifdef PGXC
 +extern void cost_remotequery(Path *path, PlannerInfo *root, RelOptInfo *baserel);
 +#endif
+ extern void cost_tablefuncscan(Path *path, PlannerInfo *root,
+                                  RelOptInfo *baserel, ParamPathInfo *param_info);
  extern void cost_ctescan(Path *path, PlannerInfo *root,
                         RelOptInfo *baserel, ParamPathInfo *param_info);
+ extern void cost_namedtuplestorescan(Path *path, PlannerInfo *root,
+                                                RelOptInfo *baserel, ParamPathInfo *param_info);
  extern void cost_recursive_union(Path *runion, Path *nrterm, Path *rterm);
  extern void cost_sort(Path *path, PlannerInfo *root,
                  List *pathkeys, Cost input_cost, double tuples, int width,
index e8ffed5f49b454cd0314018aaaac1c8e61be6260,245825c38b95854a50056904bc55e47ddf7caaec..7937deebdadd8322bf03b5fbdac447b1c2ba9bc2
@@@ -4,8 -4,7 +4,8 @@@
   *      prototypes for pathnode.c, relnode.c.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/optimizer/pathnode.h
@@@ -77,15 -80,28 +81,28 @@@ extern UniquePath *create_unique_path(P
  extern GatherPath *create_gather_path(PlannerInfo *root,
                                   RelOptInfo *rel, Path *subpath, PathTarget *target,
                                   Relids required_outer, double *rows);
+ extern GatherMergePath *create_gather_merge_path(PlannerInfo *root,
+                                                RelOptInfo *rel,
+                                                Path *subpath,
+                                                PathTarget *target,
+                                                List *pathkeys,
+                                                Relids required_outer,
+                                                double *rows);
  extern SubqueryScanPath *create_subqueryscan_path(PlannerInfo *root,
 -                                               RelOptInfo *rel, Path *subpath,
 -                                               List *pathkeys, Relids required_outer);
 +                                               RelOptInfo *rel, Path *subpath, List *pathkeys,
 +                                               Relids required_outer, Distribution *distribution);
  extern Path *create_functionscan_path(PlannerInfo *root, RelOptInfo *rel,
                                                 List *pathkeys, Relids required_outer);
+ extern Path *create_tablexprscan_path(PlannerInfo *root, RelOptInfo *rel,
+                                                List *pathkeys, Relids required_outer);
  extern Path *create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel,
                                           Relids required_outer);
+ extern Path *create_tablefuncscan_path(PlannerInfo *root, RelOptInfo *rel,
+                                                 Relids required_outer);
  extern Path *create_ctescan_path(PlannerInfo *root, RelOptInfo *rel,
                                        Relids required_outer);
+ extern Path *create_namedtuplestorescan_path(PlannerInfo *root, RelOptInfo *rel,
+                                                               Relids required_outer);
  extern Path *create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel,
                                                  Relids required_outer);
  extern ForeignPath *create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel,
index 7f96df0beba5409de9e9a87fa7a44cf4257a7105,e773c0f7edacd267d7d348012381f418064d5a7a..4ef9ddb2ba9bc0204ecc045e5fb16ed394dc07d2
@@@ -4,8 -4,7 +4,8 @@@
   *      prototypes for various files in optimizer/plan
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/optimizer/planmain.h
Simple merge
index 6e3c47dafff4eb5c73b2e690ae68baa5e7c19e33,9b33ba5dfd175614d956ec7c300c951138bfc4d8..2c14b1e1af36381924ce7c39b057b248a2aad596
@@@ -4,8 -4,7 +4,8 @@@
   *            parse analysis for optimizable statements
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/parser/analyze.h
Simple merge
index ca265b4de243c377a4143b7671cb479e36d4b3a7,f50e45e886da8d3a120eecfb6dc3739b0a6bed1e..d10017583d66d5485ea3b0f2e2d99f0709b71e22
@@@ -7,9 -7,8 +7,9 @@@
   * by the PG_KEYWORD macro, which is not defined in this file; it can
   * be defined by the caller for special purposes.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * IDENTIFICATION
   *      src/include/parser/kwlist.h
@@@ -135,8 -129,8 +137,9 @@@ PG_KEYWORD("delimiter", DELIMITER, UNRE
  PG_KEYWORD("delimiters", DELIMITERS, UNRESERVED_KEYWORD)
  PG_KEYWORD("depends", DEPENDS, UNRESERVED_KEYWORD)
  PG_KEYWORD("desc", DESC, RESERVED_KEYWORD)
+ PG_KEYWORD("detach", DETACH, UNRESERVED_KEYWORD)
  PG_KEYWORD("dictionary", DICTIONARY, UNRESERVED_KEYWORD)
 +PG_KEYWORD("direct", DIRECT, UNRESERVED_KEYWORD)
  PG_KEYWORD("disable", DISABLE_P, UNRESERVED_KEYWORD)
  PG_KEYWORD("discard", DISCARD, UNRESERVED_KEYWORD)
  PG_KEYWORD("distinct", DISTINCT, RESERVED_KEYWORD)
@@@ -266,11 -255,9 +270,12 @@@ PG_KEYWORD("names", NAMES, UNRESERVED_K
  PG_KEYWORD("national", NATIONAL, COL_NAME_KEYWORD)
  PG_KEYWORD("natural", NATURAL, TYPE_FUNC_NAME_KEYWORD)
  PG_KEYWORD("nchar", NCHAR, COL_NAME_KEYWORD)
+ PG_KEYWORD("new", NEW, UNRESERVED_KEYWORD)
  PG_KEYWORD("next", NEXT, UNRESERVED_KEYWORD)
  PG_KEYWORD("no", NO, UNRESERVED_KEYWORD)
 +#ifdef PGXC
 +PG_KEYWORD("node", NODE, UNRESERVED_KEYWORD)
 +#endif
  PG_KEYWORD("none", NONE, COL_NAME_KEYWORD)
  PG_KEYWORD("not", NOT, RESERVED_KEYWORD)
  PG_KEYWORD("nothing", NOTHING, UNRESERVED_KEYWORD)
@@@ -328,10 -311,8 +335,11 @@@ PG_KEYWORD("privileges", PRIVILEGES, UN
  PG_KEYWORD("procedural", PROCEDURAL, UNRESERVED_KEYWORD)
  PG_KEYWORD("procedure", PROCEDURE, UNRESERVED_KEYWORD)
  PG_KEYWORD("program", PROGRAM, UNRESERVED_KEYWORD)
+ PG_KEYWORD("publication", PUBLICATION, UNRESERVED_KEYWORD)
  PG_KEYWORD("quote", QUOTE, UNRESERVED_KEYWORD)
 +#ifdef PGXC
 +PG_KEYWORD("randomly", RANDOMLY, UNRESERVED_KEYWORD)
 +#endif
  PG_KEYWORD("range", RANGE, UNRESERVED_KEYWORD)
  PG_KEYWORD("read", READ, UNRESERVED_KEYWORD)
  PG_KEYWORD("real", REAL, COL_NAME_KEYWORD)
index 2c81da6c58645defce46cb541a0d6eab8ad9ae96,8a54d59d6f9478689859689ffb67cec99b9c7234..0ec3bc2e240944339e76538875abab327b226a39
@@@ -3,8 -3,7 +3,8 @@@
   * parse_agg.h
   *      handle aggregates and window functions in parser
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/parser/parse_agg.h
index e8ff93058d0155c00eaf0e9e4314af0fbada015c,5be1812242a6b33a61e22ddc24f46cdbe0fd5a50..e0dad6ac10a9547c5a5a7530c309d803e2e27eb0
@@@ -62,10 -62,11 +62,12 @@@ extern const char *func_signature_strin
  
  extern Oid LookupFuncName(List *funcname, int nargs, const Oid *argtypes,
                           bool noError);
- extern Oid LookupFuncNameTypeNames(List *funcname, List *argtypes,
-                                               bool noError);
- extern Oid LookupAggNameTypeNames(List *aggname, List *argtypes,
-                                          bool noError);
+ extern Oid LookupFuncWithArgs(ObjectWithArgs *func,
+                                  bool noError);
+ extern Oid LookupAggWithArgs(ObjectWithArgs *agg,
+                                 bool noError);
+ extern void check_srf_call_placement(ParseState *pstate, int location);
  
 +extern void check_pg_get_expr_args(ParseState *pstate, Oid fnoid, List *args);
  #endif   /* PARSE_FUNC_H */
Simple merge
index f4497d6ceab4500f735649c7505b89daff3542ac,8d0d17f8577a972824141279dcf920d7f8b088de..c3cdf7158c7e273f1363f824d7f23f3a5f17ea46
@@@ -4,10 -4,8 +4,10 @@@
   *            parse analysis for utility commands
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/parser/parse_utilcmd.h
   *
@@@ -32,8 -25,7 +32,10 @@@ extern IndexStmt *transformIndexStmt(Oi
  extern void transformRuleStmt(RuleStmt *stmt, const char *queryString,
                                  List **actions, Node **whereClause);
  extern List *transformCreateSchemaStmt(CreateSchemaStmt *stmt);
 +#ifdef PGXC
 +extern bool CheckLocalIndexColumn (char loctype, char *partcolname, char *indexcolname);
 +#endif
+ extern PartitionBoundSpec *transformPartitionBound(ParseState *pstate, Relation parent,
+                                               PartitionBoundSpec *spec);
  
  #endif   /* PARSE_UTILCMD_H */
Simple merge
Simple merge
Simple merge
index bb8cb3a5fb8aaa3cdb67a306ead12facb4a94e86,2d87a21ea8f4cbd7099928f369bad1660ace8d63..f920e356648a166784e4c83d0682156a07c618e7
  #define MEMSET_LOOP_LIMIT 1024
  
  /* Define to the address where bug reports for this package should be sent. */
 -#define PACKAGE_BUGREPORT "p[email protected]"
 +#define PACKAGE_BUGREPORT "p[email protected]"
  
  /* Define to the full name of this package. */
 -#define PACKAGE_NAME "PostgreSQL"
 +#define PACKAGE_NAME "Postgres-XL"
  
  /* Define to the full name and version of this package. */
- #define PACKAGE_STRING "Postgres-XL 9.6alpha1"
 -#define PACKAGE_STRING "PostgreSQL 10beta1"
++#define PACKAGE_STRING "Postgres-XL 10alpha1"
  
  /* Define to the version of this package. */
- #define PACKAGE_VERSION "9.6alpha1"
+ #define PACKAGE_VERSION "10beta1"
  
  /* Define to the name of a signed 128-bit integer type. */
  #undef PG_INT128_TYPE
  #define PG_INT64_TYPE long long int
  
  /* PostgreSQL version as a string */
- #define PG_VERSION "9.6beta4"
+ #define PG_VERSION "10beta1"
  
  /* PostgreSQL version as a number */
- #define PG_VERSION_NUM 90600
+ #define PG_VERSION_NUM 100000
  
  /* Define to the one symbol short name of this package. */
 -#define PACKAGE_TARNAME "postgresql"
 +#define PACKAGE_TARNAME "postgres-xl"
 +
 +/* Postgres-XC version as a string */
 +#define PGXC_VERSION "1.1devel"
 +
 +/* Postgres-XC version as a number */
 +#define PGXC_VERSION_NUM 10100
  
  /* Define to the name of the default PostgreSQL service principal in Kerberos.
     (--with-krb-srvnam=NAME) */
index bace5c6bd101a2abdee41caa8932f9726aec0ac4,5e029c0f4ef490a0a8d693de25d39571d4700474..5f58effe6c35e2a643ab3fbcce02168167b33a75
@@@ -3,8 -3,7 +3,8 @@@
   *
   *    Definitions for the PostgreSQL statistics collector daemon.
   *
-  *    Copyright (c) 2001-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  *    Copyright (c) 2001-2017, PostgreSQL Global Development Group
   *
   *    src/include/pgstat.h
   * ----------
@@@ -716,15 -734,176 +735,177 @@@ typedef enum BackendStat
   * Wait Classes
   * ----------
   */
- typedef enum WaitClass
+ #define PG_WAIT_LWLOCK                                0x01000000U
+ #define PG_WAIT_LOCK                          0x03000000U
+ #define PG_WAIT_BUFFER_PIN                    0x04000000U
+ #define PG_WAIT_ACTIVITY                      0x05000000U
+ #define PG_WAIT_CLIENT                                0x06000000U
+ #define PG_WAIT_EXTENSION                     0x07000000U
+ #define PG_WAIT_IPC                                   0x08000000U
+ #define PG_WAIT_TIMEOUT                               0x09000000U
+ #define PG_WAIT_IO                                    0x0A000000U
+ /* ----------
+  * Wait Events - Activity
+  *
+  * Use this category when a process is waiting because it has no work to do,
+  * unless the "Client" or "Timeout" category describes the situation better.
+  * Typically, this should only be used for background processes.
+  * ----------
+  */
+ typedef enum
  {
-       WAIT_UNDEFINED,
-       WAIT_LWLOCK_NAMED,
-       WAIT_LWLOCK_TRANCHE,
-       WAIT_LOCK,
-       WAIT_BUFFER_PIN
- }     WaitClass;
+       WAIT_EVENT_ARCHIVER_MAIN = PG_WAIT_ACTIVITY,
+       WAIT_EVENT_AUTOVACUUM_MAIN,
+       WAIT_EVENT_BGWRITER_HIBERNATE,
+       WAIT_EVENT_BGWRITER_MAIN,
+       WAIT_EVENT_CHECKPOINTER_MAIN,
+       WAIT_EVENT_PGSTAT_MAIN,
+       WAIT_EVENT_RECOVERY_WAL_ALL,
+       WAIT_EVENT_RECOVERY_WAL_STREAM,
+       WAIT_EVENT_SYSLOGGER_MAIN,
+       WAIT_EVENT_WAL_RECEIVER_MAIN,
+       WAIT_EVENT_WAL_SENDER_MAIN,
+       WAIT_EVENT_WAL_WRITER_MAIN,
+       WAIT_EVENT_LOGICAL_LAUNCHER_MAIN,
 -      WAIT_EVENT_LOGICAL_APPLY_MAIN
++      WAIT_EVENT_LOGICAL_APPLY_MAIN,
++      WAIT_EVENT_CLUSTER_MONITOR_MAIN
+ } WaitEventActivity;
  
+ /* ----------
+  * Wait Events - Client
+  *
+  * Use this category when a process is waiting to send data to or receive data
+  * from the frontend process to which it is connected.  This is never used for
+  * a background process, which has no client connection.
+  * ----------
+  */
+ typedef enum
+ {
+       WAIT_EVENT_CLIENT_READ = PG_WAIT_CLIENT,
+       WAIT_EVENT_CLIENT_WRITE,
+       WAIT_EVENT_SSL_OPEN_SERVER,
+       WAIT_EVENT_WAL_RECEIVER_WAIT_START,
+       WAIT_EVENT_LIBPQWALRECEIVER,
+       WAIT_EVENT_WAL_SENDER_WAIT_WAL,
+       WAIT_EVENT_WAL_SENDER_WRITE_DATA
+ } WaitEventClient;
+ /* ----------
+  * Wait Events - IPC
+  *
+  * Use this category when a process cannot complete the work it is doing because
+  * it is waiting for a notification from another process.
+  * ----------
+  */
+ typedef enum
+ {
+       WAIT_EVENT_BGWORKER_SHUTDOWN = PG_WAIT_IPC,
+       WAIT_EVENT_BGWORKER_STARTUP,
+       WAIT_EVENT_BTREE_PAGE,
+       WAIT_EVENT_EXECUTE_GATHER,
+       WAIT_EVENT_MQ_INTERNAL,
+       WAIT_EVENT_MQ_PUT_MESSAGE,
+       WAIT_EVENT_MQ_RECEIVE,
+       WAIT_EVENT_MQ_SEND,
+       WAIT_EVENT_PARALLEL_FINISH,
+       WAIT_EVENT_PARALLEL_BITMAP_SCAN,
+       WAIT_EVENT_PROCARRAY_GROUP_UPDATE,
+       WAIT_EVENT_SAFE_SNAPSHOT,
+       WAIT_EVENT_SYNC_REP,
+       WAIT_EVENT_LOGICAL_SYNC_DATA,
+       WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE
+ } WaitEventIPC;
+ /* ----------
+  * Wait Events - Timeout
+  *
+  * Use this category when a process is waiting for a timeout to expire.
+  * ----------
+  */
+ typedef enum
+ {
+       WAIT_EVENT_BASE_BACKUP_THROTTLE = PG_WAIT_TIMEOUT,
+       WAIT_EVENT_PG_SLEEP,
+       WAIT_EVENT_RECOVERY_APPLY_DELAY
+ } WaitEventTimeout;
+ /* ----------
+  * Wait Events - IO
+  *
+  * Use this category when a process is waiting for a IO.
+  * ----------
+  */
+ typedef enum
+ {
+       WAIT_EVENT_BUFFILE_READ = PG_WAIT_IO,
+       WAIT_EVENT_BUFFILE_WRITE,
+       WAIT_EVENT_CONTROL_FILE_READ,
+       WAIT_EVENT_CONTROL_FILE_SYNC,
+       WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE,
+       WAIT_EVENT_CONTROL_FILE_WRITE,
+       WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE,
+       WAIT_EVENT_COPY_FILE_READ,
+       WAIT_EVENT_COPY_FILE_WRITE,
+       WAIT_EVENT_DATA_FILE_EXTEND,
+       WAIT_EVENT_DATA_FILE_FLUSH,
+       WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC,
+       WAIT_EVENT_DATA_FILE_PREFETCH,
+       WAIT_EVENT_DATA_FILE_READ,
+       WAIT_EVENT_DATA_FILE_SYNC,
+       WAIT_EVENT_DATA_FILE_TRUNCATE,
+       WAIT_EVENT_DATA_FILE_WRITE,
+       WAIT_EVENT_DSM_FILL_ZERO_WRITE,
+       WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ,
+       WAIT_EVENT_LOCK_FILE_ADDTODATADIR_SYNC,
+       WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE,
+       WAIT_EVENT_LOCK_FILE_CREATE_READ,
+       WAIT_EVENT_LOCK_FILE_CREATE_SYNC,
+       WAIT_EVENT_LOCK_FILE_CREATE_WRITE,
+       WAIT_EVENT_LOCK_FILE_RECHECKDATADIR_READ,
+       WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC,
+       WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC,
+       WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE,
+       WAIT_EVENT_LOGICAL_REWRITE_SYNC,
+       WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE,
+       WAIT_EVENT_LOGICAL_REWRITE_WRITE,
+       WAIT_EVENT_RELATION_MAP_READ,
+       WAIT_EVENT_RELATION_MAP_SYNC,
+       WAIT_EVENT_RELATION_MAP_WRITE,
+       WAIT_EVENT_REORDER_BUFFER_READ,
+       WAIT_EVENT_REORDER_BUFFER_WRITE,
+       WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ,
+       WAIT_EVENT_REPLICATION_SLOT_READ,
+       WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC,
+       WAIT_EVENT_REPLICATION_SLOT_SYNC,
+       WAIT_EVENT_REPLICATION_SLOT_WRITE,
+       WAIT_EVENT_SLRU_FLUSH_SYNC,
+       WAIT_EVENT_SLRU_READ,
+       WAIT_EVENT_SLRU_SYNC,
+       WAIT_EVENT_SLRU_WRITE,
+       WAIT_EVENT_SNAPBUILD_READ,
+       WAIT_EVENT_SNAPBUILD_SYNC,
+       WAIT_EVENT_SNAPBUILD_WRITE,
+       WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC,
+       WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE,
+       WAIT_EVENT_TIMELINE_HISTORY_READ,
+       WAIT_EVENT_TIMELINE_HISTORY_SYNC,
+       WAIT_EVENT_TIMELINE_HISTORY_WRITE,
+       WAIT_EVENT_TWOPHASE_FILE_READ,
+       WAIT_EVENT_TWOPHASE_FILE_SYNC,
+       WAIT_EVENT_TWOPHASE_FILE_WRITE,
+       WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ,
+       WAIT_EVENT_WAL_BOOTSTRAP_SYNC,
+       WAIT_EVENT_WAL_BOOTSTRAP_WRITE,
+       WAIT_EVENT_WAL_COPY_READ,
+       WAIT_EVENT_WAL_COPY_SYNC,
+       WAIT_EVENT_WAL_COPY_WRITE,
+       WAIT_EVENT_WAL_INIT_SYNC,
+       WAIT_EVENT_WAL_INIT_WRITE,
+       WAIT_EVENT_WAL_READ,
+       WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN,
+       WAIT_EVENT_WAL_WRITE
+ } WaitEventIO;
  
  /* ----------
   * Command type for progress reporting purposes
index 7ef1b7b3d6391d6df115750358e500a5907ddd9c,52910ed203fc19f1369b5dd31eceaf48def5d10d..7d8e8a68d431a2800187328af51f65904ab7b208
@@@ -459,13 -466,14 +466,18 @@@ extern int      pg_check_dir(const char *dir
  /* port/pgmkdirp.c */
  extern int    pg_mkdir_p(char *path, int omode);
  
 +#ifndef PGSIGFUNC
 +#define PGSIGFUNC
  /* port/pqsignal.c */
  typedef void (*pqsigfunc) (int signo);
 +#endif
 +
  extern pqsigfunc pqsignal(int signo, pqsigfunc func);
+ #ifndef WIN32
+ extern pqsigfunc pqsignal_no_restart(int signo, pqsigfunc func);
+ #else
+ #define pqsignal_no_restart(signo, func) pqsignal(signo, func)
+ #endif
  
  /* port/quotes.c */
  extern char *escape_single_quotes_ascii(const char *src);
index 3b93f7b3bbcbc97339fb8bb00e88e844979c6fa7,f3582d5523aef4ef817356b08abd140ac956be1e..87df7844f4aba511b577e7f62fbde1459efb002a
@@@ -7,9 -7,8 +7,9 @@@
   * Client-side code should include postgres_fe.h instead.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1995, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/postgres.h
   *
index 5b1ee8fd218687571b02ebd01bbeb1d0fd35b493,d383fd3926eeb54a9b36e475f7fffb13b2a4d2c0..e61cc93e556432ad23533f9ce1c06fbb1801fb84
@@@ -4,9 -4,8 +4,9 @@@
   *      header file for integrated autovacuum daemon
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/postmaster/autovacuum.h
   *
  #ifndef AUTOVACUUM_H
  #define AUTOVACUUM_H
  
+ #include "storage/block.h"
+ /*
+  * Other processes can request specific work from autovacuum, identified by
+  * AutoVacuumWorkItem elements.
+  */
+ typedef enum
+ {
+       AVW_BRINSummarizeRange
+ } AutoVacuumWorkItemType;
  
 +
 +#ifdef PGXC  /* PGXC_DATANODE */
 +#define IsAutoVacuumAnalyzeWorker() (IsAutoVacuumWorkerProcess() && !(MyProc->vacuumFlags & PROC_IN_VACUUM))
 +#endif
 +
  /* GUC variables */
  extern bool autovacuum_start_daemon;
  extern int    autovacuum_max_workers;
Simple merge
index 4ec99d8a18c1d295a88110f949620ffb44d17569,9d1fc500820fcc6f1460f8488177a5098ad046f5..3445caeb9365d34f131e912bdba3686ffc44d2f5
@@@ -4,8 -4,7 +4,8 @@@
   *      POSTGRES backend id communication definitions
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/storage/backendid.h
Simple merge
index a8dfbf2ee6b10d5d7eb64a364bd7eb93ac29935d,0cd45bb6d8e959f81a8c8f80d71eb1542b40024e..c22daef179df85d33548836c9aafaa0dde93fcb6
@@@ -4,8 -4,7 +4,8 @@@
   *      Lightweight lock manager
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/storage/lwlock.h
@@@ -235,7 -211,8 +212,9 @@@ typedef enum BuiltinTrancheId
        LWTRANCHE_BUFFER_MAPPING,
        LWTRANCHE_LOCK_MANAGER,
        LWTRANCHE_PREDICATE_LOCK_MANAGER,
 +      LWTRANCHE_SHARED_QUEUES,
+       LWTRANCHE_PARALLEL_QUERY_DSA,
+       LWTRANCHE_TBM,
        LWTRANCHE_FIRST_USER_DEFINED
  }     BuiltinTrancheIds;
  
index bc336fbaff7d1f29ad4f7946c4f755a9bee1e6ad,2fbde36dad2d3491ce919d9c24631ac59a9509f7..0be7165d4f664ea7ec2f2d280f6e6c0e61498717
@@@ -4,8 -4,7 +4,8 @@@
   *      per-process shared memory data structures
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/storage/proc.h
@@@ -102,13 -112,9 +113,15 @@@ struct PGPRO
        BackendId       backendId;              /* This backend's backend ID (if assigned) */
        Oid                     databaseId;             /* OID of database this backend is using */
        Oid                     roleId;                 /* OID of role using this backend */
 +#ifdef XCP
 +      Oid                     coordId;                /* Oid of originating coordinator */
 +      int                     coordPid;               /* Pid of the originating session */
 +      BackendId       firstBackendId; /* Backend ID of the first backend of
 +                                                               * the distributed session */
 +#endif
  
+       bool            isBackgroundWorker;             /* true if background worker. */
        /*
         * While in hot standby mode, shows that a conflict signal has been sent
         * for the current transaction. Set/cleared while holding ProcArrayLock,
@@@ -262,16 -269,9 +281,15 @@@ extern PGPROC *PreparedXactProcs
   * Background writer, checkpointer and WAL writer run during normal operation.
   * Startup process and WAL receiver also consume 2 slots, but WAL writer is
   * launched only after startup has exited, so we only need 4 slots.
 + *
 + * PGXC needs another slot for the pool manager process
   */
 +#ifdef PGXC
 +#define NUM_AUXILIARY_PROCS           5
 +#else
  #define NUM_AUXILIARY_PROCS           4
 +#endif
  
  /* configurable options */
  extern int    DeadlockTimeout;
  extern int    StatementTimeout;
index ea12e5c795a2fd8e997541da18e0bbe37cb7e84c,22955a79dd448a3abf83c09617af48a38c6ff31d..bc46229b4265235746e03d2588c42ba6c27b10bc
@@@ -4,10 -4,8 +4,10 @@@
   *      POSTGRES process array definitions.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/storage/procarray.h
   *
  #include "utils/relcache.h"
  #include "utils/snapshot.h"
  
 +#ifdef XCP
 +extern int GlobalSnapshotSource;
 +
 +typedef enum GlobalSnapshotSourceType
 +{
 +      GLOBAL_SNAPSHOT_SOURCE_GTM,
 +      GLOBAL_SNAPSHOT_SOURCE_COORDINATOR
 +} GlobalSnapshotSourceType;
 +#endif
  
+ /*
+  * These are to implement PROCARRAY_FLAGS_XXX
+  *
+  * Note: These flags are cloned from PROC_XXX flags in src/include/storage/proc.h
+  * to avoid forcing to include proc.h when including procarray.h. So if you modify
+  * PROC_XXX flags, you need to modify these flags.
+  */
+ #define               PROCARRAY_VACUUM_FLAG                   0x02            /* currently running
+                                                                                                                * lazy vacuum */
+ #define               PROCARRAY_ANALYZE_FLAG                  0x04            /* currently running
+                                                                                                                * analyze */
+ #define               PROCARRAY_LOGICAL_DECODING_FLAG 0x10            /* currently doing
+                                                                                                                * logical decoding
+                                                                                                                * outside xact */
+ #define               PROCARRAY_SLOTS_XMIN                    0x20            /* replication slot
+                                                                                                                * xmin, catalog_xmin */
+ /*
+  * Only flags in PROCARRAY_PROC_FLAGS_MASK are considered when matching
+  * PGXACT->vacuumFlags. Other flags are used for different purposes and
+  * have no corresponding PROC flag equivalent.
+  */
+ #define               PROCARRAY_PROC_FLAGS_MASK       (PROCARRAY_VACUUM_FLAG | \
+                                                                                PROCARRAY_ANALYZE_FLAG | \
+                                                                                PROCARRAY_LOGICAL_DECODING_FLAG)
+ /* Use the following flags as an input "flags" to GetOldestXmin function */
+ /* Consider all backends except for logical decoding ones which manage xmin separately */
+ #define               PROCARRAY_FLAGS_DEFAULT                 PROCARRAY_LOGICAL_DECODING_FLAG
+ /* Ignore vacuum backends */
+ #define               PROCARRAY_FLAGS_VACUUM                  PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG
+ /* Ignore analyze backends */
+ #define               PROCARRAY_FLAGS_ANALYZE                 PROCARRAY_FLAGS_DEFAULT | PROCARRAY_ANALYZE_FLAG
+ /* Ignore both vacuum and analyze backends */
+ #define               PROCARRAY_FLAGS_VACUUM_ANALYZE  PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG | PROCARRAY_ANALYZE_FLAG
  extern Size ProcArrayShmemSize(void);
  extern void CreateSharedProcArray(void);
  extern void ProcArrayAdd(PGPROC *proc);
@@@ -81,11 -89,9 +117,11 @@@ extern RunningTransactions GetRunningTr
  
  extern bool TransactionIdIsInProgress(TransactionId xid);
  extern bool TransactionIdIsActive(TransactionId xid);
- extern TransactionId GetOldestXmin(Relation rel, bool ignoreVacuum);
- extern TransactionId GetOldestXminInternal(Relation rel, bool ignoreVacuum,
+ extern TransactionId GetOldestXmin(Relation rel, int flags);
++extern TransactionId GetOldestXminInternal(Relation rel, int flags,
 +              bool computeLocal, TransactionId lastGlobalXmin);
  extern TransactionId GetOldestActiveTransactionId(void);
- extern TransactionId GetOldestSafeDecodingTransactionId(void);
+ extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
  
  extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
  extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids);
index 105fbaffea476269f7f6c2e8db9c64ced885c00a,d068dde5d76de916adbd9410f571f566e5d4776c..67cb9138294811adead209279b919d989cd5027a
@@@ -4,8 -4,7 +4,8 @@@
   *      Routines for interprocess signalling
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/storage/procsignal.h
Simple merge
index 7e384f6ea7af63d334fb7cfbf8621b21e340d3c7,9ce68296558cdc73fcf4d0d6f9ace3fbb20725c7..91a97f84b84a0b746de8898e6db564b84d3b45cc
@@@ -4,8 -4,7 +4,8 @@@
   *      storage manager switch public interface declarations.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/storage/smgr.h
index 746d106a3997a0a2ca1542794f88b820dec5f42a,c459af2e139ee598b7ee1cc6c64d7b3cf029025a..622d35b34668249291a5b5a27269c5ec053bc673
@@@ -57,8 -57,7 +57,8 @@@
   * calls in portal and cursor manipulations.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/tcop/dest.h
index ba34a8446adda4c5618be922635161847dd5b8b8,12ff4588c61abff139c8f6e4683d615d9c89e48c..e8ec5d0f8f0f8fcc9d932fe6f73cc4763f62e366
@@@ -4,8 -4,7 +4,8 @@@
   *      prototypes for pquery.c.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/tcop/pquery.h
index 30ff4cba9b88ad1d5ddbd1d27d919a2a5cf70f0b,f1a34a1c7248016f68d05c44d7db3d4ed86e7816..a32735bd3fe036d72c5e6c41e07c1f366761d342
  #include "nodes/plannodes.h"
  #include "storage/procsignal.h"
  #include "utils/guc.h"
+ #include "utils/queryenvironment.h"
  
 +/* needed because of 'struct timeval' and 'struct rusage' */
 +#include <sys/time.h>
 +#include <sys/resource.h>
  
  /* Required daylight between max_stack_depth and the kernel limit, in bytes */
  #define STACK_DEPTH_SLOP (512 * 1024L)
@@@ -50,13 -48,15 +51,16 @@@ typedef enu
  extern int    log_statement;
  
  extern List *pg_parse_query(const char *query_string);
- extern List *pg_analyze_and_rewrite(Node *parsetree, const char *query_string,
-                                          Oid *paramTypes, int numParams);
- extern List *pg_analyze_and_rewrite_params(Node *parsetree,
 +extern List *pg_parse_query_get_source(const char *query_string, List **queries);
+ extern List *pg_analyze_and_rewrite(RawStmt *parsetree,
+                                          const char *query_string,
+                                          Oid *paramTypes, int numParams,
+                                          QueryEnvironment *queryEnv);
+ extern List *pg_analyze_and_rewrite_params(RawStmt *parsetree,
                                                          const char *query_string,
                                                          ParserSetupHook parserSetup,
-                                                         void *parserSetupArg);
+                                                         void *parserSetupArg,
+                                                         QueryEnvironment *queryEnv);
  extern PlannedStmt *pg_plan_query(Query *querytree, int cursorOptions,
                          ParamListInfo boundParams);
  extern List *pg_plan_queries(List *querytrees, int cursorOptions,
index 15a8458280bed6a00c8f25cd46cdda659d92a504,14f65c34d66165533f636fea1381f3ffe3c3ca3f..d52f3e852513bee7dc8c3d386ed513b337bacb64
@@@ -24,30 -24,21 +24,27 @@@ typedef enu
  } ProcessUtilityContext;
  
  /* Hook for plugins to get control in ProcessUtility() */
- typedef void (*ProcessUtility_hook_type) (Node *parsetree,
+ typedef void (*ProcessUtility_hook_type) (PlannedStmt *pstmt,
                                          const char *queryString, ProcessUtilityContext context,
                                                                                                          ParamListInfo params,
 -                                                                      DestReceiver *dest, char *completionTag);
+                                                                                                 QueryEnvironment *queryEnv,
- #ifdef PGXC
 +                                                                      DestReceiver *dest,
- #endif /* PGXC */
 +                                                                      bool sentToRemote,
 +                                                                      char *completionTag);
  extern PGDLLIMPORT ProcessUtility_hook_type ProcessUtility_hook;
  
- extern void ProcessUtility(Node *parsetree, const char *queryString,
+ extern void ProcessUtility(PlannedStmt *pstmt, const char *queryString,
                           ProcessUtilityContext context, ParamListInfo params,
 -                         DestReceiver *dest, char *completionTag);
+                          QueryEnvironment *queryEnv,
- #ifdef PGXC
 +                         DestReceiver *dest,
- #endif /* PGXC */
 +                         bool sentToRemote,
- extern void standard_ProcessUtility(Node *parsetree, const char *queryString,
 +                         char *completionTag);
+ extern void standard_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
                                                ProcessUtilityContext context, ParamListInfo params,
 -                                              DestReceiver *dest, char *completionTag);
+                                               QueryEnvironment *queryEnv,
- #ifdef PGXC
 +                                              DestReceiver *dest,
- #endif /* PGXC */
 +                                              bool sentToRemote,
 +                                              char *completionTag);
  
  extern bool UtilityReturnsTuples(Node *parsetree);
  
@@@ -59,10 -50,6 +56,10 @@@ extern const char *CreateCommandTag(Nod
  
  extern LogStmtLevel GetCommandLogLevel(Node *parsetree);
  
- extern bool CommandIsReadOnly(Node *parsetree);
+ extern bool CommandIsReadOnly(PlannedStmt *pstmt);
  
 +#ifdef PGXC
 +extern bool pgxc_lock_for_utility_stmt(Node *parsetree);
 +#endif
 +
  #endif   /* UTILITY_H */
index fa7b94065946ec0b18851cfbb7fc73e833b4318a,1435a7b57a924d2dd673ea18a931585fb2965e76..ea93c922f1fd281095a79d67705125f26ac87064
@@@ -4,8 -4,7 +4,8 @@@
   *      Declarations for operations on built-in types.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/utils/builtins.h
  #define BUILTINS_H
  
  #include "fmgr.h"
- #include "lib/stringinfo.h"
 +#include "nodes/parsenodes.h"
 +#ifdef PGXC
 +#include "lib/stringinfo.h"
 +#endif
- #include "utils/sortsupport.h"
- /*
-  *            Defined in adt/
-  */
- /* acl.c */
- extern Datum has_any_column_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_name_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_name_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_id_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_id_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_name_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_name_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_id_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_id_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_attnum(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_id(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_name_name(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_name_id(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_id_name(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_id_id(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_name(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_id(PG_FUNCTION_ARGS);
- /* amutils.c */
- extern Datum pg_indexam_has_property(PG_FUNCTION_ARGS);
- extern Datum pg_index_has_property(PG_FUNCTION_ARGS);
- extern Datum pg_index_column_has_property(PG_FUNCTION_ARGS);
+ #include "nodes/nodes.h"
+ #include "utils/fmgrprotos.h"
  
 -
  /* bool.c */
- extern Datum boolin(PG_FUNCTION_ARGS);
- extern Datum boolout(PG_FUNCTION_ARGS);
- extern Datum boolrecv(PG_FUNCTION_ARGS);
- extern Datum boolsend(PG_FUNCTION_ARGS);
- extern Datum booltext(PG_FUNCTION_ARGS);
- extern Datum booleq(PG_FUNCTION_ARGS);
- extern Datum boolne(PG_FUNCTION_ARGS);
- extern Datum boollt(PG_FUNCTION_ARGS);
- extern Datum boolgt(PG_FUNCTION_ARGS);
- extern Datum boolle(PG_FUNCTION_ARGS);
- extern Datum boolge(PG_FUNCTION_ARGS);
- extern Datum booland_statefunc(PG_FUNCTION_ARGS);
- extern Datum boolor_statefunc(PG_FUNCTION_ARGS);
- extern Datum bool_accum(PG_FUNCTION_ARGS);
- extern Datum bool_accum_inv(PG_FUNCTION_ARGS);
- extern Datum bool_alltrue(PG_FUNCTION_ARGS);
- extern Datum bool_anytrue(PG_FUNCTION_ARGS);
  extern bool parse_bool(const char *value, bool *result);
  extern bool parse_bool_with_len(const char *value, size_t len, bool *result);
  
@@@ -358,406 -64,17 +68,30 @@@ extern char *float8out_internal(double 
  extern int    float4_cmp_internal(float4 a, float4 b);
  extern int    float8_cmp_internal(float8 a, float8 b);
  
- extern Datum float4in(PG_FUNCTION_ARGS);
- extern Datum float4out(PG_FUNCTION_ARGS);
- extern Datum float4recv(PG_FUNCTION_ARGS);
- extern Datum float4send(PG_FUNCTION_ARGS);
- extern Datum float8in(PG_FUNCTION_ARGS);
- extern Datum float8out(PG_FUNCTION_ARGS);
- extern Datum float8recv(PG_FUNCTION_ARGS);
- extern Datum float8send(PG_FUNCTION_ARGS);
- extern Datum float4abs(PG_FUNCTION_ARGS);
- extern Datum float4um(PG_FUNCTION_ARGS);
- extern Datum float4up(PG_FUNCTION_ARGS);
- extern Datum float4larger(PG_FUNCTION_ARGS);
- extern Datum float4smaller(PG_FUNCTION_ARGS);
- extern Datum float8abs(PG_FUNCTION_ARGS);
- extern Datum float8um(PG_FUNCTION_ARGS);
- extern Datum float8up(PG_FUNCTION_ARGS);
- extern Datum float8larger(PG_FUNCTION_ARGS);
- extern Datum float8smaller(PG_FUNCTION_ARGS);
- extern Datum float4pl(PG_FUNCTION_ARGS);
- extern Datum float4mi(PG_FUNCTION_ARGS);
- extern Datum float4mul(PG_FUNCTION_ARGS);
- extern Datum float4div(PG_FUNCTION_ARGS);
- extern Datum float8pl(PG_FUNCTION_ARGS);
- extern Datum float8mi(PG_FUNCTION_ARGS);
- extern Datum float8mul(PG_FUNCTION_ARGS);
- extern Datum float8div(PG_FUNCTION_ARGS);
- extern Datum float4eq(PG_FUNCTION_ARGS);
- extern Datum float4ne(PG_FUNCTION_ARGS);
- extern Datum float4lt(PG_FUNCTION_ARGS);
- extern Datum float4le(PG_FUNCTION_ARGS);
- extern Datum float4gt(PG_FUNCTION_ARGS);
- extern Datum float4ge(PG_FUNCTION_ARGS);
- extern Datum float8eq(PG_FUNCTION_ARGS);
- extern Datum float8ne(PG_FUNCTION_ARGS);
- extern Datum float8lt(PG_FUNCTION_ARGS);
- extern Datum float8le(PG_FUNCTION_ARGS);
- extern Datum float8gt(PG_FUNCTION_ARGS);
- extern Datum float8ge(PG_FUNCTION_ARGS);
- extern Datum ftod(PG_FUNCTION_ARGS);
- extern Datum i4tod(PG_FUNCTION_ARGS);
- extern Datum i2tod(PG_FUNCTION_ARGS);
- extern Datum dtof(PG_FUNCTION_ARGS);
- extern Datum dtoi4(PG_FUNCTION_ARGS);
- extern Datum dtoi2(PG_FUNCTION_ARGS);
- extern Datum i4tof(PG_FUNCTION_ARGS);
- extern Datum i2tof(PG_FUNCTION_ARGS);
- extern Datum ftoi4(PG_FUNCTION_ARGS);
- extern Datum ftoi2(PG_FUNCTION_ARGS);
- extern Datum dround(PG_FUNCTION_ARGS);
- extern Datum dceil(PG_FUNCTION_ARGS);
- extern Datum dfloor(PG_FUNCTION_ARGS);
- extern Datum dsign(PG_FUNCTION_ARGS);
- extern Datum dtrunc(PG_FUNCTION_ARGS);
- extern Datum dsqrt(PG_FUNCTION_ARGS);
- extern Datum dcbrt(PG_FUNCTION_ARGS);
- extern Datum dpow(PG_FUNCTION_ARGS);
- extern Datum dexp(PG_FUNCTION_ARGS);
- extern Datum dlog1(PG_FUNCTION_ARGS);
- extern Datum dlog10(PG_FUNCTION_ARGS);
- extern Datum dacos(PG_FUNCTION_ARGS);
- extern Datum dasin(PG_FUNCTION_ARGS);
- extern Datum datan(PG_FUNCTION_ARGS);
- extern Datum datan2(PG_FUNCTION_ARGS);
- extern Datum dcos(PG_FUNCTION_ARGS);
- extern Datum dcot(PG_FUNCTION_ARGS);
- extern Datum dsin(PG_FUNCTION_ARGS);
- extern Datum dtan(PG_FUNCTION_ARGS);
- extern Datum dacosd(PG_FUNCTION_ARGS);
- extern Datum dasind(PG_FUNCTION_ARGS);
- extern Datum datand(PG_FUNCTION_ARGS);
- extern Datum datan2d(PG_FUNCTION_ARGS);
- extern Datum dcosd(PG_FUNCTION_ARGS);
- extern Datum dcotd(PG_FUNCTION_ARGS);
- extern Datum dsind(PG_FUNCTION_ARGS);
- extern Datum dtand(PG_FUNCTION_ARGS);
- extern Datum degrees(PG_FUNCTION_ARGS);
- extern Datum dpi(PG_FUNCTION_ARGS);
- extern Datum radians(PG_FUNCTION_ARGS);
- extern Datum drandom(PG_FUNCTION_ARGS);
- extern Datum setseed(PG_FUNCTION_ARGS);
- extern Datum float8_combine(PG_FUNCTION_ARGS);
- extern Datum float8_accum(PG_FUNCTION_ARGS);
- extern Datum float4_accum(PG_FUNCTION_ARGS);
- extern Datum float8_avg(PG_FUNCTION_ARGS);
- extern Datum float8_var_pop(PG_FUNCTION_ARGS);
- extern Datum float8_var_samp(PG_FUNCTION_ARGS);
- extern Datum float8_stddev_pop(PG_FUNCTION_ARGS);
- extern Datum float8_stddev_samp(PG_FUNCTION_ARGS);
- extern Datum float8_regr_accum(PG_FUNCTION_ARGS);
- extern Datum float8_regr_combine(PG_FUNCTION_ARGS);
- extern Datum float8_regr_sxx(PG_FUNCTION_ARGS);
- extern Datum float8_regr_syy(PG_FUNCTION_ARGS);
- extern Datum float8_regr_sxy(PG_FUNCTION_ARGS);
- extern Datum float8_regr_avgx(PG_FUNCTION_ARGS);
- extern Datum float8_regr_avgy(PG_FUNCTION_ARGS);
- extern Datum float8_covar_pop(PG_FUNCTION_ARGS);
- extern Datum float8_covar_samp(PG_FUNCTION_ARGS);
- extern Datum float8_corr(PG_FUNCTION_ARGS);
- extern Datum float8_regr_r2(PG_FUNCTION_ARGS);
- extern Datum float8_regr_slope(PG_FUNCTION_ARGS);
- extern Datum float8_regr_intercept(PG_FUNCTION_ARGS);
- extern Datum float48pl(PG_FUNCTION_ARGS);
- extern Datum float48mi(PG_FUNCTION_ARGS);
- extern Datum float48mul(PG_FUNCTION_ARGS);
- extern Datum float48div(PG_FUNCTION_ARGS);
- extern Datum float84pl(PG_FUNCTION_ARGS);
- extern Datum float84mi(PG_FUNCTION_ARGS);
- extern Datum float84mul(PG_FUNCTION_ARGS);
- extern Datum float84div(PG_FUNCTION_ARGS);
- extern Datum float48eq(PG_FUNCTION_ARGS);
- extern Datum float48ne(PG_FUNCTION_ARGS);
- extern Datum float48lt(PG_FUNCTION_ARGS);
- extern Datum float48le(PG_FUNCTION_ARGS);
- extern Datum float48gt(PG_FUNCTION_ARGS);
- extern Datum float48ge(PG_FUNCTION_ARGS);
- extern Datum float84eq(PG_FUNCTION_ARGS);
- extern Datum float84ne(PG_FUNCTION_ARGS);
- extern Datum float84lt(PG_FUNCTION_ARGS);
- extern Datum float84le(PG_FUNCTION_ARGS);
- extern Datum float84gt(PG_FUNCTION_ARGS);
- extern Datum float84ge(PG_FUNCTION_ARGS);
- extern Datum width_bucket_float8(PG_FUNCTION_ARGS);
- /* dbsize.c */
- extern Datum pg_tablespace_size_oid(PG_FUNCTION_ARGS);
- extern Datum pg_tablespace_size_name(PG_FUNCTION_ARGS);
- extern Datum pg_database_size_oid(PG_FUNCTION_ARGS);
- extern Datum pg_database_size_name(PG_FUNCTION_ARGS);
- extern Datum pg_relation_size(PG_FUNCTION_ARGS);
- extern Datum pg_total_relation_size(PG_FUNCTION_ARGS);
- extern Datum pg_size_pretty(PG_FUNCTION_ARGS);
- extern Datum pg_size_pretty_numeric(PG_FUNCTION_ARGS);
- extern Datum pg_size_bytes(PG_FUNCTION_ARGS);
- extern Datum pg_table_size(PG_FUNCTION_ARGS);
- extern Datum pg_indexes_size(PG_FUNCTION_ARGS);
- extern Datum pg_relation_filenode(PG_FUNCTION_ARGS);
- extern Datum pg_filenode_relation(PG_FUNCTION_ARGS);
- extern Datum pg_relation_filepath(PG_FUNCTION_ARGS);
- /* genfile.c */
- extern Datum pg_stat_file(PG_FUNCTION_ARGS);
- extern Datum pg_stat_file_1arg(PG_FUNCTION_ARGS);
- extern Datum pg_read_file(PG_FUNCTION_ARGS);
- extern Datum pg_read_file_off_len(PG_FUNCTION_ARGS);
- extern Datum pg_read_file_all(PG_FUNCTION_ARGS);
- extern Datum pg_read_binary_file(PG_FUNCTION_ARGS);
- extern Datum pg_read_binary_file_off_len(PG_FUNCTION_ARGS);
- extern Datum pg_read_binary_file_all(PG_FUNCTION_ARGS);
- extern Datum pg_ls_dir(PG_FUNCTION_ARGS);
- extern Datum pg_ls_dir_1arg(PG_FUNCTION_ARGS);
- /* misc.c */
- extern Datum pg_num_nulls(PG_FUNCTION_ARGS);
- extern Datum pg_num_nonnulls(PG_FUNCTION_ARGS);
- extern Datum current_database(PG_FUNCTION_ARGS);
- extern Datum current_query(PG_FUNCTION_ARGS);
- extern Datum pg_cancel_backend(PG_FUNCTION_ARGS);
- extern Datum pg_terminate_backend(PG_FUNCTION_ARGS);
- extern Datum pg_reload_conf(PG_FUNCTION_ARGS);
- extern Datum pg_tablespace_databases(PG_FUNCTION_ARGS);
- extern Datum pg_tablespace_location(PG_FUNCTION_ARGS);
- extern Datum pg_rotate_logfile(PG_FUNCTION_ARGS);
- extern Datum pg_sleep(PG_FUNCTION_ARGS);
- extern Datum pg_get_keywords(PG_FUNCTION_ARGS);
- extern Datum pg_typeof(PG_FUNCTION_ARGS);
- extern Datum pg_collation_for(PG_FUNCTION_ARGS);
- extern Datum pg_relation_is_updatable(PG_FUNCTION_ARGS);
- extern Datum pg_column_is_updatable(PG_FUNCTION_ARGS);
- extern Datum parse_ident(PG_FUNCTION_ARGS);
  /* oid.c */
- extern Datum oidin(PG_FUNCTION_ARGS);
- extern Datum oidout(PG_FUNCTION_ARGS);
- extern Datum oidrecv(PG_FUNCTION_ARGS);
- extern Datum oidsend(PG_FUNCTION_ARGS);
- extern Datum oideq(PG_FUNCTION_ARGS);
- extern Datum oidne(PG_FUNCTION_ARGS);
- extern Datum oidlt(PG_FUNCTION_ARGS);
- extern Datum oidle(PG_FUNCTION_ARGS);
- extern Datum oidge(PG_FUNCTION_ARGS);
- extern Datum oidgt(PG_FUNCTION_ARGS);
- extern Datum oidlarger(PG_FUNCTION_ARGS);
- extern Datum oidsmaller(PG_FUNCTION_ARGS);
- extern Datum oidvectorin(PG_FUNCTION_ARGS);
- extern Datum oidvectorout(PG_FUNCTION_ARGS);
- extern Datum oidvectorrecv(PG_FUNCTION_ARGS);
- extern Datum oidvectorsend(PG_FUNCTION_ARGS);
- extern Datum oidvectoreq(PG_FUNCTION_ARGS);
- extern Datum oidvectorne(PG_FUNCTION_ARGS);
- extern Datum oidvectorlt(PG_FUNCTION_ARGS);
- extern Datum oidvectorle(PG_FUNCTION_ARGS);
- extern Datum oidvectorge(PG_FUNCTION_ARGS);
- extern Datum oidvectorgt(PG_FUNCTION_ARGS);
  extern oidvector *buildoidvector(const Oid *oids, int n);
  extern Oid    oidparse(Node *node);
- /* orderedsetaggs.c */
- extern Datum ordered_set_transition(PG_FUNCTION_ARGS);
- extern Datum ordered_set_transition_multi(PG_FUNCTION_ARGS);
- extern Datum percentile_disc_final(PG_FUNCTION_ARGS);
- extern Datum percentile_cont_float8_final(PG_FUNCTION_ARGS);
- extern Datum percentile_cont_interval_final(PG_FUNCTION_ARGS);
- extern Datum percentile_disc_multi_final(PG_FUNCTION_ARGS);
- extern Datum percentile_cont_float8_multi_final(PG_FUNCTION_ARGS);
- extern Datum percentile_cont_interval_multi_final(PG_FUNCTION_ARGS);
- extern Datum mode_final(PG_FUNCTION_ARGS);
- extern Datum hypothetical_rank_final(PG_FUNCTION_ARGS);
- extern Datum hypothetical_percent_rank_final(PG_FUNCTION_ARGS);
- extern Datum hypothetical_cume_dist_final(PG_FUNCTION_ARGS);
- extern Datum hypothetical_dense_rank_final(PG_FUNCTION_ARGS);
- /* pseudotypes.c */
- extern Datum cstring_in(PG_FUNCTION_ARGS);
- extern Datum cstring_out(PG_FUNCTION_ARGS);
- extern Datum cstring_recv(PG_FUNCTION_ARGS);
- extern Datum cstring_send(PG_FUNCTION_ARGS);
- extern Datum any_in(PG_FUNCTION_ARGS);
- extern Datum any_out(PG_FUNCTION_ARGS);
- extern Datum anyarray_in(PG_FUNCTION_ARGS);
- extern Datum anyarray_out(PG_FUNCTION_ARGS);
- extern Datum anyarray_recv(PG_FUNCTION_ARGS);
- extern Datum anyarray_send(PG_FUNCTION_ARGS);
- extern Datum anynonarray_in(PG_FUNCTION_ARGS);
- extern Datum anynonarray_out(PG_FUNCTION_ARGS);
- extern Datum anyenum_in(PG_FUNCTION_ARGS);
- extern Datum anyenum_out(PG_FUNCTION_ARGS);
- extern Datum anyrange_in(PG_FUNCTION_ARGS);
- extern Datum anyrange_out(PG_FUNCTION_ARGS);
- extern Datum void_in(PG_FUNCTION_ARGS);
- extern Datum void_out(PG_FUNCTION_ARGS);
- extern Datum void_recv(PG_FUNCTION_ARGS);
- extern Datum void_send(PG_FUNCTION_ARGS);
 +#ifdef PGXC
 +extern Datum pgxc_node_str (PG_FUNCTION_ARGS);
 +extern Datum pgxc_lock_for_backup (PG_FUNCTION_ARGS);
 +#endif
- extern Datum trigger_in(PG_FUNCTION_ARGS);
- extern Datum trigger_out(PG_FUNCTION_ARGS);
- extern Datum event_trigger_in(PG_FUNCTION_ARGS);
- extern Datum event_trigger_out(PG_FUNCTION_ARGS);
- extern Datum language_handler_in(PG_FUNCTION_ARGS);
- extern Datum language_handler_out(PG_FUNCTION_ARGS);
- extern Datum fdw_handler_in(PG_FUNCTION_ARGS);
- extern Datum fdw_handler_out(PG_FUNCTION_ARGS);
- extern Datum index_am_handler_in(PG_FUNCTION_ARGS);
- extern Datum index_am_handler_out(PG_FUNCTION_ARGS);
- extern Datum tsm_handler_in(PG_FUNCTION_ARGS);
- extern Datum tsm_handler_out(PG_FUNCTION_ARGS);
- extern Datum internal_in(PG_FUNCTION_ARGS);
- extern Datum internal_out(PG_FUNCTION_ARGS);
- extern Datum opaque_in(PG_FUNCTION_ARGS);
- extern Datum opaque_out(PG_FUNCTION_ARGS);
- extern Datum anyelement_in(PG_FUNCTION_ARGS);
- extern Datum anyelement_out(PG_FUNCTION_ARGS);
- extern Datum shell_in(PG_FUNCTION_ARGS);
- extern Datum shell_out(PG_FUNCTION_ARGS);
- extern Datum pg_node_tree_in(PG_FUNCTION_ARGS);
- extern Datum pg_node_tree_out(PG_FUNCTION_ARGS);
- extern Datum pg_node_tree_recv(PG_FUNCTION_ARGS);
- extern Datum pg_node_tree_send(PG_FUNCTION_ARGS);
- extern Datum pg_ddl_command_in(PG_FUNCTION_ARGS);
- extern Datum pg_ddl_command_out(PG_FUNCTION_ARGS);
- extern Datum pg_ddl_command_recv(PG_FUNCTION_ARGS);
- extern Datum pg_ddl_command_send(PG_FUNCTION_ARGS);
+ extern int    oid_cmp(const void *p1, const void *p2);
  
  /* regexp.c */
- extern Datum nameregexeq(PG_FUNCTION_ARGS);
- extern Datum nameregexne(PG_FUNCTION_ARGS);
- extern Datum textregexeq(PG_FUNCTION_ARGS);
- extern Datum textregexne(PG_FUNCTION_ARGS);
- extern Datum nameicregexeq(PG_FUNCTION_ARGS);
- extern Datum nameicregexne(PG_FUNCTION_ARGS);
- extern Datum texticregexeq(PG_FUNCTION_ARGS);
- extern Datum texticregexne(PG_FUNCTION_ARGS);
- extern Datum textregexsubstr(PG_FUNCTION_ARGS);
- extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS);
- extern Datum textregexreplace(PG_FUNCTION_ARGS);
- extern Datum similar_escape(PG_FUNCTION_ARGS);
- extern Datum regexp_matches(PG_FUNCTION_ARGS);
- extern Datum regexp_matches_no_flags(PG_FUNCTION_ARGS);
- extern Datum regexp_split_to_table(PG_FUNCTION_ARGS);
- extern Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS);
- extern Datum regexp_split_to_array(PG_FUNCTION_ARGS);
- extern Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS);
  extern char *regexp_fixed_prefix(text *text_re, bool case_insensitive,
                                        Oid collation, bool *exact);
  
- /* regproc.c */
- extern Datum regprocin(PG_FUNCTION_ARGS);
- extern Datum regprocout(PG_FUNCTION_ARGS);
- extern Datum to_regproc(PG_FUNCTION_ARGS);
- extern Datum to_regprocedure(PG_FUNCTION_ARGS);
- extern Datum regprocrecv(PG_FUNCTION_ARGS);
- extern Datum regprocsend(PG_FUNCTION_ARGS);
- extern Datum regprocedurein(PG_FUNCTION_ARGS);
- extern Datum regprocedureout(PG_FUNCTION_ARGS);
- extern Datum regprocedurerecv(PG_FUNCTION_ARGS);
- extern Datum regproceduresend(PG_FUNCTION_ARGS);
- extern Datum regoperin(PG_FUNCTION_ARGS);
- extern Datum regoperout(PG_FUNCTION_ARGS);
- extern Datum regoperrecv(PG_FUNCTION_ARGS);
- extern Datum regopersend(PG_FUNCTION_ARGS);
- extern Datum to_regoper(PG_FUNCTION_ARGS);
- extern Datum to_regoperator(PG_FUNCTION_ARGS);
- extern Datum regoperatorin(PG_FUNCTION_ARGS);
- extern Datum regoperatorout(PG_FUNCTION_ARGS);
- extern Datum regoperatorrecv(PG_FUNCTION_ARGS);
- extern Datum regoperatorsend(PG_FUNCTION_ARGS);
- extern Datum regclassin(PG_FUNCTION_ARGS);
- extern Datum regclassout(PG_FUNCTION_ARGS);
- extern Datum regclassrecv(PG_FUNCTION_ARGS);
- extern Datum regclasssend(PG_FUNCTION_ARGS);
- extern Datum to_regclass(PG_FUNCTION_ARGS);
- extern Datum regtypein(PG_FUNCTION_ARGS);
- extern Datum regtypeout(PG_FUNCTION_ARGS);
- extern Datum regtyperecv(PG_FUNCTION_ARGS);
- extern Datum regtypesend(PG_FUNCTION_ARGS);
- extern Datum to_regtype(PG_FUNCTION_ARGS);
- extern Datum regrolein(PG_FUNCTION_ARGS);
- extern Datum regroleout(PG_FUNCTION_ARGS);
- extern Datum regrolerecv(PG_FUNCTION_ARGS);
- extern Datum regrolesend(PG_FUNCTION_ARGS);
- extern Datum to_regrole(PG_FUNCTION_ARGS);
- extern Datum regnamespacein(PG_FUNCTION_ARGS);
- extern Datum regnamespaceout(PG_FUNCTION_ARGS);
- extern Datum regnamespacerecv(PG_FUNCTION_ARGS);
- extern Datum regnamespacesend(PG_FUNCTION_ARGS);
- extern Datum to_regnamespace(PG_FUNCTION_ARGS);
- extern Datum regconfigin(PG_FUNCTION_ARGS);
- extern Datum regconfigout(PG_FUNCTION_ARGS);
- extern Datum regconfigrecv(PG_FUNCTION_ARGS);
- extern Datum regconfigsend(PG_FUNCTION_ARGS);
- extern Datum regdictionaryin(PG_FUNCTION_ARGS);
- extern Datum regdictionaryout(PG_FUNCTION_ARGS);
- extern Datum regdictionaryrecv(PG_FUNCTION_ARGS);
- extern Datum regdictionarysend(PG_FUNCTION_ARGS);
- extern Datum text_regclass(PG_FUNCTION_ARGS);
- extern List *stringToQualifiedNameList(const char *string);
- extern char *format_procedure(Oid procedure_oid);
- extern char *format_procedure_qualified(Oid procedure_oid);
- extern void format_procedure_parts(Oid operator_oid, List **objnames,
-                                          List **objargs);
- extern char *format_operator(Oid operator_oid);
- extern char *format_operator_qualified(Oid operator_oid);
- extern void format_operator_parts(Oid operator_oid, List **objnames,
-                                         List **objargs);
- /* rowtypes.c */
- extern Datum record_in(PG_FUNCTION_ARGS);
- extern Datum record_out(PG_FUNCTION_ARGS);
- extern Datum record_recv(PG_FUNCTION_ARGS);
- extern Datum record_send(PG_FUNCTION_ARGS);
- extern Datum record_eq(PG_FUNCTION_ARGS);
- extern Datum record_ne(PG_FUNCTION_ARGS);
- extern Datum record_lt(PG_FUNCTION_ARGS);
- extern Datum record_gt(PG_FUNCTION_ARGS);
- extern Datum record_le(PG_FUNCTION_ARGS);
- extern Datum record_ge(PG_FUNCTION_ARGS);
- extern Datum btrecordcmp(PG_FUNCTION_ARGS);
- extern Datum record_image_eq(PG_FUNCTION_ARGS);
- extern Datum record_image_ne(PG_FUNCTION_ARGS);
- extern Datum record_image_lt(PG_FUNCTION_ARGS);
- extern Datum record_image_gt(PG_FUNCTION_ARGS);
- extern Datum record_image_le(PG_FUNCTION_ARGS);
- extern Datum record_image_ge(PG_FUNCTION_ARGS);
- extern Datum btrecordimagecmp(PG_FUNCTION_ARGS);
  /* ruleutils.c */
  extern bool quote_all_identifiers;
- extern Datum pg_get_ruledef(PG_FUNCTION_ARGS);
- extern Datum pg_get_ruledef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef_wrap(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef_name(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef_name_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_indexdef(PG_FUNCTION_ARGS);
- extern Datum pg_get_indexdef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_triggerdef(PG_FUNCTION_ARGS);
- extern Datum pg_get_triggerdef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_constraintdef(PG_FUNCTION_ARGS);
- extern Datum pg_get_constraintdef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_expr(PG_FUNCTION_ARGS);
- extern Datum pg_get_expr_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_userbyid(PG_FUNCTION_ARGS);
- extern Datum pg_get_serial_sequence(PG_FUNCTION_ARGS);
- extern Datum pg_get_functiondef(PG_FUNCTION_ARGS);
- extern Datum pg_get_function_arguments(PG_FUNCTION_ARGS);
- extern Datum pg_get_function_identity_arguments(PG_FUNCTION_ARGS);
- extern Datum pg_get_function_result(PG_FUNCTION_ARGS);
- extern Datum pg_get_function_arg_default(PG_FUNCTION_ARGS);
 +#ifdef PGXC
 +extern void get_query_def_from_valuesList(Query *query, StringInfo buf);
 +extern void deparse_query(Query *query, StringInfo buf, List *parentnamespace,
 +              bool finalise_aggs, bool sortgroup_colno);
 +#endif
 +#ifdef PGXC
 +extern List *deparse_context_for_plan(Node *plan, List *ancestors,
 +                                                        List *rtable);
 +#endif
  extern const char *quote_identifier(const char *ident);
  extern char *quote_qualified_identifier(const char *qualifier,
                                                   const char *ident);
@@@ -1179,186 -119,6 +136,21 @@@ extern char *format_type_with_typemod_q
  extern int32 type_maximum_size(Oid type_oid, int32 typemod);
  
  /* quote.c */
- extern Datum quote_ident(PG_FUNCTION_ARGS);
- extern Datum quote_literal(PG_FUNCTION_ARGS);
  extern char *quote_literal_cstr(const char *rawstr);
- extern Datum quote_nullable(PG_FUNCTION_ARGS);
- /* guc.c */
- extern Datum show_config_by_name(PG_FUNCTION_ARGS);
- extern Datum show_config_by_name_missing_ok(PG_FUNCTION_ARGS);
- extern Datum set_config_by_name(PG_FUNCTION_ARGS);
- extern Datum show_all_settings(PG_FUNCTION_ARGS);
- extern Datum show_all_file_settings(PG_FUNCTION_ARGS);
- /* pg_config.c */
- extern Datum pg_config(PG_FUNCTION_ARGS);
- /* pg_controldata.c */
- extern Datum pg_control_checkpoint(PG_FUNCTION_ARGS);
- extern Datum pg_control_system(PG_FUNCTION_ARGS);
- extern Datum pg_control_init(PG_FUNCTION_ARGS);
- extern Datum pg_control_recovery(PG_FUNCTION_ARGS);
- /* rls.c */
- extern Datum row_security_active(PG_FUNCTION_ARGS);
- extern Datum row_security_active_name(PG_FUNCTION_ARGS);
- /* lockfuncs.c */
- extern Datum pg_lock_status(PG_FUNCTION_ARGS);
- extern Datum pg_blocking_pids(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_lock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_xact_lock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_lock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_xact_lock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_lock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_xact_lock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_lock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_xact_lock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_lock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_xact_lock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_lock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_xact_lock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_lock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_xact_lock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_lock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_xact_lock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_all(PG_FUNCTION_ARGS);
- /* txid.c */
- extern Datum txid_snapshot_in(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_out(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_recv(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_send(PG_FUNCTION_ARGS);
- extern Datum txid_current(PG_FUNCTION_ARGS);
- extern Datum txid_current_snapshot(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_xmin(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_xmax(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_xip(PG_FUNCTION_ARGS);
- extern Datum txid_visible_in_snapshot(PG_FUNCTION_ARGS);
- /* uuid.c */
- extern Datum uuid_in(PG_FUNCTION_ARGS);
- extern Datum uuid_out(PG_FUNCTION_ARGS);
- extern Datum uuid_send(PG_FUNCTION_ARGS);
- extern Datum uuid_recv(PG_FUNCTION_ARGS);
- extern Datum uuid_lt(PG_FUNCTION_ARGS);
- extern Datum uuid_le(PG_FUNCTION_ARGS);
- extern Datum uuid_eq(PG_FUNCTION_ARGS);
- extern Datum uuid_ge(PG_FUNCTION_ARGS);
- extern Datum uuid_gt(PG_FUNCTION_ARGS);
- extern Datum uuid_ne(PG_FUNCTION_ARGS);
- extern Datum uuid_cmp(PG_FUNCTION_ARGS);
- extern Datum uuid_sortsupport(PG_FUNCTION_ARGS);
- extern Datum uuid_hash(PG_FUNCTION_ARGS);
- /* windowfuncs.c */
- extern Datum window_row_number(PG_FUNCTION_ARGS);
- extern Datum window_rank(PG_FUNCTION_ARGS);
- extern Datum window_dense_rank(PG_FUNCTION_ARGS);
- extern Datum window_percent_rank(PG_FUNCTION_ARGS);
- extern Datum window_cume_dist(PG_FUNCTION_ARGS);
- extern Datum window_ntile(PG_FUNCTION_ARGS);
- extern Datum window_lag(PG_FUNCTION_ARGS);
- extern Datum window_lag_with_offset(PG_FUNCTION_ARGS);
- extern Datum window_lag_with_offset_and_default(PG_FUNCTION_ARGS);
- extern Datum window_lead(PG_FUNCTION_ARGS);
- extern Datum window_lead_with_offset(PG_FUNCTION_ARGS);
- extern Datum window_lead_with_offset_and_default(PG_FUNCTION_ARGS);
- extern Datum window_first_value(PG_FUNCTION_ARGS);
- extern Datum window_last_value(PG_FUNCTION_ARGS);
- extern Datum window_nth_value(PG_FUNCTION_ARGS);
- /* access/spgist/spgquadtreeproc.c */
- extern Datum spg_quad_config(PG_FUNCTION_ARGS);
- extern Datum spg_quad_choose(PG_FUNCTION_ARGS);
- extern Datum spg_quad_picksplit(PG_FUNCTION_ARGS);
- extern Datum spg_quad_inner_consistent(PG_FUNCTION_ARGS);
- extern Datum spg_quad_leaf_consistent(PG_FUNCTION_ARGS);
- /* access/spgist/spgkdtreeproc.c */
- extern Datum spg_kd_config(PG_FUNCTION_ARGS);
- extern Datum spg_kd_choose(PG_FUNCTION_ARGS);
- extern Datum spg_kd_picksplit(PG_FUNCTION_ARGS);
- extern Datum spg_kd_inner_consistent(PG_FUNCTION_ARGS);
- /* access/spgist/spgtextproc.c */
- extern Datum spg_text_config(PG_FUNCTION_ARGS);
- extern Datum spg_text_choose(PG_FUNCTION_ARGS);
- extern Datum spg_text_picksplit(PG_FUNCTION_ARGS);
- extern Datum spg_text_inner_consistent(PG_FUNCTION_ARGS);
- extern Datum spg_text_leaf_consistent(PG_FUNCTION_ARGS);
- /* access/gin/ginarrayproc.c */
- extern Datum ginarrayextract(PG_FUNCTION_ARGS);
- extern Datum ginarrayextract_2args(PG_FUNCTION_ARGS);
- extern Datum ginqueryarrayextract(PG_FUNCTION_ARGS);
- extern Datum ginarrayconsistent(PG_FUNCTION_ARGS);
- extern Datum ginarraytriconsistent(PG_FUNCTION_ARGS);
- /* access/tablesample/bernoulli.c */
- extern Datum tsm_bernoulli_handler(PG_FUNCTION_ARGS);
- /* access/tablesample/system.c */
- extern Datum tsm_system_handler(PG_FUNCTION_ARGS);
- /* access/transam/twophase.c */
- extern Datum pg_prepared_xact(PG_FUNCTION_ARGS);
- /* access/transam/multixact.c */
- extern Datum pg_get_multixact_members(PG_FUNCTION_ARGS);
- /* access/transam/committs.c */
- extern Datum pg_xact_commit_timestamp(PG_FUNCTION_ARGS);
- extern Datum pg_last_committed_xact(PG_FUNCTION_ARGS);
- /* catalogs/dependency.c */
- extern Datum pg_describe_object(PG_FUNCTION_ARGS);
- extern Datum pg_identify_object(PG_FUNCTION_ARGS);
- extern Datum pg_identify_object_as_address(PG_FUNCTION_ARGS);
- /* catalog/objectaddress.c */
- extern Datum pg_get_object_address(PG_FUNCTION_ARGS);
- /* commands/constraint.c */
- extern Datum unique_key_recheck(PG_FUNCTION_ARGS);
- /* commands/event_trigger.c */
- extern Datum pg_event_trigger_dropped_objects(PG_FUNCTION_ARGS);
- extern Datum pg_event_trigger_table_rewrite_oid(PG_FUNCTION_ARGS);
- extern Datum pg_event_trigger_table_rewrite_reason(PG_FUNCTION_ARGS);
- extern Datum pg_event_trigger_ddl_commands(PG_FUNCTION_ARGS);
- /* commands/extension.c */
- extern Datum pg_available_extensions(PG_FUNCTION_ARGS);
- extern Datum pg_available_extension_versions(PG_FUNCTION_ARGS);
- extern Datum pg_extension_update_paths(PG_FUNCTION_ARGS);
- extern Datum pg_extension_config_dump(PG_FUNCTION_ARGS);
- /* commands/prepare.c */
- extern Datum pg_prepared_statement(PG_FUNCTION_ARGS);
- /* utils/mmgr/portalmem.c */
- extern Datum pg_cursor(PG_FUNCTION_ARGS);
  
 +#ifdef PGXC
 +/* backend/pgxc/pool/poolutils.c */
 +extern Datum pgxc_pool_check(PG_FUNCTION_ARGS);
 +extern Datum pgxc_pool_reload(PG_FUNCTION_ARGS);
 +
 +/* backend/access/transam/transam.c */
 +extern Datum pgxc_is_committed(PG_FUNCTION_ARGS);
 +extern Datum pgxc_is_inprogress(PG_FUNCTION_ARGS);
 +#endif
 +extern Datum pg_msgmodule_set(PG_FUNCTION_ARGS);
 +extern Datum pg_msgmodule_change(PG_FUNCTION_ARGS);
 +extern Datum pg_msgmodule_enable(PG_FUNCTION_ARGS);
 +extern Datum pg_msgmodule_disable(PG_FUNCTION_ARGS);
 +extern Datum pg_msgmodule_enable_all(PG_FUNCTION_ARGS);
 +extern Datum pg_msgmodule_disable_all(PG_FUNCTION_ARGS);
  #endif   /* BUILTINS_H */
Simple merge
index 23288d9806074a9088962e4fc84a606d12282512,87d07410845a70d28da2a87e3bc361c8a15b93a0..144281aa244bde9bda71f5cf9187bfafcd96ef6e
@@@ -4,8 -4,7 +4,8 @@@
   * External declarations pertaining to backend/utils/misc/guc.c and
   * backend/utils/misc/guc-file.l
   *
-  * Copyright (c) 2000-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Copyright (c) 2000-2017, PostgreSQL Global Development Group
   * Written by Peter Eisentraut <[email protected]>.
   *
   * src/include/utils/guc.h
@@@ -243,14 -242,8 +243,13 @@@ extern bool log_executor_stats
  extern bool log_statement_stats;
  extern bool log_btree_build_stats;
  
 +#ifdef XCP
 +extern bool log_gtm_stats;
 +extern bool log_remotesubplan_stats;
 +#endif
 +
  extern PGDLLIMPORT bool check_function_bodies;
  extern bool default_with_oids;
- extern bool SQL_inheritance;
  
  extern int    log_min_error_statement;
  extern int    log_min_messages;
Simple merge
index f72233c33557ea25a04e120f2b96ce87808cf1cb,93588df9f74a894894c431e28df8553a643724b2..8da59bbf3eb7278fb7783256e5c34e2e06df061a
@@@ -3,8 -3,7 +3,8 @@@
   * lsyscache.h
   *      Convenience routines for common queries in the system catalog cache.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/utils/lsyscache.h
@@@ -154,41 -168,12 +177,36 @@@ extern Oid      get_typcollation(Oid typid)
  extern bool type_is_collatable(Oid typid);
  extern Oid    getBaseType(Oid typid);
  extern Oid    getBaseTypeAndTypmod(Oid typid, int32 *typmod);
 +#ifdef PGXC
 +extern char *get_typename(Oid typid);
 +extern char *get_pgxc_nodename(Oid nodeoid);
 +extern Oid    get_pgxc_nodeoid(const char *nodename);
 +extern uint32 get_pgxc_node_id(Oid nodeid);
 +extern char   get_pgxc_nodetype(Oid nodeid);
 +extern int    get_pgxc_nodeport(Oid nodeid);
 +extern char *get_pgxc_nodehost(Oid nodeid);
 +extern bool   is_pgxc_nodepreferred(Oid nodeid);
 +extern bool   is_pgxc_nodeprimary(Oid nodeid);
 +extern Oid    get_pgxc_groupoid(const char *groupname);
 +extern int    get_pgxc_groupmembers(Oid groupid, Oid **members);
 +extern int    get_pgxc_classnodes(Oid tableid, Oid **nodes);
++extern char * get_pgxc_groupname(Oid groupid);
 +#endif
  extern int32 get_typavgwidth(Oid typid, int32 typmod);
  extern int32 get_attavgwidth(Oid relid, AttrNumber attnum);
- extern bool get_attstatsslot(HeapTuple statstuple,
-                                Oid atttype, int32 atttypmod,
-                                int reqkind, Oid reqop,
-                                Oid *actualop,
-                                Datum **values, int *nvalues,
-                                float4 **numbers, int *nnumbers);
- extern void free_attstatsslot(Oid atttype,
-                                 Datum *values, int nvalues,
-                                 float4 *numbers, int nnumbers);
+ extern bool get_attstatsslot(AttStatsSlot *sslot, HeapTuple statstuple,
+                                int reqkind, Oid reqop, int flags);
+ extern void free_attstatsslot(AttStatsSlot *sslot);
  extern char *get_namespace_name(Oid nspid);
 +#ifdef XCP
 +extern Oid    get_namespaceid(const char *nspname);
 +extern char *get_typ_name(Oid typid);
 +extern Oid    get_typ_namespace(Oid typid);
 +extern Oid    get_typname_typid(const char *typname, Oid typnamespace);
 +extern Oid    get_funcid(const char *funcname, oidvector *argtypes, Oid funcnsp);
 +extern Oid    get_opnamespace(Oid opno);
 +extern Oid    get_operid(const char *oprname, Oid oprleft, Oid oprright, Oid oprnsp);
 +#endif
  extern char *get_namespace_name_or_temp(Oid nspid);
  extern Oid    get_range_subtype(Oid rangeOid);
  
index 41b10ab37260cab994947c8c4fbb85f66cb94bda,a129f2c652ddcad87b0c152cd729f40ea17c14c7..fbb271ed1c719cb48efa8a528750616a97ece24b
@@@ -5,8 -5,7 +5,8 @@@
   *
   * See plancache.c for comments.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/utils/plancache.h
@@@ -148,13 -147,10 +151,13 @@@ typedef struct CachedPla
  extern void InitPlanCache(void);
  extern void ResetPlanCache(void);
  
- extern CachedPlanSource *CreateCachedPlan(Node *raw_parse_tree,
+ extern CachedPlanSource *CreateCachedPlan(struct RawStmt *raw_parse_tree,
                                 const char *query_string,
 +#ifdef PGXC
 +                               const char *stmt_name,
 +#endif
                                 const char *commandTag);
- extern CachedPlanSource *CreateOneShotCachedPlan(Node *raw_parse_tree,
+ extern CachedPlanSource *CreateOneShotCachedPlan(struct RawStmt *raw_parse_tree,
                                                const char *query_string,
                                                const char *commandTag);
  extern void CompleteCachedPlan(CachedPlanSource *plansource,
@@@ -181,11 -178,8 +185,12 @@@ extern List *CachedPlanGetTargetList(Ca
  
  extern CachedPlan *GetCachedPlan(CachedPlanSource *plansource,
                          ParamListInfo boundParams,
-                         bool useResOwner);
+                         bool useResOwner,
+                         QueryEnvironment *queryEnv);
  extern void ReleaseCachedPlan(CachedPlan *plan, bool useResOwner);
 +#ifdef XCP
 +extern void SetRemoteSubplan(CachedPlanSource *plansource,
 +                               const char *plan_string);
 +#endif
  
  #endif   /* PLANCACHE_H */
index ba11a5cf5f2fbfa921ea274fabf0f51a9f2e59db,ef3898c98cb55c2c0e8aacbf845d9a77194bf82f..c9ce886483fc99fa6b229e7a945e316de36e17f4
@@@ -36,8 -36,7 +36,8 @@@
   * to look like NO SCROLL cursors.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/utils/portal.h
@@@ -240,16 -233,9 +241,16 @@@ extern void PortalDefineQuery(Portal po
                                  const char *commandTag,
                                  List *stmts,
                                  CachedPlan *cplan);
- extern Node *PortalListGetPrimaryStmt(List *stmts);
+ extern PlannedStmt *PortalGetPrimaryStmt(Portal portal);
  extern void PortalCreateHoldStore(Portal portal);
  extern void PortalHashTableDeleteAll(void);
 +#ifdef XCP
 +extern void PortalCreateProducerStore(Portal portal);
 +extern List *getProducingPortals(void);
 +extern void addProducingPortal(Portal portal);
 +extern void removeProducingPortal(Portal portal);
 +extern bool portalIsProducing(Portal portal);
 +#endif
  extern bool ThereAreNoReadyPortals(void);
  
  #endif   /* PORTAL_H */
index 6145b0ecc121975c53857a6d004e14e595ea5a2d,84768969d32d9611fa2069824d9c91c10a042485..dd5de58fefaa1f18c67f7a589a197d37661291e1
@@@ -4,10 -4,8 +4,10 @@@
   *      POSTGRES relation descriptor (a/k/a relcache entry) definitions.
   *
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/utils/rel.h
   *
  #include "access/xlog.h"
  #include "catalog/pg_class.h"
  #include "catalog/pg_index.h"
+ #include "catalog/pg_publication.h"
  #include "fmgr.h"
  #include "nodes/bitmapset.h"
 +#include "pgxc/locator.h"
  #include "rewrite/prs2lock.h"
  #include "storage/block.h"
  #include "storage/relfilenode.h"
Simple merge
index 7cc6f3894d5f58da6b2495858af3990e317a4f4e,2bcaa42277e362f95422c2ed2c3dd6aba11ef0c1..a981d90616d1b878576c17f975ffb036979afd32
@@@ -3,9 -3,8 +3,9 @@@
   * snapshot.h
   *      POSTGRES snapshot definition
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/utils/snapshot.h
   *
index 5a16368f12f7415c04b5868ba1de0a9acc3c900b,e20284d06155aa3407086aaacfd5c2c329a6b92c..2bbc93a975c6954ddc7c10adb9eaaa6880b21db5
@@@ -6,9 -6,8 +6,9 @@@
   * See also lsyscache.h, which provides convenience routines for
   * common cache-lookup operations.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
 + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * src/include/utils/syscache.h
   *
@@@ -73,14 -72,7 +73,15 @@@ enum SysCacheIdentifie
        OPEROID,
        OPFAMILYAMNAMENSP,
        OPFAMILYOID,
 +#ifdef PGXC
 +      PGXCCLASSRELID,
 +      PGXCGROUPNAME,
 +      PGXCGROUPOID,
 +      PGXCNODENAME,
 +      PGXCNODEOID,
 +      PGXCNODEIDENTIFIER,
 +#endif
+       PARTRELID,
        PROCNAMEARGSNSP,
        PROCOID,
        RANGETYPE,
index 3e8f4855d05d4d156a27f4650d3c3830bc540b0b,7fd1c7692ff0ed3f8ab12aee848718a27df362f9..5e7e353b73ae55f44cdad9952b875c88cc73dc2a
  #define INTERVAL_PRECISION(t) ((t) & INTERVAL_PRECISION_MASK)
  #define INTERVAL_RANGE(t) (((t) >> 16) & INTERVAL_RANGE_MASK)
  
- #ifdef HAVE_INT64_TIMESTAMP
  #define TimestampTzPlusMilliseconds(tz,ms) ((tz) + ((ms) * (int64) 1000))
- #else
- #define TimestampTzPlusMilliseconds(tz,ms) ((tz) + ((ms) / 1000.0))
- #endif
  
 +#ifdef PGXC
 +#define InvalidGlobalTimestamp ((TimestampTz) 0)
 +#define GlobalTimestampIsValid(timestamp) ((TimestampTz) (timestamp)) != InvalidGlobalTimestamp
 +#endif
  
  /* Set at postmaster start */
  extern TimestampTz PgStartTime;
index ac46f90c4ade6120ecdbf20e7beee12eff13cc2f,14b9026fb7ffcb10555bcd60b31b31ef22b5825d..d6be7fe826fb4ddf3ce6805b1aa0d886d0841c59
@@@ -10,8 -10,7 +10,8 @@@
   * amounts are sorted using temporary files and a standard external sort
   * algorithm.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/utils/tuplesort.h
index 1eecc89bcf55c24134329fa55a9b817607861693,b31ede882b93a40408d86b4b40471d2733aab8da..aef4fc984040da19108568b2513d708600e804b9
@@@ -21,8 -21,7 +21,8 @@@
   * Also, we have changed the API to return tuples in TupleTableSlots,
   * so that there is a check to prevent attempted access to system columns.
   *
-  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 + * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * src/include/utils/tuplestore.h
Simple merge
index 31ae38e395b08eba1ed31acde6647026afcd7741,7a40c99ce03795b1728bd10510ff6e7a069c1f42..0d75a99361d7eab933bba0181b0a9143b8761622
  #include "utils/rel.h"
  #include "utils/snapmgr.h"
  #include "utils/typcache.h"
 +#ifdef XCP
 +#include "pgxc/pgxc.h"
 +#endif
  
+ #include "plpgsql.h"
  
  typedef struct
  {
Simple merge
Simple merge
Simple merge
index 47e4f38d75f19cf7467d37756d568edca1a59bc6,ce6b841a331d9ec81874bbb706eebf4988460eb0..e428dcfb44c25e9c00feb2db487c56722d5b0e7c
@@@ -774,12 -820,11 +849,13 @@@ explain (costs off, nodes off
     Sort Key: (generate_series(1, 3)) DESC
     InitPlan 1 (returns $0)
       ->  Limit
 -           ->  Index Only Scan Backward using tenk1_unique2 on tenk1
 -                 Index Cond: (unique2 IS NOT NULL)
 +           ->  Remote Subquery Scan on all
 +                 ->  Limit
 +                       ->  Index Only Scan Backward using tenk1_unique2 on tenk1
 +                             Index Cond: (unique2 IS NOT NULL)
-    ->  Result
- (9 rows)
+    ->  ProjectSet
+          ->  Result
 -(8 rows)
++(10 rows)
  
  select max(unique2), generate_series(1,3) as g from tenk1 order by g desc;
   max  | g 
index 2d63e197bec2ba04e9295376938f2881edddf10b,62347bc47e84a0ce79ce2b42f04271c3b9abc7a6..65b2c2245e6cd2b2636c0d29abd501443c8bf2b1
@@@ -636,53 -673,13 +679,15 @@@ SELECT nspname, prsnam
  ---
  --- Cleanup resources
  ---
+ set client_min_messages to warning; -- suppress cascade notices
  DROP FOREIGN DATA WRAPPER alt_fdw2 CASCADE;
 +ERROR:  foreign-data wrapper "alt_fdw2" does not exist
  DROP FOREIGN DATA WRAPPER alt_fdw3 CASCADE;
 +ERROR:  foreign-data wrapper "alt_fdw3" does not exist
  DROP LANGUAGE alt_lang2 CASCADE;
  DROP LANGUAGE alt_lang3 CASCADE;
- DROP LANGUAGE alt_lang4 CASCADE;
- ERROR:  language "alt_lang4" does not exist
  DROP SCHEMA alt_nsp1 CASCADE;
- NOTICE:  drop cascades to 26 other objects
- DETAIL:  drop cascades to function alt_func3(integer)
- drop cascades to function alt_agg3(integer)
- drop cascades to function alt_func4(integer)
- drop cascades to function alt_func2(integer)
- drop cascades to function alt_agg4(integer)
- drop cascades to function alt_agg2(integer)
- drop cascades to conversion alt_conv3
- drop cascades to conversion alt_conv4
- drop cascades to conversion alt_conv2
- drop cascades to operator @+@(integer,integer)
- drop cascades to operator @-@(integer,integer)
- drop cascades to operator family alt_opf3 for access method hash
- drop cascades to operator family alt_opc1 for access method hash
- drop cascades to operator family alt_opc2 for access method hash
- drop cascades to operator family alt_opf4 for access method hash
- drop cascades to operator family alt_opf2 for access method hash
- drop cascades to text search dictionary alt_ts_dict3
- drop cascades to text search dictionary alt_ts_dict4
- drop cascades to text search dictionary alt_ts_dict2
- drop cascades to text search configuration alt_ts_conf3
- drop cascades to text search configuration alt_ts_conf4
- drop cascades to text search configuration alt_ts_conf2
- drop cascades to text search template alt_ts_temp3
- drop cascades to text search template alt_ts_temp2
- drop cascades to text search parser alt_ts_prs3
- drop cascades to text search parser alt_ts_prs2
  DROP SCHEMA alt_nsp2 CASCADE;
- NOTICE:  drop cascades to 9 other objects
- DETAIL:  drop cascades to function alt_nsp2.alt_func2(integer)
- drop cascades to function alt_nsp2.alt_agg2(integer)
- drop cascades to conversion alt_conv2
- drop cascades to operator alt_nsp2.@-@(integer,integer)
- drop cascades to operator family alt_nsp2.alt_opf2 for access method hash
- drop cascades to text search dictionary alt_ts_dict2
- drop cascades to text search configuration alt_ts_conf2
- drop cascades to text search template alt_ts_temp2
- drop cascades to text search parser alt_ts_prs2
  DROP USER regress_alter_user1;
  DROP USER regress_alter_user2;
  DROP USER regress_alter_user3;
index c54f6753d078d463f59042e2c5089d5b3c8b4629,8aadbb88a348571dd2fe6e22d1e2a2f72380d35b..0a1068146acabd694be2907b6114637ad8a6c17e
@@@ -365,6 -413,29 +414,26 @@@ ALTER TABLE tmp7 ADD CONSTRAINT identit
  ALTER TABLE tmp3 ADD CONSTRAINT IDENTITY check (b = boo(b)) NOT VALID;
  NOTICE:  merging constraint "identity" with inherited definition
  ALTER TABLE tmp3 VALIDATE CONSTRAINT identity;
 -NOTICE:  boo: 16
 -NOTICE:  boo: 20
+ -- A NO INHERIT constraint should not be looked for in children during VALIDATE CONSTRAINT
+ create table parent_noinh_convalid (a int);
+ create table child_noinh_convalid () inherits (parent_noinh_convalid);
+ insert into parent_noinh_convalid values (1);
+ insert into child_noinh_convalid values (1);
+ alter table parent_noinh_convalid add constraint check_a_is_2 check (a = 2) no inherit not valid;
+ -- fail, because of the row in parent
+ alter table parent_noinh_convalid validate constraint check_a_is_2;
+ ERROR:  check constraint "check_a_is_2" is violated by some row
+ delete from only parent_noinh_convalid;
+ -- ok (parent itself contains no violating rows)
+ alter table parent_noinh_convalid validate constraint check_a_is_2;
+ select convalidated from pg_constraint where conrelid = 'parent_noinh_convalid'::regclass and conname = 'check_a_is_2';
+  convalidated 
+ --------------
+  t
+ (1 row)
+ -- cleanup
+ drop table parent_noinh_convalid, child_noinh_convalid;
  -- Try (and fail) to create constraint from tmp5(a) to tmp4(a) - unique constraint on
  -- tmp4 is a,b
  ALTER TABLE tmp5 add constraint tmpconstr foreign key(a) references tmp4(a) match full;
@@@ -531,7 -601,68 +600,67 @@@ ERROR:  Hash/Modulo distribution colum
  -- As does this...
  ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest2, ftest1)
       references pktable(ptest1, ptest2);
 -ERROR:  foreign key constraint "fktable_ftest2_fkey" cannot be implemented
 -DETAIL:  Key columns "ftest2" and "ptest1" are of incompatible types: inet and integer.
 +ERROR:  Hash/Modulo distribution column does not refer to hash/modulo distribution column in referenced table.
+ DROP TABLE FKTABLE;
+ DROP TABLE PKTABLE;
+ -- Test that ALTER CONSTRAINT updates trigger deferrability properly
+ CREATE TEMP TABLE PKTABLE (ptest1 int primary key);
+ CREATE TEMP TABLE FKTABLE (ftest1 int);
+ ALTER TABLE FKTABLE ADD CONSTRAINT fknd FOREIGN KEY(ftest1) REFERENCES pktable
+   ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fkdd FOREIGN KEY(ftest1) REFERENCES pktable
+   ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fkdi FOREIGN KEY(ftest1) REFERENCES pktable
+   ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY IMMEDIATE;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fknd2 FOREIGN KEY(ftest1) REFERENCES pktable
+   ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED;
+ ALTER TABLE FKTABLE ALTER CONSTRAINT fknd2 NOT DEFERRABLE;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fkdd2 FOREIGN KEY(ftest1) REFERENCES pktable
+   ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE;
+ ALTER TABLE FKTABLE ALTER CONSTRAINT fkdd2 DEFERRABLE INITIALLY DEFERRED;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fkdi2 FOREIGN KEY(ftest1) REFERENCES pktable
+   ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE;
+ ALTER TABLE FKTABLE ALTER CONSTRAINT fkdi2 DEFERRABLE INITIALLY IMMEDIATE;
+ SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred
+ FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint
+ WHERE tgrelid = 'pktable'::regclass
+ ORDER BY 1,2,3;
+  conname |         tgfoid         | tgtype | tgdeferrable | tginitdeferred 
+ ---------+------------------------+--------+--------------+----------------
+  fkdd    | "RI_FKey_cascade_del"  |      9 | f            | f
+  fkdd    | "RI_FKey_noaction_upd" |     17 | t            | t
+  fkdd2   | "RI_FKey_cascade_del"  |      9 | f            | f
+  fkdd2   | "RI_FKey_noaction_upd" |     17 | t            | t
+  fkdi    | "RI_FKey_cascade_del"  |      9 | f            | f
+  fkdi    | "RI_FKey_noaction_upd" |     17 | t            | f
+  fkdi2   | "RI_FKey_cascade_del"  |      9 | f            | f
+  fkdi2   | "RI_FKey_noaction_upd" |     17 | t            | f
+  fknd    | "RI_FKey_cascade_del"  |      9 | f            | f
+  fknd    | "RI_FKey_noaction_upd" |     17 | f            | f
+  fknd2   | "RI_FKey_cascade_del"  |      9 | f            | f
+  fknd2   | "RI_FKey_noaction_upd" |     17 | f            | f
+ (12 rows)
+ SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred
+ FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint
+ WHERE tgrelid = 'fktable'::regclass
+ ORDER BY 1,2,3;
+  conname |       tgfoid        | tgtype | tgdeferrable | tginitdeferred 
+ ---------+---------------------+--------+--------------+----------------
+  fkdd    | "RI_FKey_check_ins" |      5 | t            | t
+  fkdd    | "RI_FKey_check_upd" |     17 | t            | t
+  fkdd2   | "RI_FKey_check_ins" |      5 | t            | t
+  fkdd2   | "RI_FKey_check_upd" |     17 | t            | t
+  fkdi    | "RI_FKey_check_ins" |      5 | t            | f
+  fkdi    | "RI_FKey_check_upd" |     17 | t            | f
+  fkdi2   | "RI_FKey_check_ins" |      5 | t            | f
+  fkdi2   | "RI_FKey_check_upd" |     17 | t            | f
+  fknd    | "RI_FKey_check_ins" |      5 | f            | f
+  fknd    | "RI_FKey_check_upd" |     17 | f            | f
+  fknd2   | "RI_FKey_check_ins" |      5 | f            | f
+  fknd2   | "RI_FKey_check_upd" |     17 | f            | f
+ (12 rows)
  -- temp tables should go away by themselves, need not drop them.
  -- test check constraint adding
  create table atacc1 ( test int );
index 4a9f2ee1fda13daa1ab0bdf0394445e1bc2ec6a9,c730563f0386ca593c0f894a9f596de86ab833e2..15ef18a0d31c3c0c395618f547b29ba89a232cbb
@@@ -1206,13 -1288,15 +1294,20 @@@ select 33 = all ('{33,null,33}')
   
  (1 row)
  
+ -- nulls later in the bitmap
+ SELECT -1 != ALL(ARRAY(SELECT NULLIF(g.i, 900) FROM generate_series(1,1000) g(i)));
+  ?column? 
+ ----------
+  
+ (1 row)
  -- test indexes on arrays
 -create temp table arr_tbl (f1 int[] unique);
 +-- PGXCTODO: related to feature request 3520520, this distribution type is changed
 +-- to replication. As integer arrays are no available distribution types, this table
 +-- should use roundrobin distribution if nothing is specified but roundrobin
 +-- distribution cannot be safely used to check constraints on remote nodes.
 +-- When global constraints are supported, this replication distribution should be removed.
 +create temp table arr_tbl (f1 int[] unique) distribute by replication;
  insert into arr_tbl values ('{1,2,3}');
  insert into arr_tbl values ('{1,2}');
  -- failure expected:
Simple merge
Simple merge
Simple merge
Simple merge
index 5977563902d89eed769b911173a0822c7b9012dd,65e9c626b3597c9afeaaf733d3e9a4636991f2de..e61918c8c610c1a225a3637a9888e950d081bbd1
@@@ -485,19 -438,21 +485,19 @@@ begi
  end $$ language plpgsql immutable;
  alter table check_con_tbl add check (check_con_function(check_con_tbl.*));
  \d+ check_con_tbl
-                     Table "public.check_con_tbl"
-  Column |  Type   | Modifiers | Storage | Stats target | Description 
- --------+---------+-----------+---------+--------------+-------------
-  f1     | integer |           | plain   |              | 
+                                Table "public.check_con_tbl"
+  Column |  Type   | Collation | Nullable | Default | Storage | Stats target | Description 
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+  f1     | integer |           |          |         | plain   |              | 
  Check constraints:
      "check_con_tbl_check" CHECK (check_con_function(check_con_tbl.*))
 +Distribute By: HASH(f1)
 +Location Nodes: ALL DATANODES
  
  copy check_con_tbl from stdin;
 -NOTICE:  input = {"f1":1}
 -NOTICE:  input = {"f1":null}
  copy check_con_tbl from stdin;
 -NOTICE:  input = {"f1":0}
  ERROR:  new row for relation "check_con_tbl" violates check constraint "check_con_tbl_check"
  DETAIL:  Failing row contains (0).
 -CONTEXT:  COPY check_con_tbl, line 1: "0"
  select * from check_con_tbl;
   f1 
  ----
     
  (2 rows)
  
+ -- test with RLS enabled.
+ CREATE ROLE regress_rls_copy_user;
+ CREATE ROLE regress_rls_copy_user_colperms;
+ CREATE TABLE rls_t1 (a int, b int, c int);
+ COPY rls_t1 (a, b, c) from stdin;
+ CREATE POLICY p1 ON rls_t1 FOR SELECT USING (a % 2 = 0);
+ ALTER TABLE rls_t1 ENABLE ROW LEVEL SECURITY;
+ ALTER TABLE rls_t1 FORCE ROW LEVEL SECURITY;
+ GRANT SELECT ON TABLE rls_t1 TO regress_rls_copy_user;
+ GRANT SELECT (a, b) ON TABLE rls_t1 TO regress_rls_copy_user_colperms;
+ -- all columns
+ COPY rls_t1 TO stdout;
+ 1     4       1
+ 2     3       2
+ 3     2       3
+ 4     1       4
+ COPY rls_t1 (a, b, c) TO stdout;
+ 1     4       1
+ 2     3       2
+ 3     2       3
+ 4     1       4
+ -- subset of columns
+ COPY rls_t1 (a) TO stdout;
+ 1
+ 2
+ 3
+ 4
+ COPY rls_t1 (a, b) TO stdout;
+ 1     4
+ 2     3
+ 3     2
+ 4     1
+ -- column reordering
+ COPY rls_t1 (b, a) TO stdout;
+ 4     1
+ 3     2
+ 2     3
+ 1     4
+ SET SESSION AUTHORIZATION regress_rls_copy_user;
+ -- all columns
+ COPY rls_t1 TO stdout;
+ 2     3       2
+ 4     1       4
+ COPY rls_t1 (a, b, c) TO stdout;
+ 2     3       2
+ 4     1       4
+ -- subset of columns
+ COPY rls_t1 (a) TO stdout;
+ 2
+ 4
+ COPY rls_t1 (a, b) TO stdout;
+ 2     3
+ 4     1
+ -- column reordering
+ COPY rls_t1 (b, a) TO stdout;
+ 3     2
+ 1     4
+ RESET SESSION AUTHORIZATION;
+ SET SESSION AUTHORIZATION regress_rls_copy_user_colperms;
+ -- attempt all columns (should fail)
+ COPY rls_t1 TO stdout;
+ ERROR:  permission denied for relation rls_t1
+ COPY rls_t1 (a, b, c) TO stdout;
+ ERROR:  permission denied for relation rls_t1
+ -- try to copy column with no privileges (should fail)
+ COPY rls_t1 (c) TO stdout;
+ ERROR:  permission denied for relation rls_t1
+ -- subset of columns (should succeed)
+ COPY rls_t1 (a) TO stdout;
+ 2
+ 4
+ COPY rls_t1 (a, b) TO stdout;
+ 2     3
+ 4     1
+ RESET SESSION AUTHORIZATION;
+ -- test with INSTEAD OF INSERT trigger on a view
+ CREATE TABLE instead_of_insert_tbl(id serial, name text);
+ CREATE VIEW instead_of_insert_tbl_view AS SELECT ''::text AS str;
+ COPY instead_of_insert_tbl_view FROM stdin; -- fail
+ ERROR:  cannot copy to view "instead_of_insert_tbl_view"
+ HINT:  To enable copying to a view, provide an INSTEAD OF INSERT trigger.
+ CREATE FUNCTION fun_instead_of_insert_tbl() RETURNS trigger AS $$
+ BEGIN
+   INSERT INTO instead_of_insert_tbl (name) VALUES (NEW.str);
+   RETURN NULL;
+ END;
+ $$ LANGUAGE plpgsql;
+ CREATE TRIGGER trig_instead_of_insert_tbl_view
+   INSTEAD OF INSERT ON instead_of_insert_tbl_view
+   FOR EACH ROW EXECUTE PROCEDURE fun_instead_of_insert_tbl();
+ COPY instead_of_insert_tbl_view FROM stdin;
+ SELECT * FROM instead_of_insert_tbl;
+  id | name  
+ ----+-------
+   1 | test1
+ (1 row)
+ -- clean up
  DROP TABLE forcetest;
  DROP TABLE vistest;
 +ERROR:  table "vistest" does not exist
  DROP FUNCTION truncate_in_subxact();
 +ERROR:  function truncate_in_subxact() does not exist
  DROP TABLE x, y;
+ DROP TABLE rls_t1 CASCADE;
+ DROP ROLE regress_rls_copy_user;
+ DROP ROLE regress_rls_copy_user_colperms;
  DROP FUNCTION fn_x_before();
  DROP FUNCTION fn_x_after();
+ DROP TABLE instead_of_insert_tbl;
+ DROP VIEW instead_of_insert_tbl_view;
+ DROP FUNCTION fun_instead_of_insert_tbl();
index 5de986faeb6ad7115832509b3f86e05f316ad9e3,26cd05933cab1d9740d25a0e4067986765fc9c0c..bccb6920dd20ea511f2aac5931742384773c70b1
@@@ -2481,19 -2346,16 +2477,18 @@@ DROP TABLE unlogged_hash_table
  -- maintenance_work_mem setting and fillfactor:
  SET maintenance_work_mem = '1MB';
  CREATE INDEX hash_tuplesort_idx ON tenk1 USING hash (stringu1 name_ops) WITH (fillfactor = 10);
- WARNING:  hash indexes are not WAL-logged and their use is discouraged
  EXPLAIN (COSTS OFF)
  SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA';
 -                      QUERY PLAN                       
 --------------------------------------------------------
 - Aggregate
 -   ->  Bitmap Heap Scan on tenk1
 -         Recheck Cond: (stringu1 = 'TVAAAA'::name)
 -         ->  Bitmap Index Scan on hash_tuplesort_idx
 -               Index Cond: (stringu1 = 'TVAAAA'::name)
 -(5 rows)
 +                            QUERY PLAN                             
 +-------------------------------------------------------------------
 + Finalize Aggregate
 +   ->  Remote Subquery Scan on all (datanode_1,datanode_2)
 +         ->  Partial Aggregate
 +               ->  Bitmap Heap Scan on tenk1
 +                     Recheck Cond: (stringu1 = 'TVAAAA'::name)
 +                     ->  Bitmap Index Scan on hash_tuplesort_idx
 +                           Index Cond: (stringu1 = 'TVAAAA'::name)
 +(7 rows)
  
  SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA';
   count 
@@@ -2592,22 -2445,36 +2587,22 @@@ REINDEX TABLE concur_heap
  DELETE FROM concur_heap WHERE f1 = 'b';
  VACUUM FULL concur_heap;
  \d concur_heap
- Table "public.concur_heap"
-  Column | Type | Modifiers 
- --------+------+-----------
-  f1     | text | 
-  f2     | text | 
           Table "public.concur_heap"
+  Column | Type | Collation | Nullable | Default 
+ --------+------+-----------+----------+---------
+  f1     | text |           |          | 
+  f2     | text |           |          | 
  Indexes:
 -    "concur_index2" UNIQUE, btree (f1)
 -    "concur_index3" UNIQUE, btree (f2) INVALID
 -    "concur_heap_expr_idx" btree ((f2 || f1))
 -    "concur_index1" btree (f2, f1)
 -    "concur_index4" btree (f2) WHERE f1 = 'a'::text
 -    "concur_index5" btree (f2) WHERE f1 = 'x'::text
      "std_index" btree (f2)
  
  REINDEX TABLE concur_heap;
  \d concur_heap
- Table "public.concur_heap"
-  Column | Type | Modifiers 
- --------+------+-----------
-  f1     | text | 
-  f2     | text | 
           Table "public.concur_heap"
+  Column | Type | Collation | Nullable | Default 
+ --------+------+-----------+----------+---------
+  f1     | text |           |          | 
+  f2     | text |           |          | 
  Indexes:
 -    "concur_index2" UNIQUE, btree (f1)
 -    "concur_index3" UNIQUE, btree (f2)
 -    "concur_heap_expr_idx" btree ((f2 || f1))
 -    "concur_index1" btree (f2, f1)
 -    "concur_index4" btree (f2) WHERE f1 = 'a'::text
 -    "concur_index5" btree (f2) WHERE f1 = 'x'::text
      "std_index" btree (f2)
  
  --
@@@ -2627,22 -2492,16 +2622,22 @@@ ERROR:  DROP INDEX CONCURRENTLY cannot 
  ROLLBACK;
  -- successes
  DROP INDEX CONCURRENTLY IF EXISTS "concur_index3";
 +NOTICE:  index "concur_index3" does not exist, skipping
 +ERROR:  DROP INDEX CONCURRENTLY cannot run inside a transaction block
  DROP INDEX CONCURRENTLY "concur_index4";
 +ERROR:  index "concur_index4" does not exist
  DROP INDEX CONCURRENTLY "concur_index5";
 +ERROR:  index "concur_index5" does not exist
  DROP INDEX CONCURRENTLY "concur_index1";
 +ERROR:  index "concur_index1" does not exist
  DROP INDEX CONCURRENTLY "concur_heap_expr_idx";
 +ERROR:  index "concur_heap_expr_idx" does not exist
  \d concur_heap
- Table "public.concur_heap"
-  Column | Type | Modifiers 
- --------+------+-----------
-  f1     | text | 
-  f2     | text | 
           Table "public.concur_heap"
+  Column | Type | Collation | Nullable | Default 
+ --------+------+-----------+----------+---------
+  f1     | text |           |          | 
+  f2     | text |           |          | 
  Indexes:
      "std_index" btree (f2)
  
@@@ -3048,14 -2941,54 +3043,56 @@@ RESET enable_indexonlyscan
  --
  explain (costs off)
    select * from tenk1 where (thousand, tenthous) in ((1,1001), (null,null));
 -                      QUERY PLAN                      
 -------------------------------------------------------
 - Index Scan using tenk1_thous_tenthous on tenk1
 -   Index Cond: ((thousand = 1) AND (tenthous = 1001))
 -(2 rows)
 +                         QUERY PLAN                         
 +------------------------------------------------------------
 + Remote Fast Query Execution
 +   Node/s: datanode_1, datanode_2
 +   ->  Index Scan using tenk1_thous_tenthous on tenk1
 +         Index Cond: ((thousand = 1) AND (tenthous = 1001))
 +(4 rows)
  
+ --
+ -- Check matching of boolean index columns to WHERE conditions and sort keys
+ --
+ create temp table boolindex (b bool, i int, unique(b, i), junk float);
+ explain (costs off)
+   select * from boolindex order by b, i limit 10;
+                       QUERY PLAN                       
+ -------------------------------------------------------
+  Limit
+    ->  Index Scan using boolindex_b_i_key on boolindex
+ (2 rows)
+ explain (costs off)
+   select * from boolindex where b order by i limit 10;
+                       QUERY PLAN                       
+ -------------------------------------------------------
+  Limit
+    ->  Index Scan using boolindex_b_i_key on boolindex
+          Index Cond: (b = true)
+          Filter: b
+ (4 rows)
+ explain (costs off)
+   select * from boolindex where b = true order by i desc limit 10;
+                            QUERY PLAN                           
+ ----------------------------------------------------------------
+  Limit
+    ->  Index Scan Backward using boolindex_b_i_key on boolindex
+          Index Cond: (b = true)
+          Filter: b
+ (4 rows)
+ explain (costs off)
+   select * from boolindex where not b order by i limit 10;
+                       QUERY PLAN                       
+ -------------------------------------------------------
+  Limit
+    ->  Index Scan using boolindex_b_i_key on boolindex
+          Index Cond: (b = false)
+          Filter: (NOT b)
+ (4 rows)
  --
  -- REINDEX (VERBOSE)
  --
index 4c60b36f61e55ce9faef16a0f0919d5d765811b1,3f405c94ce8a19b5250f57db7744b96f64a9a98b..595817bf85016540bfee97503500ad5cfb06cfc4
@@@ -66,7 -66,54 +66,54 @@@ SELECT * FROM inhg; /* Two records wit
  (2 rows)
  
  DROP TABLE inhg;
 -CREATE TABLE inhg (x text, LIKE inhx INCLUDING INDEXES, y text); /* copies indexes */
+ CREATE TABLE test_like_id_1 (a int GENERATED ALWAYS AS IDENTITY, b text);
+ \d test_like_id_1
+                      Table "public.test_like_id_1"
+  Column |  Type   | Collation | Nullable |           Default            
+ --------+---------+-----------+----------+------------------------------
+  a      | integer |           | not null | generated always as identity
+  b      | text    |           |          | 
+ INSERT INTO test_like_id_1 (b) VALUES ('b1');
+ SELECT * FROM test_like_id_1;
+  a | b  
+ ---+----
+  1 | b1
+ (1 row)
+ CREATE TABLE test_like_id_2 (LIKE test_like_id_1);
+ \d test_like_id_2
+            Table "public.test_like_id_2"
+  Column |  Type   | Collation | Nullable | Default 
+ --------+---------+-----------+----------+---------
+  a      | integer |           | not null | 
+  b      | text    |           |          | 
+ INSERT INTO test_like_id_2 (b) VALUES ('b2');
+ ERROR:  null value in column "a" violates not-null constraint
+ DETAIL:  Failing row contains (null, b2).
+ SELECT * FROM test_like_id_2;  -- identity was not copied
+  a | b 
+ ---+---
+ (0 rows)
+ CREATE TABLE test_like_id_3 (LIKE test_like_id_1 INCLUDING IDENTITY);
+ \d test_like_id_3
+                      Table "public.test_like_id_3"
+  Column |  Type   | Collation | Nullable |           Default            
+ --------+---------+-----------+----------+------------------------------
+  a      | integer |           | not null | generated always as identity
+  b      | text    |           |          | 
+ INSERT INTO test_like_id_3 (b) VALUES ('b3');
+ SELECT * FROM test_like_id_3;  -- identity was copied and applied
+  a | b  
+ ---+----
+  1 | b3
+ (1 row)
+ DROP TABLE test_like_id_1, test_like_id_2, test_like_id_3;
 +CREATE TABLE inhg (x text, LIKE inhx INCLUDING INDEXES, y text) DISTRIBUTE BY REPLICATION; /* copies indexes */
  INSERT INTO inhg VALUES (5, 10);
  INSERT INTO inhg VALUES (20, 10); -- should fail
  ERROR:  duplicate key value violates unique constraint "inhg_pkey"
Simple merge
Simple merge
index 4be2ba05470cbffa92b4db635c22b31ff02a191c,a96b2a1b07c36bfa0c9888c9e30631134fe68637..2082b8a77ba5a717fe8b811e77e3af7aef1efea4
@@@ -291,34 -260,69 +291,32 @@@ explain (costs off
       union all
       select ff + 4 as x from ec1) as ss2
    where ss1.x = ec1.f1 and ss1.x = ss2.x and ec1.ff = 42::int8;
 -                             QUERY PLAN                              
 ----------------------------------------------------------------------
 +                                 QUERY PLAN                                  
 +-----------------------------------------------------------------------------
   Nested Loop
 -   ->  Nested Loop
 -         ->  Index Scan using ec1_pkey on ec1
 -               Index Cond: (ff = '42'::bigint)
 +   Join Filter: (x = x)
 +   ->  Remote Subquery Scan on all (datanode_1,datanode_2)
           ->  Append
 -               ->  Index Scan using ec1_expr2 on ec1 ec1_1
 -                     Index Cond: (((ff + 2) + 1) = ec1.f1)
 -               ->  Index Scan using ec1_expr3 on ec1 ec1_2
 -                     Index Cond: (((ff + 3) + 1) = ec1.f1)
 -               ->  Index Scan using ec1_expr4 on ec1 ec1_3
 -                     Index Cond: ((ff + 4) = ec1.f1)
 -   ->  Append
 -         ->  Index Scan using ec1_expr2 on ec1 ec1_4
 -               Index Cond: (((ff + 2) + 1) = (((ec1_1.ff + 2) + 1)))
 -         ->  Index Scan using ec1_expr3 on ec1 ec1_5
 -               Index Cond: (((ff + 3) + 1) = (((ec1_1.ff + 2) + 1)))
 -         ->  Index Scan using ec1_expr4 on ec1 ec1_6
 -               Index Cond: ((ff + 4) = (((ec1_1.ff + 2) + 1)))
 -(18 rows)
 -
 --- let's try that as a mergejoin
 -set enable_mergejoin = on;
 -set enable_nestloop = off;
 -explain (costs off)
 -  select * from ec1,
 -    (select ff + 1 as x from
 -       (select ff + 2 as ff from ec1
 -        union all
 -        select ff + 3 as ff from ec1) ss0
 -     union all
 -     select ff + 4 as x from ec1) as ss1,
 -    (select ff + 1 as x from
 -       (select ff + 2 as ff from ec1
 -        union all
 -        select ff + 3 as ff from ec1) ss0
 -     union all
 -     select ff + 4 as x from ec1) as ss2
 -  where ss1.x = ec1.f1 and ss1.x = ss2.x and ec1.ff = 42::int8;
 -                           QUERY PLAN                            
 ------------------------------------------------------------------
 - Merge Join
 -   Merge Cond: ((((ec1_4.ff + 2) + 1)) = (((ec1_1.ff + 2) + 1)))
 -   ->  Merge Append
 -         Sort Key: (((ec1_4.ff + 2) + 1))
 -         ->  Index Scan using ec1_expr2 on ec1 ec1_4
 -         ->  Index Scan using ec1_expr3 on ec1 ec1_5
 -         ->  Index Scan using ec1_expr4 on ec1 ec1_6
 +               ->  Seq Scan on ec1 ec1_4
 +               ->  Seq Scan on ec1 ec1_5
 +               ->  Seq Scan on ec1 ec1_6
     ->  Materialize
 -         ->  Merge Join
 -               Merge Cond: ((((ec1_1.ff + 2) + 1)) = ec1.f1)
 -               ->  Merge Append
 -                     Sort Key: (((ec1_1.ff + 2) + 1))
 -                     ->  Index Scan using ec1_expr2 on ec1 ec1_1
 -                     ->  Index Scan using ec1_expr3 on ec1 ec1_2
 -                     ->  Index Scan using ec1_expr4 on ec1 ec1_3
 -               ->  Sort
 -                     Sort Key: ec1.f1 USING <
 +         ->  Nested Loop
 +               Join Filter: (x = ec1.f1)
 +               ->  Remote Subquery Scan on all (datanode_1)
                       ->  Index Scan using ec1_pkey on ec1
                             Index Cond: (ff = '42'::bigint)
 +               ->  Materialize
 +                     ->  Remote Subquery Scan on all (datanode_1,datanode_2)
 +                           ->  Append
 +                                 ->  Seq Scan on ec1 ec1_1
 +                                 ->  Seq Scan on ec1 ec1_2
 +                                 ->  Seq Scan on ec1 ec1_3
  (19 rows)
  
- -- excluding as XL does not support complex queries
- -- with 'union all'
 +-- let's try that as a mergejoin
 +set enable_mergejoin = on;
 +set enable_nestloop = off;
  -- check partially indexed scan
  set enable_nestloop = on;
  set enable_mergejoin = off;
index f025a8bb9152f4e251abfeba8a78165a3510c47c,906dcb8b319bad03f85563c580a087d1bd71f6eb..085eb207b91a2faf26960dc463bc435d03dab485
@@@ -80,13 -78,8 +80,10 @@@ LINE 2:    execute procedure test_event
  create event trigger regress_event_trigger2 on ddl_command_start
     when tag in ('create table', 'CREATE FUNCTION')
     execute procedure test_event_trigger();
 +ERROR:  EVENT TRIGGER not yet supported in Postgres-XL
  -- OK
  comment on event trigger regress_event_trigger is 'test comment';
- -- should fail, event triggers are not schema objects
- comment on event trigger wrong.regress_event_trigger is 'test comment';
- ERROR:  event trigger name cannot be qualified
 +ERROR:  event trigger "regress_event_trigger" does not exist
  -- drop as non-superuser should fail
  create role regress_evt_user;
  set role regress_evt_user;
index c668e8e5582f99a2e9309b5bb0afe2230886b2ae,699309b3b2db0dcf2e14ed6570a92e86ed681050..de22510740617ded55a5b047df598f6cd02f7712
@@@ -186,67 -194,80 +186,71 @@@ ERROR:  foreign-data wrapper "nonexiste
  DROP FOREIGN DATA WRAPPER IF EXISTS nonexistent;
  NOTICE:  foreign-data wrapper "nonexistent" does not exist, skipping
  \dew+
 -                                                        List of foreign-data wrappers
 -    Name    |           Owner           | Handler |        Validator         | Access privileges |         FDW Options          | Description 
 -------------+---------------------------+---------+--------------------------+-------------------+------------------------------+-------------
 - dummy      | regress_foreign_data_user | -       | -                        |                   |                              | useless
 - foo        | regress_test_role_super   | -       | -                        |                   | (b '3', c '4', a '2', d '5') | 
 - postgresql | regress_foreign_data_user | -       | postgresql_fdw_validator |                   |                              | 
 -(3 rows)
 +                           List of foreign-data wrappers
 + Name | Owner | Handler | Validator | Access privileges | FDW Options | Description 
 +------+-------+---------+-----------+-------------------+-------------+-------------
 +(0 rows)
  
  DROP ROLE regress_test_role_super;                          -- ERROR
 -ERROR:  role "regress_test_role_super" cannot be dropped because some objects depend on it
 -DETAIL:  owner of foreign-data wrapper foo
  SET ROLE regress_test_role_super;
 +ERROR:  role "regress_test_role_super" does not exist
  DROP FOREIGN DATA WRAPPER foo;
 +ERROR:  foreign-data wrapper "foo" does not exist
  RESET ROLE;
  DROP ROLE regress_test_role_super;
 +ERROR:  role "regress_test_role_super" does not exist
  \dew+
 -                                                List of foreign-data wrappers
 -    Name    |           Owner           | Handler |        Validator         | Access privileges | FDW Options | Description 
 -------------+---------------------------+---------+--------------------------+-------------------+-------------+-------------
 - dummy      | regress_foreign_data_user | -       | -                        |                   |             | useless
 - postgresql | regress_foreign_data_user | -       | postgresql_fdw_validator |                   |             | 
 -(2 rows)
 +                           List of foreign-data wrappers
 + Name | Owner | Handler | Validator | Access privileges | FDW Options | Description 
 +------+-------+---------+-----------+-------------------+-------------+-------------
 +(0 rows)
  
  CREATE FOREIGN DATA WRAPPER foo;
 +ERROR:  Postgres-XL does not support FOREIGN DATA WRAPPER yet
 +DETAIL:  The feature is not currently supported
  CREATE SERVER s1 FOREIGN DATA WRAPPER foo;
 +ERROR:  Postgres-XL does not support SERVER yet
 +DETAIL:  The feature is not currently supported
  COMMENT ON SERVER s1 IS 'foreign server';
 +ERROR:  server "s1" does not exist
  CREATE USER MAPPING FOR current_user SERVER s1;
 +ERROR:  Postgres-XL does not support USER MAPPING yet
 +DETAIL:  The feature is not currently supported
+ CREATE USER MAPPING FOR current_user SERVER s1;                               -- ERROR
+ ERROR:  user mapping for "regress_foreign_data_user" already exists for server s1
+ CREATE USER MAPPING IF NOT EXISTS FOR current_user SERVER s1; -- NOTICE
+ NOTICE:  user mapping for "regress_foreign_data_user" already exists for server s1, skipping
  \dew+
 -                                                List of foreign-data wrappers
 -    Name    |           Owner           | Handler |        Validator         | Access privileges | FDW Options | Description 
 -------------+---------------------------+---------+--------------------------+-------------------+-------------+-------------
 - dummy      | regress_foreign_data_user | -       | -                        |                   |             | useless
 - foo        | regress_foreign_data_user | -       | -                        |                   |             | 
 - postgresql | regress_foreign_data_user | -       | postgresql_fdw_validator |                   |             | 
 -(3 rows)
 +                           List of foreign-data wrappers
 + Name | Owner | Handler | Validator | Access privileges | FDW Options | Description 
 +------+-------+---------+-----------+-------------------+-------------+-------------
 +(0 rows)
  
  \des+
 -                                                   List of foreign servers
 - Name |           Owner           | Foreign-data wrapper | Access privileges | Type | Version | FDW Options |  Description   
 -------+---------------------------+----------------------+-------------------+------+---------+-------------+----------------
 - s1   | regress_foreign_data_user | foo                  |                   |      |         |             | foreign server
 -(1 row)
 +                                       List of foreign servers
 + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW Options | Description 
 +------+-------+----------------------+-------------------+------+---------+-------------+-------------
 +(0 rows)
  
  \deu+
 -              List of user mappings
 - Server |         User name         | FDW Options 
 ---------+---------------------------+-------------
 - s1     | regress_foreign_data_user | 
 -(1 row)
 +      List of user mappings
 + Server | User name | FDW Options 
 +--------+-----------+-------------
 +(0 rows)
  
  DROP FOREIGN DATA WRAPPER foo;                              -- ERROR
 -ERROR:  cannot drop foreign-data wrapper foo because other objects depend on it
 -DETAIL:  server s1 depends on foreign-data wrapper foo
 -user mapping for regress_foreign_data_user on server s1 depends on server s1
 -HINT:  Use DROP ... CASCADE to drop the dependent objects too.
 +ERROR:  foreign-data wrapper "foo" does not exist
  SET ROLE regress_test_role;
  DROP FOREIGN DATA WRAPPER foo CASCADE;                      -- ERROR
 -ERROR:  must be owner of foreign-data wrapper foo
 +ERROR:  foreign-data wrapper "foo" does not exist
  RESET ROLE;
  DROP FOREIGN DATA WRAPPER foo CASCADE;
 -NOTICE:  drop cascades to 2 other objects
 -DETAIL:  drop cascades to server s1
 -drop cascades to user mapping for regress_foreign_data_user on server s1
 +ERROR:  foreign-data wrapper "foo" does not exist
  \dew+
 -                                                List of foreign-data wrappers
 -    Name    |           Owner           | Handler |        Validator         | Access privileges | FDW Options | Description 
 -------------+---------------------------+---------+--------------------------+-------------------+-------------+-------------
 - dummy      | regress_foreign_data_user | -       | -                        |                   |             | useless
 - postgresql | regress_foreign_data_user | -       | postgresql_fdw_validator |                   |             | 
 -(2 rows)
 +                           List of foreign-data wrappers
 + Name | Owner | Handler | Validator | Access privileges | FDW Options | Description 
 +------+-------+---------+-----------+-------------------+-------------+-------------
 +(0 rows)
  
  \des+
                                         List of foreign servers
@@@ -996,69 -1153,108 +1000,124 @@@ ALTER FOREIGN DATA WRAPPER foo OPTIONS 
  ERROR:  permission denied to alter foreign-data wrapper "foo"
  HINT:  Must be superuser to alter a foreign-data wrapper.
  DROP FOREIGN DATA WRAPPER foo;                                  -- ERROR
 -ERROR:  must be owner of foreign-data wrapper foo
 +ERROR:  foreign-data wrapper "foo" does not exist
  GRANT USAGE ON FOREIGN DATA WRAPPER postgresql TO regress_test_role; -- WARNING
 -WARNING:  no privileges were granted for "postgresql"
 +ERROR:  foreign-data wrapper "postgresql" does not exist
  GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role;
 +ERROR:  foreign-data wrapper "foo" does not exist
  CREATE SERVER s9 FOREIGN DATA WRAPPER postgresql;
 +ERROR:  Postgres-XL does not support SERVER yet
 +DETAIL:  The feature is not currently supported
  ALTER SERVER s6 VERSION '0.5';                                  -- ERROR
 -ERROR:  must be owner of foreign server s6
 +ERROR:  server "s6" does not exist
  DROP SERVER s6;                                                 -- ERROR
 -ERROR:  must be owner of foreign server s6
 +ERROR:  server "s6" does not exist
  GRANT USAGE ON FOREIGN SERVER s6 TO regress_test_role;          -- ERROR
 -ERROR:  permission denied for foreign server s6
 +ERROR:  server "s6" does not exist
  GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role;
 +ERROR:  server "s9" does not exist
  CREATE USER MAPPING FOR public SERVER s6;                       -- ERROR
 -ERROR:  must be owner of foreign server s6
 +ERROR:  Postgres-XL does not support USER MAPPING yet
 +DETAIL:  The feature is not currently supported
  CREATE USER MAPPING FOR public SERVER s9;
 +ERROR:  Postgres-XL does not support USER MAPPING yet
 +DETAIL:  The feature is not currently supported
  ALTER USER MAPPING FOR regress_test_role SERVER s6 OPTIONS (gotcha 'true'); -- ERROR
 -ERROR:  must be owner of foreign server s6
 +ERROR:  server "s6" does not exist
  DROP USER MAPPING FOR regress_test_role SERVER s6;              -- ERROR
 -ERROR:  must be owner of foreign server s6
 +ERROR:  server "s6" does not exist
  RESET ROLE;
  REVOKE USAGE ON FOREIGN DATA WRAPPER foo FROM regress_unprivileged_role; -- ERROR
 -ERROR:  dependent privileges exist
 -HINT:  Use CASCADE to revoke them too.
 +ERROR:  foreign-data wrapper "foo" does not exist
  REVOKE USAGE ON FOREIGN DATA WRAPPER foo FROM regress_unprivileged_role CASCADE;
 +ERROR:  foreign-data wrapper "foo" does not exist
  SET ROLE regress_unprivileged_role;
  GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role;   -- ERROR
 -ERROR:  permission denied for foreign-data wrapper foo
 +ERROR:  foreign-data wrapper "foo" does not exist
  CREATE SERVER s10 FOREIGN DATA WRAPPER foo;                     -- ERROR
 -ERROR:  permission denied for foreign-data wrapper foo
 +ERROR:  Postgres-XL does not support SERVER yet
 +DETAIL:  The feature is not currently supported
  ALTER SERVER s9 VERSION '1.1';
 +ERROR:  server "s9" does not exist
  GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role;
 +ERROR:  server "s9" does not exist
  CREATE USER MAPPING FOR current_user SERVER s9;
 +ERROR:  Postgres-XL does not support USER MAPPING yet
 +DETAIL:  The feature is not currently supported
  DROP SERVER s9 CASCADE;
 -NOTICE:  drop cascades to 2 other objects
 -DETAIL:  drop cascades to user mapping for public on server s9
 -drop cascades to user mapping for regress_unprivileged_role on server s9
 +ERROR:  server "s9" does not exist
  RESET ROLE;
  CREATE SERVER s9 FOREIGN DATA WRAPPER foo;
 +ERROR:  Postgres-XL does not support SERVER yet
 +DETAIL:  The feature is not currently supported
  GRANT USAGE ON FOREIGN SERVER s9 TO regress_unprivileged_role;
 +ERROR:  server "s9" does not exist
  SET ROLE regress_unprivileged_role;
  ALTER SERVER s9 VERSION '1.2';                                  -- ERROR
 -ERROR:  must be owner of foreign server s9
 +ERROR:  server "s9" does not exist
  GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role;          -- WARNING
 -WARNING:  no privileges were granted for "s9"
 +ERROR:  server "s9" does not exist
  CREATE USER MAPPING FOR current_user SERVER s9;
 +ERROR:  Postgres-XL does not support USER MAPPING yet
 +DETAIL:  The feature is not currently supported
  DROP SERVER s9 CASCADE;                                         -- ERROR
 +ERROR:  server "s9" does not exist
+ ERROR:  must be owner of foreign server s9
+ -- Check visibility of user mapping data
+ SET ROLE regress_test_role;
+ CREATE SERVER s10 FOREIGN DATA WRAPPER foo;
+ CREATE USER MAPPING FOR public SERVER s10 OPTIONS (user 'secret');
+ GRANT USAGE ON FOREIGN SERVER s10 TO regress_unprivileged_role;
+ -- owner of server can see option fields
+ \deu+
+                  List of user mappings
+  Server |         User name         |    FDW Options    
+ --------+---------------------------+-------------------
+  s10    | public                    | ("user" 'secret')
+  s4     | regress_foreign_data_user | 
+  s5     | regress_test_role         | (modified '1')
+  s6     | regress_test_role         | 
+  s8     | public                    | 
+  s8     | regress_foreign_data_user | 
+  s9     | regress_unprivileged_role | 
+  t1     | public                    | (modified '1')
+ (8 rows)
+ RESET ROLE;
+ -- superuser can see option fields
+ \deu+
+                   List of user mappings
+  Server |         User name         |     FDW Options     
+ --------+---------------------------+---------------------
+  s10    | public                    | ("user" 'secret')
+  s4     | regress_foreign_data_user | 
+  s5     | regress_test_role         | (modified '1')
+  s6     | regress_test_role         | 
+  s8     | public                    | 
+  s8     | regress_foreign_data_user | (password 'public')
+  s9     | regress_unprivileged_role | 
+  t1     | public                    | (modified '1')
+ (8 rows)
+ -- unprivileged user cannot see option fields
+ SET ROLE regress_unprivileged_role;
+ \deu+
+               List of user mappings
+  Server |         User name         | FDW Options 
+ --------+---------------------------+-------------
+  s10    | public                    | 
+  s4     | regress_foreign_data_user | 
+  s5     | regress_test_role         | 
+  s6     | regress_test_role         | 
+  s8     | public                    | 
+  s8     | regress_foreign_data_user | 
+  s9     | regress_unprivileged_role | 
+  t1     | public                    | 
+ (8 rows)
  RESET ROLE;
+ DROP SERVER s10 CASCADE;
+ NOTICE:  drop cascades to user mapping for public on server s10
  -- Triggers
  CREATE FUNCTION dummy_trigger() RETURNS TRIGGER AS $$
    BEGIN
@@@ -1075,8 -1269,13 +1134,15 @@@ CREATE TRIGGER trigtest_after_stmt AFTE
  ON foreign_schema.foreign_table_1
  FOR EACH STATEMENT
  EXECUTE PROCEDURE dummy_trigger();
 +ERROR:  Postgres-XL does not support TRIGGER yet
 +DETAIL:  The feature is not currently supported
+ CREATE TRIGGER trigtest_after_stmt_tt AFTER INSERT OR UPDATE OR DELETE -- ERROR
+ ON foreign_schema.foreign_table_1
+ REFERENCING NEW TABLE AS new_table
+ FOR EACH STATEMENT
+ EXECUTE PROCEDURE dummy_trigger();
+ ERROR:  "foreign_table_1" is a foreign table
+ DETAIL:  Triggers on foreign tables cannot have transition tables.
  CREATE TRIGGER trigtest_before_row BEFORE INSERT OR UPDATE OR DELETE
  ON foreign_schema.foreign_table_1
  FOR EACH ROW
@@@ -1118,52 -1307,72 +1184,62 @@@ CREATE TABLE pt1 
  );
  CREATE FOREIGN TABLE ft2 () INHERITS (pt1)
    SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
 +ERROR:  server "s0" does not exist
  \d+ pt1
 -                                    Table "public.pt1"
 - Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
 ---------+---------+-----------+----------+---------+----------+--------------+-------------
 - c1     | integer |           | not null |         | plain    |              | 
 - c2     | text    |           |          |         | extended |              | 
 - c3     | date    |           |          |         | plain    |              | 
 -Child tables: ft2
 +                          Table "public.pt1"
 + Column |  Type   | Modifiers | Storage  | Stats target | Description 
 +--------+---------+-----------+----------+--------------+-------------
 + c1     | integer | not null  | plain    |              | 
 + c2     | text    |           | extended |              | 
 + c3     | date    |           | plain    |              | 
 +Distribute By: HASH(c1)
 +Location Nodes: ALL DATANODES
  
  \d+ ft2
 -                                       Foreign table "public.ft2"
 - Column |  Type   | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
 ---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
 - c1     | integer |           | not null |         |             | plain    |              | 
 - c2     | text    |           |          |         |             | extended |              | 
 - c3     | date    |           |          |         |             | plain    |              | 
 -Server: s0
 -FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
 -Inherits: pt1
 -
  DROP FOREIGN TABLE ft2;
 +ERROR:  foreign table "ft2" does not exist
  \d+ pt1
-                           Table "public.pt1"
-  Column |  Type   | Modifiers | Storage  | Stats target | Description 
- --------+---------+-----------+----------+--------------+-------------
-  c1     | integer | not null  | plain    |              | 
-  c2     | text    |           | extended |              | 
-  c3     | date    |           | plain    |              | 
+                                     Table "public.pt1"
+  Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+  c1     | integer |           | not null |         | plain    |              | 
+  c2     | text    |           |          |         | extended |              | 
+  c3     | date    |           |          |         | plain    |              | 
 +Distribute By: HASH(c1)
 +Location Nodes: ALL DATANODES
  
  CREATE FOREIGN TABLE ft2 (
        c1 integer NOT NULL,
        c2 text,
        c3 date
  ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
 +ERROR:  server "s0" does not exist
  \d+ ft2
 -                                       Foreign table "public.ft2"
 - Column |  Type   | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
 ---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
 - c1     | integer |           | not null |         |             | plain    |              | 
 - c2     | text    |           |          |         |             | extended |              | 
 - c3     | date    |           |          |         |             | plain    |              | 
 -Server: s0
 -FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
 -
  ALTER FOREIGN TABLE ft2 INHERIT pt1;
 +ERROR:  relation "ft2" does not exist
  \d+ pt1
-                           Table "public.pt1"
-  Column |  Type   | Modifiers | Storage  | Stats target | Description 
- --------+---------+-----------+----------+--------------+-------------
-  c1     | integer | not null  | plain    |              | 
-  c2     | text    |           | extended |              | 
-  c3     | date    |           | plain    |              | 
+                                     Table "public.pt1"
+  Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+  c1     | integer |           | not null |         | plain    |              | 
+  c2     | text    |           |          |         | extended |              | 
+  c3     | date    |           |          |         | plain    |              | 
 -Child tables: ft2
 +Distribute By: HASH(c1)
 +Location Nodes: ALL DATANODES
  
  \d+ ft2
+                                        Foreign table "public.ft2"
+  Column |  Type   | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+  c1     | integer |           | not null |         |             | plain    |              | 
+  c2     | text    |           |          |         |             | extended |              | 
+  c3     | date    |           |          |         |             | plain    |              | 
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+ Inherits: pt1
  CREATE TABLE ct3() INHERITS(ft2);
 +ERROR:  relation "ft2" does not exist
  CREATE FOREIGN TABLE ft3 (
        c1 integer NOT NULL,
        c2 text,
@@@ -1281,22 -1610,19 +1357,22 @@@ CREATE FOREIGN TABLE ft2 
        c2 text,
        c3 date
  ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
 +ERROR:  server "s0" does not exist
  -- child must have parent's INHERIT constraints
  ALTER FOREIGN TABLE ft2 INHERIT pt1;                            -- ERROR
 -ERROR:  child table is missing constraint "pt1chk2"
 +ERROR:  relation "ft2" does not exist
  ALTER FOREIGN TABLE ft2 ADD CONSTRAINT pt1chk2 CHECK (c2 <> '');
 +ERROR:  relation "ft2" does not exist
  ALTER FOREIGN TABLE ft2 INHERIT pt1;
 +ERROR:  relation "ft2" does not exist
  -- child does not inherit NO INHERIT constraints
  \d+ pt1
-                           Table "public.pt1"
-  Column |  Type   | Modifiers | Storage  | Stats target | Description 
- --------+---------+-----------+----------+--------------+-------------
-  c1     | integer | not null  | plain    | 10000        | 
-  c2     | text    |           | extended |              | 
-  c3     | date    |           | plain    |              | 
+                                     Table "public.pt1"
+  Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+  c1     | integer |           | not null |         | plain    | 10000        | 
+  c2     | text    |           |          |         | extended |              | 
+  c3     | date    |           |          |         | plain    |              | 
  Check constraints:
      "pt1chk1" CHECK (c1 > 0) NO INHERIT
      "pt1chk2" CHECK (c2 <> ''::text)
@@@ -1311,65 -1648,114 +1387,65 @@@ ALTER TABLE pt1 DROP CONSTRAINT pt1chk
  INSERT INTO pt1 VALUES (1, 'pt1'::text, '1994-01-01'::date);
  ALTER TABLE pt1 ADD CONSTRAINT pt1chk3 CHECK (c2 <> '') NOT VALID;
  \d+ pt1
-                           Table "public.pt1"
-  Column |  Type   | Modifiers | Storage  | Stats target | Description 
- --------+---------+-----------+----------+--------------+-------------
-  c1     | integer | not null  | plain    | 10000        | 
-  c2     | text    |           | extended |              | 
-  c3     | date    |           | plain    |              | 
+                                     Table "public.pt1"
+  Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+  c1     | integer |           | not null |         | plain    | 10000        | 
+  c2     | text    |           |          |         | extended |              | 
+  c3     | date    |           |          |         | plain    |              | 
  Check constraints:
      "pt1chk3" CHECK (c2 <> ''::text) NOT VALID
 -Child tables: ft2
 +Distribute By: HASH(c1)
 +Location Nodes: ALL DATANODES
  
  \d+ ft2
 -                                       Foreign table "public.ft2"
 - Column |  Type   | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
 ---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
 - c1     | integer |           | not null |         |             | plain    |              | 
 - c2     | text    |           |          |         |             | extended |              | 
 - c3     | date    |           |          |         |             | plain    |              | 
 -Check constraints:
 -    "pt1chk2" CHECK (c2 <> ''::text)
 -    "pt1chk3" CHECK (c2 <> ''::text) NOT VALID
 -Server: s0
 -FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
 -Inherits: pt1
 -
  -- VALIDATE CONSTRAINT need do nothing on foreign tables
  ALTER TABLE pt1 VALIDATE CONSTRAINT pt1chk3;
  \d+ pt1
-                           Table "public.pt1"
-  Column |  Type   | Modifiers | Storage  | Stats target | Description 
- --------+---------+-----------+----------+--------------+-------------
-  c1     | integer | not null  | plain    | 10000        | 
-  c2     | text    |           | extended |              | 
-  c3     | date    |           | plain    |              | 
+                                     Table "public.pt1"
+  Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+  c1     | integer |           | not null |         | plain    | 10000        | 
+  c2     | text    |           |          |         | extended |              | 
+  c3     | date    |           |          |         | plain    |              | 
  Check constraints:
      "pt1chk3" CHECK (c2 <> ''::text)
 -Child tables: ft2
 +Distribute By: HASH(c1)
 +Location Nodes: ALL DATANODES
  
  \d+ ft2
 -                                       Foreign table "public.ft2"
 - Column |  Type   | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
 ---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
 - c1     | integer |           | not null |         |             | plain    |              | 
 - c2     | text    |           |          |         |             | extended |              | 
 - c3     | date    |           |          |         |             | plain    |              | 
 -Check constraints:
 -    "pt1chk2" CHECK (c2 <> ''::text)
 -    "pt1chk3" CHECK (c2 <> ''::text)
 -Server: s0
 -FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
 -Inherits: pt1
 -
  -- OID system column
  ALTER TABLE pt1 SET WITH OIDS;
  \d+ pt1
-                           Table "public.pt1"
-  Column |  Type   | Modifiers | Storage  | Stats target | Description 
- --------+---------+-----------+----------+--------------+-------------
-  c1     | integer | not null  | plain    | 10000        | 
-  c2     | text    |           | extended |              | 
-  c3     | date    |           | plain    |              | 
+                                     Table "public.pt1"
+  Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+  c1     | integer |           | not null |         | plain    | 10000        | 
+  c2     | text    |           |          |         | extended |              | 
+  c3     | date    |           |          |         | plain    |              | 
  Check constraints:
      "pt1chk3" CHECK (c2 <> ''::text)
 -Child tables: ft2
  Has OIDs: yes
 +Distribute By: HASH(c1)
 +Location Nodes: ALL DATANODES
  
  \d+ ft2
 -                                       Foreign table "public.ft2"
 - Column |  Type   | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
 ---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
 - c1     | integer |           | not null |         |             | plain    |              | 
 - c2     | text    |           |          |         |             | extended |              | 
 - c3     | date    |           |          |         |             | plain    |              | 
 -Check constraints:
 -    "pt1chk2" CHECK (c2 <> ''::text)
 -    "pt1chk3" CHECK (c2 <> ''::text)
 -Server: s0
 -FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
 -Inherits: pt1
 -Has OIDs: yes
 -
  ALTER TABLE ft2 SET WITHOUT OIDS;  -- ERROR
 -ERROR:  cannot drop inherited column "oid"
 +ERROR:  relation "ft2" does not exist
  ALTER TABLE pt1 SET WITHOUT OIDS;
  \d+ pt1
-                           Table "public.pt1"
-  Column |  Type   | Modifiers | Storage  | Stats target | Description 
- --------+---------+-----------+----------+--------------+-------------
-  c1     | integer | not null  | plain    | 10000        | 
-  c2     | text    |           | extended |              | 
-  c3     | date    |           | plain    |              | 
+                                     Table "public.pt1"
+  Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+  c1     | integer |           | not null |         | plain    | 10000        | 
+  c2     | text    |           |          |         | extended |              | 
+  c3     | date    |           |          |         | plain    |              | 
  Check constraints:
      "pt1chk3" CHECK (c2 <> ''::text)
 -Child tables: ft2
 +Distribute By: HASH(c1)
 +Location Nodes: ALL DATANODES
  
  \d+ ft2
 -                                       Foreign table "public.ft2"
 - Column |  Type   | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
 ---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
 - c1     | integer |           | not null |         |             | plain    |              | 
 - c2     | text    |           |          |         |             | extended |              | 
 - c3     | date    |           |          |         |             | plain    |              | 
 -Check constraints:
 -    "pt1chk2" CHECK (c2 <> ''::text)
 -    "pt1chk3" CHECK (c2 <> ''::text)
 -Server: s0
 -FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
 -Inherits: pt1
 -
  -- changes name of an attribute recursively
  ALTER TABLE pt1 RENAME COLUMN c1 TO f1;
  ALTER TABLE pt1 RENAME COLUMN c2 TO f2;
@@@ -1377,33 -1763,47 +1453,33 @@@ ALTER TABLE pt1 RENAME COLUMN c3 TO f3
  -- changes name of a constraint recursively
  ALTER TABLE pt1 RENAME CONSTRAINT pt1chk3 TO f2_check;
  \d+ pt1
-                           Table "public.pt1"
-  Column |  Type   | Modifiers | Storage  | Stats target | Description 
- --------+---------+-----------+----------+--------------+-------------
-  f1     | integer | not null  | plain    | 10000        | 
-  f2     | text    |           | extended |              | 
-  f3     | date    |           | plain    |              | 
+                                     Table "public.pt1"
+  Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+  f1     | integer |           | not null |         | plain    | 10000        | 
+  f2     | text    |           |          |         | extended |              | 
+  f3     | date    |           |          |         | plain    |              | 
  Check constraints:
      "f2_check" CHECK (f2 <> ''::text)
 -Child tables: ft2
 +Distribute By: HASH(f1)
 +Location Nodes: ALL DATANODES
  
  \d+ ft2
 -                                       Foreign table "public.ft2"
 - Column |  Type   | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
 ---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
 - f1     | integer |           | not null |         |             | plain    |              | 
 - f2     | text    |           |          |         |             | extended |              | 
 - f3     | date    |           |          |         |             | plain    |              | 
 -Check constraints:
 -    "f2_check" CHECK (f2 <> ''::text)
 -    "pt1chk2" CHECK (f2 <> ''::text)
 -Server: s0
 -FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
 -Inherits: pt1
 -
  -- TRUNCATE doesn't work on foreign tables, either directly or recursively
  TRUNCATE ft2;  -- ERROR
 -ERROR:  "ft2" is not a table
 +ERROR:  relation "ft2" does not exist
  TRUNCATE pt1;  -- ERROR
 -ERROR:  "ft2" is not a table
  DROP TABLE pt1 CASCADE;
 -NOTICE:  drop cascades to foreign table ft2
  -- IMPORT FOREIGN SCHEMA
  IMPORT FOREIGN SCHEMA s1 FROM SERVER s9 INTO public; -- ERROR
 -ERROR:  foreign-data wrapper "foo" has no handler
 +ERROR:  server "s9" does not exist
  IMPORT FOREIGN SCHEMA s1 LIMIT TO (t1) FROM SERVER s9 INTO public; --ERROR
 -ERROR:  foreign-data wrapper "foo" has no handler
 +ERROR:  server "s9" does not exist
  IMPORT FOREIGN SCHEMA s1 EXCEPT (t1) FROM SERVER s9 INTO public; -- ERROR
 -ERROR:  foreign-data wrapper "foo" has no handler
 +ERROR:  server "s9" does not exist
  IMPORT FOREIGN SCHEMA s1 EXCEPT (t1, t2) FROM SERVER s9 INTO public
  OPTIONS (option1 'value1', option2 'value2'); -- ERROR
 -ERROR:  foreign-data wrapper "foo" has no handler
 +ERROR:  server "s9" does not exist
  -- DROP FOREIGN TABLE
  DROP FOREIGN TABLE no_table;                                    -- ERROR
  ERROR:  foreign table "no_table" does not exist
@@@ -1414,7 -1813,209 +1490,206 @@@ ERROR:  foreign table "foreign_table_1
  -- REASSIGN OWNED/DROP OWNED of foreign objects
  REASSIGN OWNED BY regress_test_role TO regress_test_role2;
  DROP OWNED BY regress_test_role2;
 -ERROR:  cannot drop desired object(s) because other objects depend on them
 -DETAIL:  user mapping for regress_test_role on server s5 depends on server s5
 -HINT:  Use DROP ... CASCADE to drop the dependent objects too.
  DROP OWNED BY regress_test_role2 CASCADE;
+ NOTICE:  drop cascades to user mapping for regress_test_role on server s5
+ -- Foreign partition DDL stuff
+ CREATE TABLE pt2 (
+       c1 integer NOT NULL,
+       c2 text,
+       c3 date
+ ) PARTITION BY LIST (c1);
+ CREATE FOREIGN TABLE pt2_1 PARTITION OF pt2 FOR VALUES IN (1)
+   SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ \d+ pt2
+                                     Table "public.pt2"
+  Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+  c1     | integer |           | not null |         | plain    |              | 
+  c2     | text    |           |          |         | extended |              | 
+  c3     | date    |           |          |         | plain    |              | 
+ Partition key: LIST (c1)
+ Partitions: pt2_1 FOR VALUES IN (1)
+ \d+ pt2_1
+                                       Foreign table "public.pt2_1"
+  Column |  Type   | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+  c1     | integer |           | not null |         |             | plain    |              | 
+  c2     | text    |           |          |         |             | extended |              | 
+  c3     | date    |           |          |         |             | plain    |              | 
+ Partition of: pt2 FOR VALUES IN (1)
+ Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1])))
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+ -- partition cannot have additional columns
+ DROP FOREIGN TABLE pt2_1;
+ CREATE FOREIGN TABLE pt2_1 (
+       c1 integer NOT NULL,
+       c2 text,
+       c3 date,
+       c4 char
+ ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ \d+ pt2_1
+                                          Foreign table "public.pt2_1"
+  Column |     Type     | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
+ --------+--------------+-----------+----------+---------+-------------+----------+--------------+-------------
+  c1     | integer      |           | not null |         |             | plain    |              | 
+  c2     | text         |           |          |         |             | extended |              | 
+  c3     | date         |           |          |         |             | plain    |              | 
+  c4     | character(1) |           |          |         |             | extended |              | 
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1);       -- ERROR
+ ERROR:  table "pt2_1" contains column "c4" not found in parent "pt2"
+ DETAIL:  New partition should contain only the columns present in parent.
+ DROP FOREIGN TABLE pt2_1;
+ \d+ pt2
+                                     Table "public.pt2"
+  Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+  c1     | integer |           | not null |         | plain    |              | 
+  c2     | text    |           |          |         | extended |              | 
+  c3     | date    |           |          |         | plain    |              | 
+ Partition key: LIST (c1)
+ CREATE FOREIGN TABLE pt2_1 (
+       c1 integer NOT NULL,
+       c2 text,
+       c3 date
+ ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ \d+ pt2_1
+                                       Foreign table "public.pt2_1"
+  Column |  Type   | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+  c1     | integer |           | not null |         |             | plain    |              | 
+  c2     | text    |           |          |         |             | extended |              | 
+  c3     | date    |           |          |         |             | plain    |              | 
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+ -- no attach partition validation occurs for foreign tables
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1);
+ \d+ pt2
+                                     Table "public.pt2"
+  Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+  c1     | integer |           | not null |         | plain    |              | 
+  c2     | text    |           |          |         | extended |              | 
+  c3     | date    |           |          |         | plain    |              | 
+ Partition key: LIST (c1)
+ Partitions: pt2_1 FOR VALUES IN (1)
+ \d+ pt2_1
+                                       Foreign table "public.pt2_1"
+  Column |  Type   | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+  c1     | integer |           | not null |         |             | plain    |              | 
+  c2     | text    |           |          |         |             | extended |              | 
+  c3     | date    |           |          |         |             | plain    |              | 
+ Partition of: pt2 FOR VALUES IN (1)
+ Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1])))
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+ -- cannot add column to a partition
+ ALTER TABLE pt2_1 ADD c4 char;
+ ERROR:  cannot add column to a partition
+ -- ok to have a partition's own constraints though
+ ALTER TABLE pt2_1 ALTER c3 SET NOT NULL;
+ ALTER TABLE pt2_1 ADD CONSTRAINT p21chk CHECK (c2 <> '');
+ \d+ pt2
+                                     Table "public.pt2"
+  Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+  c1     | integer |           | not null |         | plain    |              | 
+  c2     | text    |           |          |         | extended |              | 
+  c3     | date    |           |          |         | plain    |              | 
+ Partition key: LIST (c1)
+ Partitions: pt2_1 FOR VALUES IN (1)
+ \d+ pt2_1
+                                       Foreign table "public.pt2_1"
+  Column |  Type   | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+  c1     | integer |           | not null |         |             | plain    |              | 
+  c2     | text    |           |          |         |             | extended |              | 
+  c3     | date    |           | not null |         |             | plain    |              | 
+ Partition of: pt2 FOR VALUES IN (1)
+ Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1])))
+ Check constraints:
+     "p21chk" CHECK (c2 <> ''::text)
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+ -- cannot drop inherited NOT NULL constraint from a partition
+ ALTER TABLE pt2_1 ALTER c1 DROP NOT NULL;
+ ERROR:  column "c1" is marked NOT NULL in parent table
+ -- partition must have parent's constraints
+ ALTER TABLE pt2 DETACH PARTITION pt2_1;
+ ALTER TABLE pt2 ALTER c2 SET NOT NULL;
+ \d+ pt2
+                                     Table "public.pt2"
+  Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+  c1     | integer |           | not null |         | plain    |              | 
+  c2     | text    |           | not null |         | extended |              | 
+  c3     | date    |           |          |         | plain    |              | 
+ Partition key: LIST (c1)
+ \d+ pt2_1
+                                       Foreign table "public.pt2_1"
+  Column |  Type   | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+  c1     | integer |           | not null |         |             | plain    |              | 
+  c2     | text    |           |          |         |             | extended |              | 
+  c3     | date    |           | not null |         |             | plain    |              | 
+ Check constraints:
+     "p21chk" CHECK (c2 <> ''::text)
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1);       -- ERROR
+ ERROR:  column "c2" in child table must be marked NOT NULL
+ ALTER FOREIGN TABLE pt2_1 ALTER c2 SET NOT NULL;
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1);
+ ALTER TABLE pt2 DETACH PARTITION pt2_1;
+ ALTER TABLE pt2 ADD CONSTRAINT pt2chk1 CHECK (c1 > 0);
+ \d+ pt2
+                                     Table "public.pt2"
+  Column |  Type   | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+  c1     | integer |           | not null |         | plain    |              | 
+  c2     | text    |           | not null |         | extended |              | 
+  c3     | date    |           |          |         | plain    |              | 
+ Partition key: LIST (c1)
+ Check constraints:
+     "pt2chk1" CHECK (c1 > 0)
+ \d+ pt2_1
+                                       Foreign table "public.pt2_1"
+  Column |  Type   | Collation | Nullable | Default | FDW Options | Storage  | Stats target | Description 
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+  c1     | integer |           | not null |         |             | plain    |              | 
+  c2     | text    |           | not null |         |             | extended |              | 
+  c3     | date    |           | not null |         |             | plain    |              | 
+ Check constraints:
+     "p21chk" CHECK (c2 <> ''::text)
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1);       -- ERROR
+ ERROR:  child table is missing constraint "pt2chk1"
+ ALTER FOREIGN TABLE pt2_1 ADD CONSTRAINT pt2chk1 CHECK (c1 > 0);
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1);
+ -- TRUNCATE doesn't work on foreign tables, either directly or recursively
+ TRUNCATE pt2_1;  -- ERROR
+ ERROR:  "pt2_1" is not a table
+ TRUNCATE pt2;  -- ERROR
+ ERROR:  "pt2_1" is not a table
+ DROP FOREIGN TABLE pt2_1;
+ DROP TABLE pt2;
  -- Cleanup
  DROP SCHEMA foreign_schema CASCADE;
  DROP ROLE regress_test_role;                                -- ERROR
index acb5eeb802477aa5aee3a190b49d044c4f71ebd6,fef072eddfa5e3870197a091cc415a2e9d573791..0559b0380b0270c2e6a05ffc6e180373d87300e8
@@@ -1369,22 -1363,55 +1369,56 @@@ drop table pp, cc
  -- Test interaction of foreign-key optimization with rules (bug #14219)
  --
  create temp table t1 (a integer primary key, b text);
 -create temp table t2 (a integer primary key, b integer references t1);
 +create temp table t2 (a integer, b integer references t1) distribute by hash (b);
  create rule r1 as on delete to t1 do delete from t2 where t2.b = old.a;
  explain (costs off) delete from t1 where a = 1;
 +ERROR:  could not plan this distributed delete
 +DETAIL:  correlated or complex DELETE is currently not supported in Postgres-XL.
 +delete from t1 where a = 1;
 +ERROR:  could not plan this distributed delete
 +DETAIL:  correlated or complex DELETE is currently not supported in Postgres-XL.
 +drop rule r1 on t1;
 +explain (costs off, nodes off) delete from t1 where a = 1;
                   QUERY PLAN                 
  --------------------------------------------
 - Delete on t2
 -   ->  Nested Loop
 + Remote Fast Query Execution
 +   ->  Delete on t1
           ->  Index Scan using t1_pkey on t1
                 Index Cond: (a = 1)
 -         ->  Seq Scan on t2
 -               Filter: (b = 1)
 - 
 - Delete on t1
 -   ->  Index Scan using t1_pkey on t1
 -         Index Cond: (a = 1)
 -(10 rows)
 +(4 rows)
  
  delete from t1 where a = 1;
+ --
+ -- Test deferred FK check on a tuple deleted by a rolled-back subtransaction
+ --
+ create table pktable2(f1 int primary key);
+ create table fktable2(f1 int references pktable2 deferrable initially deferred);
+ insert into pktable2 values(1);
+ begin;
+ insert into fktable2 values(1);
+ savepoint x;
+ delete from fktable2;
+ rollback to x;
+ commit;
+ begin;
+ insert into fktable2 values(2);
+ savepoint x;
+ delete from fktable2;
+ rollback to x;
+ commit; -- fail
+ ERROR:  insert or update on table "fktable2" violates foreign key constraint "fktable2_f1_fkey"
+ DETAIL:  Key (f1)=(2) is not present in table "pktable2".
+ --
+ -- Test that we prevent dropping FK constraint with pending trigger events
+ --
+ begin;
+ insert into fktable2 values(2);
+ alter table fktable2 drop constraint fktable2_f1_fkey;
+ ERROR:  cannot ALTER TABLE "fktable2" because it has pending trigger events
+ commit;
+ begin;
+ delete from pktable2 where f1 = 1;
+ alter table fktable2 drop constraint fktable2_f1_fkey;
+ ERROR:  cannot ALTER TABLE "pktable2" because it has pending trigger events
+ commit;
+ drop table pktable2, fktable2;
Simple merge
index efd00c97503064c452ce5d15e6d7f45bc329b440,fd618afe603a588fd4a81a4a3eea88bed25cd565..8cfdb1fba355e063f212219ef880677766573e73
@@@ -375,11 -383,11 +383,11 @@@ select 
  ERROR:  aggregate functions are not allowed in FROM clause of their own query level
  LINE 3:        lateral (select a, b, sum(v.x) from gstest_data(v.x) ...
                                       ^
- -- min max optimisation should still work with GROUP BY ()
+ -- min max optimization should still work with GROUP BY ()
  explain (costs off)
    select min(unique1) from tenk1 GROUP BY ();
 -                         QUERY PLAN                         
 -------------------------------------------------------------
 +                               QUERY PLAN                               
 +------------------------------------------------------------------------
   Result
     InitPlan 1 (returns $0)
       ->  Limit
index db608fe4f8538540fba165cdc5b0fbfe674c8e17,3e255fbded8de20b646e38dc033a96ba1e91d4a5..2958a0a579faca95437e74c674c4338455120b24
@@@ -18,12 -18,12 +18,12 @@@ UPDATE toasttest SET cnt = cnt +1 RETUR
                                                                                                  substring                                                                                                 
  ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   (two-compressed,1,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
 + ("one-toasted,one-null",1,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
   (two-toasted,1,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345
   ("one-compressed,one-null",1,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
 - ("one-toasted,one-null",1,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
  (4 rows)
  
- -- modification without modifying asigned value
+ -- modification without modifying assigned value
  UPDATE toasttest SET cnt = cnt +1, f1 = f1 RETURNING substring(toasttest::text, 1, 200);
                                                                                                  substring                                                                                                 
  ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
@@@ -56,12 -56,12 +56,12 @@@ SELECT substring(toasttest::text, 1, 20
                                                                                                  substring                                                                                                 
  ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   (two-compressed,4,-1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
 + ("one-toasted,one-null",4,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
   (two-toasted,4,-1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234
   ("one-compressed,one-null",4,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
 - ("one-toasted,one-null",4,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
  (4 rows)
  
- -- check we didn't screw with main/toast tuple visiblity
+ -- check we didn't screw with main/toast tuple visibility
  VACUUM FREEZE toasttest;
  SELECT substring(toasttest::text, 1, 200) FROM toasttest;
                                                                                                  substring                                                                                                 
@@@ -92,12 -90,12 +92,12 @@@ UPDATE toasttest SET cnt = cnt +1 RETUR
                                                                                                  substring                                                                                                 
  ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   (two-compressed,5,-1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
 + ("one-toasted,one-null",5,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
   (two-toasted,5,-1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234
   ("one-compressed,one-null",5,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
 - ("one-toasted,one-null",5,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
  (4 rows)
  
- -- modification without modifying asigned value
+ -- modification without modifying assigned value
  UPDATE toasttest SET cnt = cnt +1, f1 = f1 RETURNING substring(toasttest::text, 1, 200);
                                                                                                  substring                                                                                                 
  ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
@@@ -131,13 -129,13 +131,13 @@@ SELECT substring(toasttest::text, 1, 20
                                                                                                  substring                                                                                                 
  ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   (two-compressed,8,--123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
 - (two-toasted,8,--123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
 - ("one-compressed,one-null",8,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
   ("one-toasted,one-null",8,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
   ("one-toasted,one-null, via indirect",0,1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
 + (two-toasted,8,--123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
 + ("one-compressed,one-null",8,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
  (5 rows)
  
- -- check we didn't screw with main/toast tuple visiblity
+ -- check we didn't screw with main/toast tuple visibility
  VACUUM FREEZE toasttest;
  SELECT substring(toasttest::text, 1, 200) FROM toasttest;
                                                                                                  substring                                                                                                 
index d3b9bd382fbce3340f1fcbcc59e8d288178f4bff,be9427eb6b8178f7a095d1896207f9f9cb72145c..00fd7201170738ae45f205ba983142103799cb57
@@@ -414,137 -411,285 +414,285 @@@ SELECT i FROM inet_tbl WHERE i << '192.
  
  SET enable_seqscan TO on;
  DROP INDEX inet_idx2;
+ -- check that spgist index works correctly
+ CREATE INDEX inet_idx3 ON inet_tbl using spgist (i);
+ SET enable_seqscan TO off;
+ SELECT * FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+        c        |        i         
+ ----------------+------------------
+  192.168.1.0/24 | 192.168.1.0/25
+  192.168.1.0/24 | 192.168.1.255/25
+  192.168.1.0/26 | 192.168.1.226
+ (3 rows)
+ SELECT * FROM inet_tbl WHERE i <<= '192.168.1.0/24'::cidr ORDER BY i;
+        c        |        i         
+ ----------------+------------------
+  192.168.1.0/24 | 192.168.1.0/24
+  192.168.1.0/24 | 192.168.1.226/24
+  192.168.1.0/24 | 192.168.1.255/24
+  192.168.1.0/24 | 192.168.1.0/25
+  192.168.1.0/24 | 192.168.1.255/25
+  192.168.1.0/26 | 192.168.1.226
+ (6 rows)
+ SELECT * FROM inet_tbl WHERE i && '192.168.1.0/24'::cidr ORDER BY i;
+        c        |        i         
+ ----------------+------------------
+  192.168.1.0/24 | 192.168.1.0/24
+  192.168.1.0/24 | 192.168.1.226/24
+  192.168.1.0/24 | 192.168.1.255/24
+  192.168.1.0/24 | 192.168.1.0/25
+  192.168.1.0/24 | 192.168.1.255/25
+  192.168.1.0/26 | 192.168.1.226
+ (6 rows)
+ SELECT * FROM inet_tbl WHERE i >>= '192.168.1.0/24'::cidr ORDER BY i;
+        c        |        i         
+ ----------------+------------------
+  192.168.1.0/24 | 192.168.1.0/24
+  192.168.1.0/24 | 192.168.1.226/24
+  192.168.1.0/24 | 192.168.1.255/24
+ (3 rows)
+ SELECT * FROM inet_tbl WHERE i >> '192.168.1.0/24'::cidr ORDER BY i;
+  c | i 
+ ---+---
+ (0 rows)
+ SELECT * FROM inet_tbl WHERE i < '192.168.1.0/24'::cidr ORDER BY i;
+       c      |      i      
+ -------------+-------------
+  10.0.0.0/8  | 9.1.2.3/8
+  10.0.0.0/32 | 10.1.2.3/8
+  10.0.0.0/8  | 10.1.2.3/8
+  10.0.0.0/8  | 10.1.2.3/8
+  10.1.0.0/16 | 10.1.2.3/16
+  10.1.2.0/24 | 10.1.2.3/24
+  10.1.2.3/32 | 10.1.2.3
+  10.0.0.0/8  | 11.1.2.3/8
+ (8 rows)
+ SELECT * FROM inet_tbl WHERE i <= '192.168.1.0/24'::cidr ORDER BY i;
+        c        |       i        
+ ----------------+----------------
+  10.0.0.0/8     | 9.1.2.3/8
+  10.0.0.0/8     | 10.1.2.3/8
+  10.0.0.0/32    | 10.1.2.3/8
+  10.0.0.0/8     | 10.1.2.3/8
+  10.1.0.0/16    | 10.1.2.3/16
+  10.1.2.0/24    | 10.1.2.3/24
+  10.1.2.3/32    | 10.1.2.3
+  10.0.0.0/8     | 11.1.2.3/8
+  192.168.1.0/24 | 192.168.1.0/24
+ (9 rows)
+ SELECT * FROM inet_tbl WHERE i = '192.168.1.0/24'::cidr ORDER BY i;
+        c        |       i        
+ ----------------+----------------
+  192.168.1.0/24 | 192.168.1.0/24
+ (1 row)
+ SELECT * FROM inet_tbl WHERE i >= '192.168.1.0/24'::cidr ORDER BY i;
+          c          |        i         
+ --------------------+------------------
+  192.168.1.0/24     | 192.168.1.0/24
+  192.168.1.0/24     | 192.168.1.226/24
+  192.168.1.0/24     | 192.168.1.255/24
+  192.168.1.0/24     | 192.168.1.0/25
+  192.168.1.0/24     | 192.168.1.255/25
+  192.168.1.0/26     | 192.168.1.226
+  ::ffff:1.2.3.4/128 | ::4.3.2.1/24
+  10:23::f1/128      | 10:23::f1/64
+  10:23::8000/113    | 10:23::ffff
+ (9 rows)
+ SELECT * FROM inet_tbl WHERE i > '192.168.1.0/24'::cidr ORDER BY i;
+          c          |        i         
+ --------------------+------------------
+  192.168.1.0/24     | 192.168.1.226/24
+  192.168.1.0/24     | 192.168.1.255/24
+  192.168.1.0/24     | 192.168.1.0/25
+  192.168.1.0/24     | 192.168.1.255/25
+  192.168.1.0/26     | 192.168.1.226
+  ::ffff:1.2.3.4/128 | ::4.3.2.1/24
+  10:23::f1/128      | 10:23::f1/64
+  10:23::8000/113    | 10:23::ffff
+ (8 rows)
+ SELECT * FROM inet_tbl WHERE i <> '192.168.1.0/24'::cidr ORDER BY i;
+          c          |        i         
+ --------------------+------------------
+  10.0.0.0/8         | 9.1.2.3/8
+  10.0.0.0/8         | 10.1.2.3/8
+  10.0.0.0/32        | 10.1.2.3/8
+  10.0.0.0/8         | 10.1.2.3/8
+  10.1.0.0/16        | 10.1.2.3/16
+  10.1.2.0/24        | 10.1.2.3/24
+  10.1.2.3/32        | 10.1.2.3
+  10.0.0.0/8         | 11.1.2.3/8
+  192.168.1.0/24     | 192.168.1.226/24
+  192.168.1.0/24     | 192.168.1.255/24
+  192.168.1.0/24     | 192.168.1.0/25
+  192.168.1.0/24     | 192.168.1.255/25
+  192.168.1.0/26     | 192.168.1.226
+  ::ffff:1.2.3.4/128 | ::4.3.2.1/24
+  10:23::f1/128      | 10:23::f1/64
+  10:23::8000/113    | 10:23::ffff
+ (16 rows)
+ -- test index-only scans
+ EXPLAIN (COSTS OFF)
+ SELECT i FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+                     QUERY PLAN                     
+ ---------------------------------------------------
+  Sort
+    Sort Key: i
+    ->  Index Only Scan using inet_idx3 on inet_tbl
+          Index Cond: (i << '192.168.1.0/24'::inet)
+ (4 rows)
+ SELECT i FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+         i         
+ ------------------
+  192.168.1.0/25
+  192.168.1.255/25
+  192.168.1.226
+ (3 rows)
+ SET enable_seqscan TO on;
+ DROP INDEX inet_idx3;
  -- simple tests of inet boolean and arithmetic operators
 -SELECT i, ~i AS "~i" FROM inet_tbl;
 +SELECT i, ~i AS "~i" FROM inet_tbl ORDER BY i;
          i         |                     ~i                     
  ------------------+--------------------------------------------
 - 192.168.1.226/24 | 63.87.254.29/24
 - 192.168.1.226    | 63.87.254.29
 - 192.168.1.0/24   | 63.87.254.255/24
 - 192.168.1.0/25   | 63.87.254.255/25
 - 192.168.1.255/24 | 63.87.254.0/24
 - 192.168.1.255/25 | 63.87.254.0/25
 + 9.1.2.3/8        | 246.254.253.252/8
   10.1.2.3/8       | 245.254.253.252/8
   10.1.2.3/8       | 245.254.253.252/8
 - 10.1.2.3         | 245.254.253.252
 - 10.1.2.3/24      | 245.254.253.252/24
 - 10.1.2.3/16      | 245.254.253.252/16
   10.1.2.3/8       | 245.254.253.252/8
 + 10.1.2.3/16      | 245.254.253.252/16
 + 10.1.2.3/24      | 245.254.253.252/24
 + 10.1.2.3         | 245.254.253.252
   11.1.2.3/8       | 244.254.253.252/8
 - 9.1.2.3/8        | 246.254.253.252/8
 + 192.168.1.0/24   | 63.87.254.255/24
 + 192.168.1.226/24 | 63.87.254.29/24
 + 192.168.1.255/24 | 63.87.254.0/24
 + 192.168.1.0/25   | 63.87.254.255/25
 + 192.168.1.255/25 | 63.87.254.0/25
 + 192.168.1.226    | 63.87.254.29
 + ::4.3.2.1/24     | ffff:ffff:ffff:ffff:ffff:ffff:fbfc:fdfe/24
   10:23::f1/64     | ffef:ffdc:ffff:ffff:ffff:ffff:ffff:ff0e/64
   10:23::ffff      | ffef:ffdc:ffff:ffff:ffff:ffff:ffff:0
 - ::4.3.2.1/24     | ffff:ffff:ffff:ffff:ffff:ffff:fbfc:fdfe/24
  (17 rows)
  
 -SELECT i, c, i & c AS "and" FROM inet_tbl;
 +SELECT i, c, i & c AS "and" FROM inet_tbl ORDER BY i, c;
          i         |         c          |      and       
  ------------------+--------------------+----------------
 - 192.168.1.226/24 | 192.168.1.0/24     | 192.168.1.0/24
 - 192.168.1.226    | 192.168.1.0/26     | 192.168.1.0
 - 192.168.1.0/24   | 192.168.1.0/24     | 192.168.1.0/24
 - 192.168.1.0/25   | 192.168.1.0/24     | 192.168.1.0/25
 - 192.168.1.255/24 | 192.168.1.0/24     | 192.168.1.0/24
 - 192.168.1.255/25 | 192.168.1.0/24     | 192.168.1.0/25
 + 9.1.2.3/8        | 10.0.0.0/8         | 8.0.0.0/8
 + 10.1.2.3/8       | 10.0.0.0/8         | 10.0.0.0/8
   10.1.2.3/8       | 10.0.0.0/8         | 10.0.0.0/8
   10.1.2.3/8       | 10.0.0.0/32        | 10.0.0.0
 - 10.1.2.3         | 10.1.2.3/32        | 10.1.2.3
 - 10.1.2.3/24      | 10.1.2.0/24        | 10.1.2.0/24
   10.1.2.3/16      | 10.1.0.0/16        | 10.1.0.0/16
 - 10.1.2.3/8       | 10.0.0.0/8         | 10.0.0.0/8
 + 10.1.2.3/24      | 10.1.2.0/24        | 10.1.2.0/24
 + 10.1.2.3         | 10.1.2.3/32        | 10.1.2.3
   11.1.2.3/8       | 10.0.0.0/8         | 10.0.0.0/8
 - 9.1.2.3/8        | 10.0.0.0/8         | 8.0.0.0/8
 + 192.168.1.0/24   | 192.168.1.0/24     | 192.168.1.0/24
 + 192.168.1.226/24 | 192.168.1.0/24     | 192.168.1.0/24
 + 192.168.1.255/24 | 192.168.1.0/24     | 192.168.1.0/24
 + 192.168.1.0/25   | 192.168.1.0/24     | 192.168.1.0/25
 + 192.168.1.255/25 | 192.168.1.0/24     | 192.168.1.0/25
 + 192.168.1.226    | 192.168.1.0/26     | 192.168.1.0
 + ::4.3.2.1/24     | ::ffff:1.2.3.4/128 | ::0.2.2.0
   10:23::f1/64     | 10:23::f1/128      | 10:23::f1
   10:23::ffff      | 10:23::8000/113    | 10:23::8000
 - ::4.3.2.1/24     | ::ffff:1.2.3.4/128 | ::0.2.2.0
  (17 rows)
  
 -SELECT i, c, i | c AS "or" FROM inet_tbl;
 +SELECT i, c, i | c AS "or" FROM inet_tbl ORDER BY i, c;
          i         |         c          |        or        
  ------------------+--------------------+------------------
 - 192.168.1.226/24 | 192.168.1.0/24     | 192.168.1.226/24
 - 192.168.1.226    | 192.168.1.0/26     | 192.168.1.226
 - 192.168.1.0/24   | 192.168.1.0/24     | 192.168.1.0/24
 - 192.168.1.0/25   | 192.168.1.0/24     | 192.168.1.0/25
 - 192.168.1.255/24 | 192.168.1.0/24     | 192.168.1.255/24
 - 192.168.1.255/25 | 192.168.1.0/24     | 192.168.1.255/25
 + 9.1.2.3/8        | 10.0.0.0/8         | 11.1.2.3/8
 + 10.1.2.3/8       | 10.0.0.0/8         | 10.1.2.3/8
   10.1.2.3/8       | 10.0.0.0/8         | 10.1.2.3/8
   10.1.2.3/8       | 10.0.0.0/32        | 10.1.2.3
 - 10.1.2.3         | 10.1.2.3/32        | 10.1.2.3
 - 10.1.2.3/24      | 10.1.2.0/24        | 10.1.2.3/24
   10.1.2.3/16      | 10.1.0.0/16        | 10.1.2.3/16
 - 10.1.2.3/8       | 10.0.0.0/8         | 10.1.2.3/8
 + 10.1.2.3/24      | 10.1.2.0/24        | 10.1.2.3/24
 + 10.1.2.3         | 10.1.2.3/32        | 10.1.2.3
   11.1.2.3/8       | 10.0.0.0/8         | 11.1.2.3/8
 - 9.1.2.3/8        | 10.0.0.0/8         | 11.1.2.3/8
 + 192.168.1.0/24   | 192.168.1.0/24     | 192.168.1.0/24
 + 192.168.1.226/24 | 192.168.1.0/24     | 192.168.1.226/24
 + 192.168.1.255/24 | 192.168.1.0/24     | 192.168.1.255/24
 + 192.168.1.0/25   | 192.168.1.0/24     | 192.168.1.0/25
 + 192.168.1.255/25 | 192.168.1.0/24     | 192.168.1.255/25
 + 192.168.1.226    | 192.168.1.0/26     | 192.168.1.226
 + ::4.3.2.1/24     | ::ffff:1.2.3.4/128 | ::ffff:5.3.3.5
   10:23::f1/64     | 10:23::f1/128      | 10:23::f1
   10:23::ffff      | 10:23::8000/113    | 10:23::ffff
 - ::4.3.2.1/24     | ::ffff:1.2.3.4/128 | ::ffff:5.3.3.5
  (17 rows)
  
 -SELECT i, i + 500 AS "i+500" FROM inet_tbl;
 +SELECT i, i + 500 AS "i+500" FROM inet_tbl ORDER BY i;
          i         |      i+500       
  ------------------+------------------
 - 192.168.1.226/24 | 192.168.3.214/24
 - 192.168.1.226    | 192.168.3.214
 - 192.168.1.0/24   | 192.168.2.244/24
 - 192.168.1.0/25   | 192.168.2.244/25
 - 192.168.1.255/24 | 192.168.3.243/24
 - 192.168.1.255/25 | 192.168.3.243/25
 + 9.1.2.3/8        | 9.1.3.247/8
   10.1.2.3/8       | 10.1.3.247/8
   10.1.2.3/8       | 10.1.3.247/8
 - 10.1.2.3         | 10.1.3.247
 - 10.1.2.3/24      | 10.1.3.247/24
 - 10.1.2.3/16      | 10.1.3.247/16
   10.1.2.3/8       | 10.1.3.247/8
 + 10.1.2.3/16      | 10.1.3.247/16
 + 10.1.2.3/24      | 10.1.3.247/24
 + 10.1.2.3         | 10.1.3.247
   11.1.2.3/8       | 11.1.3.247/8
 - 9.1.2.3/8        | 9.1.3.247/8
 + 192.168.1.0/24   | 192.168.2.244/24
 + 192.168.1.226/24 | 192.168.3.214/24
 + 192.168.1.255/24 | 192.168.3.243/24
 + 192.168.1.0/25   | 192.168.2.244/25
 + 192.168.1.255/25 | 192.168.3.243/25
 + 192.168.1.226    | 192.168.3.214
 + ::4.3.2.1/24     | ::4.3.3.245/24
   10:23::f1/64     | 10:23::2e5/64
   10:23::ffff      | 10:23::1:1f3
 - ::4.3.2.1/24     | ::4.3.3.245/24
  (17 rows)
  
 -SELECT i, i - 500 AS "i-500" FROM inet_tbl;
 +SELECT i, i - 500 AS "i-500" FROM inet_tbl ORDER BY i;
          i         |                 i-500                  
  ------------------+----------------------------------------
 - 192.168.1.226/24 | 192.167.255.238/24
 - 192.168.1.226    | 192.167.255.238
 - 192.168.1.0/24   | 192.167.255.12/24
 - 192.168.1.0/25   | 192.167.255.12/25
 - 192.168.1.255/24 | 192.168.0.11/24
 - 192.168.1.255/25 | 192.168.0.11/25
 + 9.1.2.3/8        | 9.1.0.15/8
   10.1.2.3/8       | 10.1.0.15/8
   10.1.2.3/8       | 10.1.0.15/8
 - 10.1.2.3         | 10.1.0.15
 - 10.1.2.3/24      | 10.1.0.15/24
 - 10.1.2.3/16      | 10.1.0.15/16
   10.1.2.3/8       | 10.1.0.15/8
 + 10.1.2.3/16      | 10.1.0.15/16
 + 10.1.2.3/24      | 10.1.0.15/24
 + 10.1.2.3         | 10.1.0.15
   11.1.2.3/8       | 11.1.0.15/8
 - 9.1.2.3/8        | 9.1.0.15/8
 + 192.168.1.0/24   | 192.167.255.12/24
 + 192.168.1.226/24 | 192.167.255.238/24
 + 192.168.1.255/24 | 192.168.0.11/24
 + 192.168.1.0/25   | 192.167.255.12/25
 + 192.168.1.255/25 | 192.168.0.11/25
 + 192.168.1.226    | 192.167.255.238
 + ::4.3.2.1/24     | ::4.3.0.13/24
   10:23::f1/64     | 10:22:ffff:ffff:ffff:ffff:ffff:fefd/64
   10:23::ffff      | 10:23::fe0b
 - ::4.3.2.1/24     | ::4.3.0.13/24
  (17 rows)
  
 -SELECT i, c, i - c AS "minus" FROM inet_tbl;
 +SELECT i, c, i - c AS "minus" FROM inet_tbl ORDER BY i, c;
          i         |         c          |      minus       
  ------------------+--------------------+------------------
 - 192.168.1.226/24 | 192.168.1.0/24     |              226
 - 192.168.1.226    | 192.168.1.0/26     |              226
 - 192.168.1.0/24   | 192.168.1.0/24     |                0
 - 192.168.1.0/25   | 192.168.1.0/24     |                0
 - 192.168.1.255/24 | 192.168.1.0/24     |              255
 - 192.168.1.255/25 | 192.168.1.0/24     |              255
 + 9.1.2.3/8        | 10.0.0.0/8         |        -16711165
 + 10.1.2.3/8       | 10.0.0.0/8         |            66051
   10.1.2.3/8       | 10.0.0.0/8         |            66051
   10.1.2.3/8       | 10.0.0.0/32        |            66051
 - 10.1.2.3         | 10.1.2.3/32        |                0
 - 10.1.2.3/24      | 10.1.2.0/24        |                3
   10.1.2.3/16      | 10.1.0.0/16        |              515
 - 10.1.2.3/8       | 10.0.0.0/8         |            66051
 + 10.1.2.3/24      | 10.1.2.0/24        |                3
 + 10.1.2.3         | 10.1.2.3/32        |                0
   11.1.2.3/8       | 10.0.0.0/8         |         16843267
 - 9.1.2.3/8        | 10.0.0.0/8         |        -16711165
 + 192.168.1.0/24   | 192.168.1.0/24     |                0
 + 192.168.1.226/24 | 192.168.1.0/24     |              226
 + 192.168.1.255/24 | 192.168.1.0/24     |              255
 + 192.168.1.0/25   | 192.168.1.0/24     |                0
 + 192.168.1.255/25 | 192.168.1.0/24     |              255
 + 192.168.1.226    | 192.168.1.0/26     |              226
 + ::4.3.2.1/24     | ::ffff:1.2.3.4/128 | -281470631346435
   10:23::f1/64     | 10:23::f1/128      |                0
   10:23::ffff      | 10:23::8000/113    |            32767
 - ::4.3.2.1/24     | ::ffff:1.2.3.4/128 | -281470631346435
  (17 rows)
  
  SELECT '127.0.0.1'::inet + 257;
index 9b6a131a6577fa130070e81aa9e76c970822eb88,35d182d599233021900d2f04946d4cb80633b0a2..dd0c3d1eafae1a81bbaa6e191538fbc5f98764ff
@@@ -1135,17 -1001,15 +1193,17 @@@ ALTER TABLE inhts RENAME aa TO aaa
  ERROR:  cannot rename inherited column "aa"
  ALTER TABLE inhts RENAME d TO dd;
  \d+ inhts
-                         Table "public.inhts"
-  Column |  Type   | Modifiers | Storage | Stats target | Description 
- --------+---------+-----------+---------+--------------+-------------
-  aa     | integer |           | plain   |              | 
-  b      | integer |           | plain   |              | 
-  c      | integer |           | plain   |              | 
-  dd     | integer |           | plain   |              | 
+                                    Table "public.inhts"
+  Column |  Type   | Collation | Nullable | Default | Storage | Stats target | Description 
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+  aa     | integer |           |          |         | plain   |              | 
+  b      | integer |           |          |         | plain   |              | 
+  c      | integer |           |          |         | plain   |              | 
+  dd     | integer |           |          |         | plain   |              | 
  Inherits: inht1,
            inhs1
 +Distribute By: HASH(aa)
 +Location Nodes: ALL DATANODES
  
  DROP TABLE inhts;
  -- Test for renaming in diamond inheritance
@@@ -1156,18 -1020,16 +1214,18 @@@ NOTICE:  merging multiple inherited def
  NOTICE:  merging multiple inherited definitions of column "b"
  ALTER TABLE inht1 RENAME aa TO aaa;
  \d+ inht4
-                         Table "public.inht4"
-  Column |  Type   | Modifiers | Storage | Stats target | Description 
- --------+---------+-----------+---------+--------------+-------------
-  aaa    | integer |           | plain   |              | 
-  b      | integer |           | plain   |              | 
-  x      | integer |           | plain   |              | 
-  y      | integer |           | plain   |              | 
-  z      | integer |           | plain   |              | 
+                                    Table "public.inht4"
+  Column |  Type   | Collation | Nullable | Default | Storage | Stats target | Description 
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+  aaa    | integer |           |          |         | plain   |              | 
+  b      | integer |           |          |         | plain   |              | 
+  x      | integer |           |          |         | plain   |              | 
+  y      | integer |           |          |         | plain   |              | 
+  z      | integer |           |          |         | plain   |              | 
  Inherits: inht2,
            inht3
 +Distribute By: HASH(aaa)
 +Location Nodes: ALL DATANODES
  
  CREATE TABLE inhts (d int) INHERITS (inht2, inhs1);
  NOTICE:  merging multiple inherited definitions of column "b"
@@@ -1175,18 -1037,16 +1233,18 @@@ ALTER TABLE inht1 RENAME aaa TO aaaa
  ALTER TABLE inht1 RENAME b TO bb;                -- to be failed
  ERROR:  cannot rename inherited column "b"
  \d+ inhts
-                         Table "public.inhts"
-  Column |  Type   | Modifiers | Storage | Stats target | Description 
- --------+---------+-----------+---------+--------------+-------------
-  aaaa   | integer |           | plain   |              | 
-  b      | integer |           | plain   |              | 
-  x      | integer |           | plain   |              | 
-  c      | integer |           | plain   |              | 
-  d      | integer |           | plain   |              | 
+                                    Table "public.inhts"
+  Column |  Type   | Collation | Nullable | Default | Storage | Stats target | Description 
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+  aaaa   | integer |           |          |         | plain   |              | 
+  b      | integer |           |          |         | plain   |              | 
+  x      | integer |           |          |         | plain   |              | 
+  c      | integer |           |          |         | plain   |              | 
+  d      | integer |           |          |         | plain   |              | 
  Inherits: inht2,
            inhs1
 +Distribute By: HASH(aaaa)
 +Location Nodes: ALL DATANODES
  
  WITH RECURSIVE r AS (
    SELECT 'inht1'::regclass AS inhrelid
@@@ -1238,26 -1096,22 +1296,26 @@@ Location Nodes: ALL DATANODE
  
  ALTER TABLE ONLY test_constraints DROP CONSTRAINT test_constraints_val1_val2_key;
  \d+ test_constraints
-                         Table "public.test_constraints"
-  Column |       Type        | Modifiers | Storage  | Stats target | Description 
- --------+-------------------+-----------+----------+--------------+-------------
-  id     | integer           |           | plain    |              | 
-  val1   | character varying |           | extended |              | 
-  val2   | integer           |           | plain    |              | 
+                                    Table "public.test_constraints"
+  Column |       Type        | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+-------------------+-----------+----------+---------+----------+--------------+-------------
+  id     | integer           |           |          |         | plain    |              | 
+  val1   | character varying |           |          |         | extended |              | 
+  val2   | integer           |           |          |         | plain    |              | 
  Child tables: test_constraints_inh
 +Distribute By: HASH(val1)
 +Location Nodes: ALL DATANODES
  
  \d+ test_constraints_inh
-                       Table "public.test_constraints_inh"
-  Column |       Type        | Modifiers | Storage  | Stats target | Description 
- --------+-------------------+-----------+----------+--------------+-------------
-  id     | integer           |           | plain    |              | 
-  val1   | character varying |           | extended |              | 
-  val2   | integer           |           | plain    |              | 
+                                  Table "public.test_constraints_inh"
+  Column |       Type        | Collation | Nullable | Default | Storage  | Stats target | Description 
+ --------+-------------------+-----------+----------+---------+----------+--------------+-------------
+  id     | integer           |           |          |         | plain    |              | 
+  val1   | character varying |           |          |         | extended |              | 
+  val2   | integer           |           |          |         | plain    |              | 
  Inherits: test_constraints
 +Distribute By: HASH(val1)
 +Location Nodes: ALL DATANODES
  
  DROP TABLE test_constraints_inh;
  DROP TABLE test_constraints;
@@@ -1279,22 -1131,18 +1337,22 @@@ Location Nodes: ALL DATANODE
  
  ALTER TABLE test_ex_constraints DROP CONSTRAINT test_ex_constraints_c_excl;
  \d+ test_ex_constraints
-                  Table "public.test_ex_constraints"
-  Column |  Type  | Modifiers | Storage | Stats target | Description 
- --------+--------+-----------+---------+--------------+-------------
-  c      | circle |           | plain   |              | 
+                            Table "public.test_ex_constraints"
+  Column |  Type  | Collation | Nullable | Default | Storage | Stats target | Description 
+ --------+--------+-----------+----------+---------+---------+--------------+-------------
+  c      | circle |           |          |         | plain   |              | 
  Child tables: test_ex_constraints_inh
 +Distribute By: ROUND ROBIN
 +Location Nodes: ALL DATANODES
  
  \d+ test_ex_constraints_inh
-                Table "public.test_ex_constraints_inh"
-  Column |  Type  | Modifiers | Storage | Stats target | Description 
- --------+--------+-----------+---------+--------------+-------------
-  c      | circle |           | plain   |              | 
+                          Table "public.test_ex_constraints_inh"
+  Column |  Type  | Collation | Nullable | Default | Storage | Stats target | Description 
+ --------+--------+-----------+----------+---------+---------+--------------+-------------
+  c      | circle |           |          |         | plain   |              | 
  Inherits: test_ex_constraints
 +Distribute By: ROUND ROBIN
 +Location Nodes: ALL DATANODES
  
  DROP TABLE test_ex_constraints_inh;
  DROP TABLE test_ex_constraints;
@@@ -1311,14 -1159,12 +1369,14 @@@ Indexes
      "test_primary_constraints_pkey" PRIMARY KEY, btree (id)
  Referenced by:
      TABLE "test_foreign_constraints" CONSTRAINT "test_foreign_constraints_id1_fkey" FOREIGN KEY (id1) REFERENCES test_primary_constraints(id)
 +Distribute By: HASH(id)
 +Location Nodes: ALL DATANODES
  
  \d+ test_foreign_constraints
-                Table "public.test_foreign_constraints"
-  Column |  Type   | Modifiers | Storage | Stats target | Description 
- --------+---------+-----------+---------+--------------+-------------
-  id1    | integer |           | plain   |              | 
+                          Table "public.test_foreign_constraints"
+  Column |  Type   | Collation | Nullable | Default | Storage | Stats target | Description 
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+  id1    | integer |           |          |         | plain   |              | 
  Foreign-key constraints:
      "test_foreign_constraints_id1_fkey" FOREIGN KEY (id1) REFERENCES test_primary_constraints(id)
  Child tables: test_foreign_constraints_inh
@@@ -1327,22 -1171,18 +1385,22 @@@ Location Nodes: ALL DATANODE
  
  ALTER TABLE test_foreign_constraints DROP CONSTRAINT test_foreign_constraints_id1_fkey;
  \d+ test_foreign_constraints
-                Table "public.test_foreign_constraints"
-  Column |  Type   | Modifiers | Storage | Stats target | Description 
- --------+---------+-----------+---------+--------------+-------------
-  id1    | integer |           | plain   |              | 
+                          Table "public.test_foreign_constraints"
+  Column |  Type   | Collation | Nullable | Default | Storage | Stats target | Description 
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+  id1    | integer |           |          |         | plain   |              | 
  Child tables: test_foreign_constraints_inh
 +Distribute By: HASH(id1)
 +Location Nodes: ALL DATANODES
  
  \d+ test_foreign_constraints_inh
-              Table "public.test_foreign_constraints_inh"
-  Column |  Type   | Modifiers | Storage | Stats target | Description 
- --------+---------+-----------+---------+--------------+-------------
-  id1    | integer |           | plain   |              | 
+                        Table "public.test_foreign_constraints_inh"
+  Column |  Type   | Collation | Nullable | Default | Storage | Stats target | Description 
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+  id1    | integer |           |          |         | plain   |              | 
  Inherits: test_foreign_constraints
 +Distribute By: HASH(id1)
 +Location Nodes: ALL DATANODES
  
  DROP TABLE test_foreign_constraints_inh;
  DROP TABLE test_foreign_constraints;
Simple merge
index 0ff949c272bd89a520d5d0cc1ffbb80765405618,d08b1e1ae5377471f2ea53c673169d8e02f6fe19..65a5356412f43936d64e5dabc0d4b6e15ed94a72
@@@ -5611,41 -5327,367 +5612,406 @@@ ERROR:  invalid reference to FROM-claus
  LINE 1: ...xx1 using lateral (select * from int4_tbl where f1 = x1) ss;
                                                                  ^
  HINT:  There is an entry for table "xx1", but it cannot be referenced from this part of the query.
 +-- demonstrate problem with extrememly slow join
 +CREATE TABLE testr (a int, b int) DISTRIBUTE BY REPLICATION;
 +INSERT INTO testr SELECT generate_series(1, 10000), generate_series(5001, 15000);
 +CREATE TABLE testh (a int, b int);
 +INSERT INTO testh SELECT generate_series(1, 10000), generate_series(8001, 18000);
 +set enable_mergejoin TO false;
 +set enable_hashjoin TO false;
 +EXPLAIN (VERBOSE, COSTS OFF) SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b);
 +                                    QUERY PLAN                                     
 +-----------------------------------------------------------------------------------
 + Finalize Aggregate
 +   Output: count(*)
 +   ->  Remote Subquery Scan on all (datanode_1,datanode_2)
 +         Output: PARTIAL count(*)
 +         ->  Partial Aggregate
 +               Output: PARTIAL count(*)
 +               ->  Nested Loop Anti Join
 +                     Join Filter: (testr.b = testh.b)
 +                     ->  Remote Subquery Scan on all (datanode_1)
 +                           Output: testr.b
 +                           Distribute results by H: b
 +                           ->  Seq Scan on public.testr
 +                                 Output: testr.b
 +                     ->  Materialize
 +                           Output: testh.b
 +                           ->  Remote Subquery Scan on all (datanode_1,datanode_2)
 +                                 Output: testh.b
 +                                 Distribute results by H: b
 +                                 ->  Seq Scan on public.testh
 +                                       Output: testh.b
 +(20 rows)
 +
 +SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b);
 + count 
 +-------
 +  3000
 +(1 row)
 +
++
+ --
+ -- test planner's ability to mark joins as unique
+ --
+ create table j1 (id int primary key);
+ create table j2 (id int primary key);
+ create table j3 (id int);
+ insert into j1 values(1),(2),(3);
+ insert into j2 values(1),(2),(3);
+ insert into j3 values(1),(1);
+ analyze j1;
+ analyze j2;
+ analyze j3;
+ -- ensure join is properly marked as unique
+ explain (verbose, costs off)
+ select * from j1 inner join j2 on j1.id = j2.id;
+             QUERY PLAN             
+ -----------------------------------
+  Hash Join
+    Output: j1.id, j2.id
+    Inner Unique: true
+    Hash Cond: (j1.id = j2.id)
+    ->  Seq Scan on public.j1
+          Output: j1.id
+    ->  Hash
+          Output: j2.id
+          ->  Seq Scan on public.j2
+                Output: j2.id
+ (10 rows)
+ -- ensure join is not unique when not an equi-join
+ explain (verbose, costs off)
+ select * from j1 inner join j2 on j1.id > j2.id;
+             QUERY PLAN             
+ -----------------------------------
+  Nested Loop
+    Output: j1.id, j2.id
+    Join Filter: (j1.id > j2.id)
+    ->  Seq Scan on public.j1
+          Output: j1.id
+    ->  Materialize
+          Output: j2.id
+          ->  Seq Scan on public.j2
+                Output: j2.id
+ (9 rows)
+ -- ensure non-unique rel is not chosen as inner
+ explain (verbose, costs off)
+ select * from j1 inner join j3 on j1.id = j3.id;
+             QUERY PLAN             
+ -----------------------------------
+  Hash Join
+    Output: j1.id, j3.id
+    Inner Unique: true
+    Hash Cond: (j3.id = j1.id)
+    ->  Seq Scan on public.j3
+          Output: j3.id
+    ->  Hash
+          Output: j1.id
+          ->  Seq Scan on public.j1
+                Output: j1.id
+ (10 rows)
+ -- ensure left join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 left join j2 on j1.id = j2.id;
+             QUERY PLAN             
+ -----------------------------------
+  Hash Left Join
+    Output: j1.id, j2.id
+    Inner Unique: true
+    Hash Cond: (j1.id = j2.id)
+    ->  Seq Scan on public.j1
+          Output: j1.id
+    ->  Hash
+          Output: j2.id
+          ->  Seq Scan on public.j2
+                Output: j2.id
+ (10 rows)
+ -- ensure right join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 right join j2 on j1.id = j2.id;
+             QUERY PLAN             
+ -----------------------------------
+  Hash Left Join
+    Output: j1.id, j2.id
+    Inner Unique: true
+    Hash Cond: (j2.id = j1.id)
+    ->  Seq Scan on public.j2
+          Output: j2.id
+    ->  Hash
+          Output: j1.id
+          ->  Seq Scan on public.j1
+                Output: j1.id
+ (10 rows)
+ -- ensure full join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 full join j2 on j1.id = j2.id;
+             QUERY PLAN             
+ -----------------------------------
+  Hash Full Join
+    Output: j1.id, j2.id
+    Inner Unique: true
+    Hash Cond: (j1.id = j2.id)
+    ->  Seq Scan on public.j1
+          Output: j1.id
+    ->  Hash
+          Output: j2.id
+          ->  Seq Scan on public.j2
+                Output: j2.id
+ (10 rows)
+ -- a clauseless (cross) join can't be unique
+ explain (verbose, costs off)
+ select * from j1 cross join j2;
+             QUERY PLAN             
+ -----------------------------------
+  Nested Loop
+    Output: j1.id, j2.id
+    ->  Seq Scan on public.j1
+          Output: j1.id
+    ->  Materialize
+          Output: j2.id
+          ->  Seq Scan on public.j2
+                Output: j2.id
+ (8 rows)
+ -- ensure a natural join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 natural join j2;
+             QUERY PLAN             
+ -----------------------------------
+  Hash Join
+    Output: j1.id
+    Inner Unique: true
+    Hash Cond: (j1.id = j2.id)
+    ->  Seq Scan on public.j1
+          Output: j1.id
+    ->  Hash
+          Output: j2.id
+          ->  Seq Scan on public.j2
+                Output: j2.id
+ (10 rows)
+ -- ensure a distinct clause allows the inner to become unique
+ explain (verbose, costs off)
+ select * from j1
+ inner join (select distinct id from j3) j3 on j1.id = j3.id;
+                   QUERY PLAN                   
+ -----------------------------------------------
+  Nested Loop
+    Output: j1.id, j3.id
+    Inner Unique: true
+    Join Filter: (j1.id = j3.id)
+    ->  Seq Scan on public.j1
+          Output: j1.id
+    ->  Materialize
+          Output: j3.id
+          ->  Unique
+                Output: j3.id
+                ->  Sort
+                      Output: j3.id
+                      Sort Key: j3.id
+                      ->  Seq Scan on public.j3
+                            Output: j3.id
+ (15 rows)
+ -- ensure group by clause allows the inner to become unique
+ explain (verbose, costs off)
+ select * from j1
+ inner join (select id from j3 group by id) j3 on j1.id = j3.id;
+                   QUERY PLAN                   
+ -----------------------------------------------
+  Nested Loop
+    Output: j1.id, j3.id
+    Inner Unique: true
+    Join Filter: (j1.id = j3.id)
+    ->  Seq Scan on public.j1
+          Output: j1.id
+    ->  Materialize
+          Output: j3.id
+          ->  Group
+                Output: j3.id
+                Group Key: j3.id
+                ->  Sort
+                      Output: j3.id
+                      Sort Key: j3.id
+                      ->  Seq Scan on public.j3
+                            Output: j3.id
+ (16 rows)
+ drop table j1;
+ drop table j2;
+ drop table j3;
+ -- test more complex permutations of unique joins
+ create table j1 (id1 int, id2 int, primary key(id1,id2));
+ create table j2 (id1 int, id2 int, primary key(id1,id2));
+ create table j3 (id1 int, id2 int, primary key(id1,id2));
+ insert into j1 values(1,1),(1,2);
+ insert into j2 values(1,1);
+ insert into j3 values(1,1);
+ analyze j1;
+ analyze j2;
+ analyze j3;
+ -- ensure there's no unique join when not all columns which are part of the
+ -- unique index are seen in the join clause
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1;
+                 QUERY PLAN                
+ ------------------------------------------
+  Nested Loop
+    Output: j1.id1, j1.id2, j2.id1, j2.id2
+    Join Filter: (j1.id1 = j2.id1)
+    ->  Seq Scan on public.j2
+          Output: j2.id1, j2.id2
+    ->  Seq Scan on public.j1
+          Output: j1.id1, j1.id2
+ (7 rows)
+ -- ensure proper unique detection with multiple join quals
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2;
+                         QUERY PLAN                        
+ ----------------------------------------------------------
+  Nested Loop
+    Output: j1.id1, j1.id2, j2.id1, j2.id2
+    Inner Unique: true
+    Join Filter: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2))
+    ->  Seq Scan on public.j1
+          Output: j1.id1, j1.id2
+    ->  Materialize
+          Output: j2.id1, j2.id2
+          ->  Seq Scan on public.j2
+                Output: j2.id1, j2.id2
+ (10 rows)
+ -- ensure we don't detect the join to be unique when quals are not part of the
+ -- join condition
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1 where j1.id2 = 1;
+                 QUERY PLAN                
+ ------------------------------------------
+  Nested Loop
+    Output: j1.id1, j1.id2, j2.id1, j2.id2
+    Join Filter: (j1.id1 = j2.id1)
+    ->  Seq Scan on public.j1
+          Output: j1.id1, j1.id2
+          Filter: (j1.id2 = 1)
+    ->  Seq Scan on public.j2
+          Output: j2.id1, j2.id2
+ (8 rows)
+ -- as above, but for left joins.
+ explain (verbose, costs off)
+ select * from j1
+ left join j2 on j1.id1 = j2.id1 where j1.id2 = 1;
+                 QUERY PLAN                
+ ------------------------------------------
+  Nested Loop Left Join
+    Output: j1.id1, j1.id2, j2.id1, j2.id2
+    Join Filter: (j1.id1 = j2.id1)
+    ->  Seq Scan on public.j1
+          Output: j1.id1, j1.id2
+          Filter: (j1.id2 = 1)
+    ->  Seq Scan on public.j2
+          Output: j2.id1, j2.id2
+ (8 rows)
+ -- validate logic in merge joins which skips mark and restore.
+ -- it should only do this if all quals which were used to detect the unique
+ -- are present as join quals, and not plain quals.
+ set enable_nestloop to 0;
+ set enable_hashjoin to 0;
+ set enable_sort to 0;
+ -- create an index that will be preferred over the PK to perform the join
+ create index j1_id1_idx on j1 (id1) where id1 % 1000 = 1;
+ explain (costs off) select * from j1 j1
+ inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
+ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
+                  QUERY PLAN                 
+ --------------------------------------------
+  Merge Join
+    Merge Cond: (j1.id1 = j2.id1)
+    Join Filter: (j1.id2 = j2.id2)
+    ->  Index Scan using j1_id1_idx on j1
+    ->  Index Scan using j1_id1_idx on j1 j2
+ (5 rows)
+ select * from j1 j1
+ inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
+ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
+  id1 | id2 | id1 | id2 
+ -----+-----+-----+-----
+    1 |   1 |   1 |   1
+    1 |   2 |   1 |   2
+ (2 rows)
+ reset enable_nestloop;
+ reset enable_hashjoin;
+ reset enable_sort;
+ drop table j1;
+ drop table j2;
+ drop table j3;
+ -- check that semijoin inner is not seen as unique for a portion of the outerrel
+ explain (verbose, costs off)
+ select t1.unique1, t2.hundred
+ from onek t1, tenk1 t2
+ where exists (select 1 from tenk1 t3
+               where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred)
+       and t1.unique1 < 1;
+                                    QUERY PLAN                                    
+ ---------------------------------------------------------------------------------
+  Nested Loop
+    Output: t1.unique1, t2.hundred
+    ->  Hash Join
+          Output: t1.unique1, t3.tenthous
+          Hash Cond: (t3.thousand = t1.unique1)
+          ->  HashAggregate
+                Output: t3.thousand, t3.tenthous
+                Group Key: t3.thousand, t3.tenthous
+                ->  Index Only Scan using tenk1_thous_tenthous on public.tenk1 t3
+                      Output: t3.thousand, t3.tenthous
+          ->  Hash
+                Output: t1.unique1
+                ->  Index Only Scan using onek_unique1 on public.onek t1
+                      Output: t1.unique1
+                      Index Cond: (t1.unique1 < 1)
+    ->  Index Only Scan using tenk1_hundred on public.tenk1 t2
+          Output: t2.hundred
+          Index Cond: (t2.hundred = t3.tenthous)
+ (18 rows)
+ -- ... unless it actually is unique
+ create table j3 as select unique1, tenthous from onek;
+ vacuum analyze j3;
+ create unique index on j3(unique1, tenthous);
+ explain (verbose, costs off)
+ select t1.unique1, t2.hundred
+ from onek t1, tenk1 t2
+ where exists (select 1 from j3
+               where j3.unique1 = t1.unique1 and j3.tenthous = t2.hundred)
+       and t1.unique1 < 1;
+                                QUERY PLAN                               
+ ------------------------------------------------------------------------
+  Nested Loop
+    Output: t1.unique1, t2.hundred
+    ->  Nested Loop
+          Output: t1.unique1, j3.tenthous
+          ->  Index Only Scan using onek_unique1 on public.onek t1
+                Output: t1.unique1
+                Index Cond: (t1.unique1 < 1)
+          ->  Index Only Scan using j3_unique1_tenthous_idx on public.j3
+                Output: j3.unique1, j3.tenthous
+                Index Cond: (j3.unique1 = t1.unique1)
+    ->  Index Only Scan using tenk1_hundred on public.tenk1 t2
+          Output: t2.hundred
+          Index Cond: (t2.hundred = j3.tenthous)
+ (13 rows)
+ drop table j3;
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index 52457bace285c10f458ca8d65948476545d7f415,ab86595fc0219343da3374c11303e7d2d435d7a8..334a67163377b73b9956298c7afae6d0539d5598
@@@ -248,26 -470,22 +470,45 @@@ SELECT (-12345678901234567)::int8::mone
   -$12,345,678,901,234,567.00
  (1 row)
  
+ SELECT (-12345678901234567)::numeric::money;
+             money            
+ -----------------------------
+  -$12,345,678,901,234,567.00
+ (1 row)
+ -- Cast from money
+ SELECT '12345678901234567'::money::numeric;
+        numeric        
+ ----------------------
+  12345678901234567.00
+ (1 row)
+ SELECT '-12345678901234567'::money::numeric;
+         numeric        
+ -----------------------
+  -12345678901234567.00
+ (1 row)
 +INSERT INTO money_data VALUES ('$223.459');
 +INSERT INTO money_data VALUES ('$323.459');
 +INSERT INTO money_data VALUES ('$423.459');
 +INSERT INTO money_data VALUES ('$523.459');
 +SELECT sum(m) FROM money_data;
 +    sum    
 +-----------
 + $1,617.30
 +(1 row)
 +
 +CREATE TABLE money_data2 (a int, m money);
 +INSERT INTO money_data2 VALUES (1, '$123.459');
 +INSERT INTO money_data2 VALUES (2, '$223.459');
 +INSERT INTO money_data2 VALUES (3, '$323.459');
 +INSERT INTO money_data2 VALUES (4, '$423.459');
 +INSERT INTO money_data2 VALUES (5, '$523.459');
 +SELECT sum(m) FROM money_data2;
 +    sum    
 +-----------
 + $1,617.30
 +(1 row)
 +
 +DROP TABLE money_data2;
Simple merge
index e82d8f84fb01433a92f01ea9d32589f7b3d60d05,8c56512007ef9afb6f2b9ff6602e597574414396..067c945d24e7e7ba42b132e4d13fbb8eb6d857cb
@@@ -202,23 -407,77 +215,29 @@@ WITH objects (type, name, args) AS (VAL
                                -- event trigger
                                ('policy', '{addr_nsp, gentable, genpol}', '{}'),
                                ('transform', '{int}', '{sql}'),
-                               ('access method', '{btree}', '{}')
+                               ('access method', '{btree}', '{}'),
+                               ('publication', '{addr_pub}', '{}'),
+                               ('publication relation', '{addr_nsp, gentable}', '{addr_pub}'),
+                               ('subscription', '{addr_sub}', '{}'),
+                               ('statistics object', '{addr_nsp, gentable_stat}', '{}')
          )
- SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.subobjid)).*,
+ SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)).*,
        -- test roundtrip through pg_identify_object_as_address
-       ROW(pg_identify_object(addr1.classid, addr1.objid, addr1.subobjid)) =
-       ROW(pg_identify_object(addr2.classid, addr2.objid, addr2.subobjid))
+       ROW(pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)) =
+       ROW(pg_identify_object(addr2.classid, addr2.objid, addr2.objsubid))
          FROM objects, pg_get_object_address(type, name, args) addr1,
-                       pg_identify_object_as_address(classid, objid, subobjid) ioa(typ,nms,args),
+                       pg_identify_object_as_address(classid, objid, objsubid) ioa(typ,nms,args),
                        pg_get_object_address(typ, nms, ioa.args) as addr2
 -      ORDER BY addr1.classid, addr1.objid, addr1.objsubid;
 -           type            |   schema   |       name        |                               identity                               | ?column? 
 ----------------------------+------------+-------------------+----------------------------------------------------------------------+----------
 - default acl               |            |                   | for role regress_addr_user in schema public on tables                | t
 - default acl               |            |                   | for role regress_addr_user on tables                                 | t
 - type                      | pg_catalog | _int4             | integer[]                                                            | t
 - type                      | addr_nsp   | gencomptype       | addr_nsp.gencomptype                                                 | t
 - type                      | addr_nsp   | genenum           | addr_nsp.genenum                                                     | t
 - type                      | addr_nsp   | gendomain         | addr_nsp.gendomain                                                   | t
 - function                  | pg_catalog |                   | pg_catalog.pg_identify_object(pg_catalog.oid,pg_catalog.oid,integer) | t
 - aggregate                 | addr_nsp   |                   | addr_nsp.genaggr(integer)                                            | t
 - sequence                  | addr_nsp   | gentable_a_seq    | addr_nsp.gentable_a_seq                                              | t
 - table                     | addr_nsp   | gentable          | addr_nsp.gentable                                                    | t
 - table column              | addr_nsp   | gentable          | addr_nsp.gentable.b                                                  | t
 - index                     | addr_nsp   | gentable_pkey     | addr_nsp.gentable_pkey                                               | t
 - view                      | addr_nsp   | genview           | addr_nsp.genview                                                     | t
 - materialized view         | addr_nsp   | genmatview        | addr_nsp.genmatview                                                  | t
 - foreign table             | addr_nsp   | genftable         | addr_nsp.genftable                                                   | t
 - foreign table column      | addr_nsp   | genftable         | addr_nsp.genftable.a                                                 | t
 - role                      |            | regress_addr_user | regress_addr_user                                                    | t
 - server                    |            | addr_fserv        | addr_fserv                                                           | t
 - user mapping              |            |                   | regress_addr_user on server integer                                  | t
 - foreign-data wrapper      |            | addr_fdw          | addr_fdw                                                             | t
 - access method             |            | btree             | btree                                                                | t
 - operator of access method |            |                   | operator 1 (integer, integer) of pg_catalog.integer_ops USING btree  | t
 - function of access method |            |                   | function 2 (integer, integer) of pg_catalog.integer_ops USING btree  | t
 - default value             |            |                   | for addr_nsp.gentable.b                                              | t
 - cast                      |            |                   | (bigint AS integer)                                                  | t
 - table constraint          | addr_nsp   |                   | a_chk on addr_nsp.gentable                                           | t
 - domain constraint         | addr_nsp   |                   | domconstr on addr_nsp.gendomain                                      | t
 - conversion                | pg_catalog | ascii_to_mic      | pg_catalog.ascii_to_mic                                              | t
 - language                  |            | plpgsql           | plpgsql                                                              | t
 - schema                    |            | addr_nsp          | addr_nsp                                                             | t
 - operator class            | pg_catalog | int4_ops          | pg_catalog.int4_ops USING btree                                      | t
 - operator                  | pg_catalog |                   | pg_catalog.+(integer,integer)                                        | t
 - rule                      |            |                   | "_RETURN" on addr_nsp.genview                                        | t
 - trigger                   |            |                   | t on addr_nsp.gentable                                               | t
 - operator family           | pg_catalog | integer_ops       | pg_catalog.integer_ops USING btree                                   | t
 - policy                    |            |                   | genpol on addr_nsp.gentable                                          | t
 - statistics object         | addr_nsp   | gentable_stat     | addr_nsp.gentable_stat                                               | t
 - collation                 | pg_catalog | "default"         | pg_catalog."default"                                                 | t
 - transform                 |            |                   | for integer on language sql                                          | t
 - text search dictionary    | addr_nsp   | addr_ts_dict      | addr_nsp.addr_ts_dict                                                | t
 - text search parser        | addr_nsp   | addr_ts_prs       | addr_nsp.addr_ts_prs                                                 | t
 - text search configuration | addr_nsp   | addr_ts_conf      | addr_nsp.addr_ts_conf                                                | t
 - text search template      | addr_nsp   | addr_ts_temp      | addr_nsp.addr_ts_temp                                                | t
 - subscription              |            | addr_sub          | addr_sub                                                             | t
 - publication               |            | addr_pub          | addr_pub                                                             | t
 - publication relation      |            |                   | gentable in publication addr_pub                                     | t
 -(46 rows)
 -
 +      ORDER BY addr1.classid, addr1.objid, addr1.subobjid;
 +ERROR:  relation "addr_nsp.genftable" does not exist
  ---
  --- Cleanup resources
  ---
  SET client_min_messages TO 'warning';
  DROP FOREIGN DATA WRAPPER addr_fdw CASCADE;
 +ERROR:  foreign-data wrapper "addr_fdw" does not exist
+ DROP PUBLICATION addr_pub;
+ DROP SUBSCRIPTION addr_sub;
  DROP SCHEMA addr_nsp CASCADE;
  DROP OWNED BY regress_addr_user;
  DROP USER regress_addr_user;
index 96d91fe48671909d8dfb715a8041937a80d01c62,7ebbde60d3034998abc217928ea860b56cf64e24..cab92b1ac01627458286a585582aa02812649851
@@@ -4632,14 -4632,15 +4633,15 @@@ begi
    get diagnostics rc = row_count;
    raise notice '% %', found, rc;
    return query execute 'values(10),(20)';
-   get diagnostics rc = row_count;
-   raise notice '% %', found, rc;
+   -- just for fun, let's use array elements as targets
+   get diagnostics rca[1] = row_count;
+   raise notice '% %', found, rca[1];
    return query execute 'select * from (values(10),(20)) f(a) where false';
-   get diagnostics rc = row_count;
-   raise notice '% %', found, rc;
+   get diagnostics rca[2] = row_count;
+   raise notice '% %', found, rca[2];
  end;
  $$ language plpgsql;
 -select * from rttest();
 +select * from rttest() order by 1;
  NOTICE:  t 2
  NOTICE:  f 0
  NOTICE:  t 2
index ce5a30c2eba269d2680a8d9edf0beda9edf2c985,eb77c18788e35ebf61e31183cbeb3e2b36780c30..ff804e1341606242e5225f7bb213f724d86c15d3
@@@ -258,14 -215,21 +256,19 @@@ ERROR:  could not obtain lock on relati
  rollback;
  -- Commit table creation
  COMMIT PREPARED 'regress-one';
 +ERROR:  prepared transaction with identifier "regress-one" does not exist
  \d pxtest2
+               Table "public.pxtest2"
+  Column |  Type   | Collation | Nullable | Default 
+ --------+---------+-----------+----------+---------
+  a      | integer |           |          | 
  SELECT * FROM pxtest2;
 - a 
 ----
 - 1
 - 3
 -(2 rows)
 -
 +ERROR:  relation "pxtest2" does not exist
 +LINE 1: SELECT * FROM pxtest2;
 +                      ^
  -- There should be one prepared transaction
 -SELECT gid FROM pg_prepared_xacts;
 +SELECT gid FROM pg_prepared_xacts ORDER BY 1;
       gid     
  -------------
   regress-two
index 599721a58bf59d7465e83deb4f43f12e6d5d372b,3262aa1d100aac051d461f8996125437d20da904..0c097c445f53b823d1c2719d624e5136bcb589aa
@@@ -1499,10 -1724,15 +1677,10 @@@ DROP TABLE atest6
  DROP TABLE atestc;
  DROP TABLE atestp1;
  DROP TABLE atestp2;
- SELECT lo_unlink(oid) FROM pg_largeobject_metadata;
+ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3000 ORDER BY oid;
   lo_unlink 
  -----------
 -         1
 -         1
 -         1
 -         1
 -         1
 -(5 rows)
 +(0 rows)
  
  DROP GROUP regress_group1;
  DROP GROUP regress_group2;
index 2211308dc835dc195c1d81a06b56908152829acd,7bf29368d0fecf643cfce0293e6ffe92506d5791..67fd53a2a06a14e62644cae798e84aeee9ec2c6e
@@@ -86,16 -162,23 +161,17 @@@ SELECT * FROM document WHERE f_leak(dti
     4 |  44 |      1 | regress_rls_bob   | my first manga
     6 |  22 |      1 | regress_rls_carol | great science fiction
     8 |  44 |      1 | regress_rls_carol | great manga
- (4 rows)
+    9 |  22 |      1 | regress_rls_dave  | awesome science fiction
+ (5 rows)
  
  SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle) ORDER BY did;
 -NOTICE:  f_leak => my first novel
 -NOTICE:  f_leak => my first manga
 -NOTICE:  f_leak => great science fiction
 -NOTICE:  f_leak => great manga
 -NOTICE:  f_leak => awesome science fiction
 - cid | did | dlevel |      dauthor      |         dtitle          |      cname      
 ------+-----+--------+-------------------+-------------------------+-----------------
 -  11 |   1 |      1 | regress_rls_bob   | my first novel          | novel
 -  44 |   4 |      1 | regress_rls_bob   | my first manga          | manga
 -  22 |   6 |      1 | regress_rls_carol | great science fiction   | science fiction
 -  44 |   8 |      1 | regress_rls_carol | great manga             | manga
 -  22 |   9 |      1 | regress_rls_dave  | awesome science fiction | science fiction
 -(5 rows)
 + cid | did | dlevel |      dauthor      |        dtitle         |      cname      
 +-----+-----+--------+-------------------+-----------------------+-----------------
 +  11 |   1 |      1 | regress_rls_bob   | my first novel        | novel
 +  44 |   4 |      1 | regress_rls_bob   | my first manga        | manga
 +  22 |   6 |      1 | regress_rls_carol | great science fiction | science fiction
 +  44 |   8 |      1 | regress_rls_carol | great manga           | manga
 +(4 rows)
  
  -- try a sampled version
  SELECT * FROM document TABLESAMPLE BERNOULLI(50) REPEATABLE(0)
@@@ -119,62 -218,147 +195,71 @@@ SELECT * FROM document WHERE f_leak(dti
     6 |  22 |      1 | regress_rls_carol | great science fiction
     7 |  33 |      2 | regress_rls_carol | great technology book
     8 |  44 |      1 | regress_rls_carol | great manga
- (8 rows)
+    9 |  22 |      1 | regress_rls_dave  | awesome science fiction
+   10 |  33 |      2 | regress_rls_dave  | awesome technology book
+ (10 rows)
  
  SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle) ORDER BY did;
 -NOTICE:  f_leak => my first novel
 -NOTICE:  f_leak => my second novel
 -NOTICE:  f_leak => my science fiction
 -NOTICE:  f_leak => my first manga
 -NOTICE:  f_leak => my second manga
 -NOTICE:  f_leak => great science fiction
 -NOTICE:  f_leak => great technology book
 -NOTICE:  f_leak => great manga
 -NOTICE:  f_leak => awesome science fiction
 -NOTICE:  f_leak => awesome technology book
 - cid | did | dlevel |      dauthor      |         dtitle          |      cname      
 ------+-----+--------+-------------------+-------------------------+-----------------
 -  11 |   1 |      1 | regress_rls_bob   | my first novel          | novel
 -  11 |   2 |      2 | regress_rls_bob   | my second novel         | novel
 -  22 |   3 |      2 | regress_rls_bob   | my science fiction      | science fiction
 -  44 |   4 |      1 | regress_rls_bob   | my first manga          | manga
 -  44 |   5 |      2 | regress_rls_bob   | my second manga         | manga
 -  22 |   6 |      1 | regress_rls_carol | great science fiction   | science fiction
 -  33 |   7 |      2 | regress_rls_carol | great technology book   | technology
 -  44 |   8 |      1 | regress_rls_carol | great manga             | manga
 -  22 |   9 |      1 | regress_rls_dave  | awesome science fiction | science fiction
 -  33 |  10 |      2 | regress_rls_dave  | awesome technology book | technology
 -(10 rows)
 + cid | did | dlevel |      dauthor      |        dtitle         |      cname      
 +-----+-----+--------+-------------------+-----------------------+-----------------
 +  11 |   1 |      1 | regress_rls_bob   | my first novel        | novel
 +  11 |   2 |      2 | regress_rls_bob   | my second novel       | novel
 +  22 |   3 |      2 | regress_rls_bob   | my science fiction    | science fiction
 +  44 |   4 |      1 | regress_rls_bob   | my first manga        | manga
 +  44 |   5 |      2 | regress_rls_bob   | my second manga       | manga
 +  22 |   6 |      1 | regress_rls_carol | great science fiction | science fiction
 +  33 |   7 |      2 | regress_rls_carol | great technology book | technology
 +  44 |   8 |      1 | regress_rls_carol | great manga           | manga
 +(8 rows)
  
  -- try a sampled version
  SELECT * FROM document TABLESAMPLE BERNOULLI(50) REPEATABLE(0)
    WHERE f_leak(dtitle) ORDER BY did;
 -NOTICE:  f_leak => my first manga
 -NOTICE:  f_leak => my second manga
 -NOTICE:  f_leak => great science fiction
 -NOTICE:  f_leak => great manga
 -NOTICE:  f_leak => awesome science fiction
 - did | cid | dlevel |      dauthor      |         dtitle          
 ------+-----+--------+-------------------+-------------------------
 -   4 |  44 |      1 | regress_rls_bob   | my first manga
 -   5 |  44 |      2 | regress_rls_bob   | my second manga
 + did | cid | dlevel |      dauthor      |        dtitle         
 +-----+-----+--------+-------------------+-----------------------
     6 |  22 |      1 | regress_rls_carol | great science fiction
     8 |  44 |      1 | regress_rls_carol | great manga
 -   9 |  22 |      1 | regress_rls_dave  | awesome science fiction
 -(5 rows)
 +(2 rows)
  
  EXPLAIN (COSTS OFF) SELECT * FROM document WHERE f_leak(dtitle);
 -                     QUERY PLAN                     
 -----------------------------------------------------
 - Seq Scan on document
 -   Filter: ((dlevel <= $0) AND f_leak(dtitle))
 -   InitPlan 1 (returns $0)
 -     ->  Index Scan using uaccount_pkey on uaccount
 -           Index Cond: (pguser = CURRENT_USER)
 -(5 rows)
 -
 -EXPLAIN (COSTS OFF) SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle);
 -                        QUERY PLAN                         
 ------------------------------------------------------------
 - Hash Join
 -   Hash Cond: (category.cid = document.cid)
 -   InitPlan 1 (returns $0)
 -     ->  Index Scan using uaccount_pkey on uaccount
 -           Index Cond: (pguser = CURRENT_USER)
 -   ->  Seq Scan on category
 -   ->  Hash
 -         ->  Seq Scan on document
 -               Filter: ((dlevel <= $0) AND f_leak(dtitle))
 +                               QUERY PLAN                                
 +-------------------------------------------------------------------------
 + Remote Subquery Scan on all (datanode_1,datanode_2)
 +   ->  Subquery Scan on document
 +         Filter: f_leak(document.dtitle)
 +         ->  Seq Scan on document document_1
 +               Filter: (dlevel <= $0)
 +               InitPlan 1 (returns $0)
 +                 ->  Remote Subquery Scan on all (datanode_1,datanode_2)
 +                       ->  Index Scan using uaccount_pkey on uaccount
 +                             Index Cond: (pguser = "current_user"())
  (9 rows)
  
 --- viewpoint from regress_rls_dave
 -SET SESSION AUTHORIZATION regress_rls_dave;
 -SELECT * FROM document WHERE f_leak(dtitle) ORDER BY did;
 -NOTICE:  f_leak => my first novel
 -NOTICE:  f_leak => my second novel
 -NOTICE:  f_leak => my science fiction
 -NOTICE:  f_leak => great science fiction
 -NOTICE:  f_leak => great technology book
 -NOTICE:  f_leak => awesome science fiction
 -NOTICE:  f_leak => awesome technology book
 - did | cid | dlevel |      dauthor      |         dtitle          
 ------+-----+--------+-------------------+-------------------------
 -   1 |  11 |      1 | regress_rls_bob   | my first novel
 -   2 |  11 |      2 | regress_rls_bob   | my second novel
 -   3 |  22 |      2 | regress_rls_bob   | my science fiction
 -   6 |  22 |      1 | regress_rls_carol | great science fiction
 -   7 |  33 |      2 | regress_rls_carol | great technology book
 -   9 |  22 |      1 | regress_rls_dave  | awesome science fiction
 -  10 |  33 |      2 | regress_rls_dave  | awesome technology book
 -(7 rows)
 -
 -SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle) ORDER BY did;
 -NOTICE:  f_leak => my first novel
 -NOTICE:  f_leak => my second novel
 -NOTICE:  f_leak => my science fiction
 -NOTICE:  f_leak => great science fiction
 -NOTICE:  f_leak => great technology book
 -NOTICE:  f_leak => awesome science fiction
 -NOTICE:  f_leak => awesome technology book
 - cid | did | dlevel |      dauthor      |         dtitle          |      cname      
 ------+-----+--------+-------------------+-------------------------+-----------------
 -  11 |   1 |      1 | regress_rls_bob   | my first novel          | novel
 -  11 |   2 |      2 | regress_rls_bob   | my second novel         | novel
 -  22 |   3 |      2 | regress_rls_bob   | my science fiction      | science fiction
 -  22 |   6 |      1 | regress_rls_carol | great science fiction   | science fiction
 -  33 |   7 |      2 | regress_rls_carol | great technology book   | technology
 -  22 |   9 |      1 | regress_rls_dave  | awesome science fiction | science fiction
 -  33 |  10 |      2 | regress_rls_dave  | awesome technology book | technology
 -(7 rows)
 -
 -EXPLAIN (COSTS OFF) SELECT * FROM document WHERE f_leak(dtitle);
 -                                          QUERY PLAN                                          
 -----------------------------------------------------------------------------------------------
 - Seq Scan on document
 -   Filter: ((cid <> 44) AND (cid <> 44) AND (cid < 50) AND (dlevel <= $0) AND f_leak(dtitle))
 -   InitPlan 1 (returns $0)
 -     ->  Index Scan using uaccount_pkey on uaccount
 -           Index Cond: (pguser = CURRENT_USER)
 -(5 rows)
 -
  EXPLAIN (COSTS OFF) SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle);
 -                                                QUERY PLAN                                                
 -----------------------------------------------------------------------------------------------------------
 - Hash Join
 -   Hash Cond: (category.cid = document.cid)
 -   InitPlan 1 (returns $0)
 -     ->  Index Scan using uaccount_pkey on uaccount
 -           Index Cond: (pguser = CURRENT_USER)
 -   ->  Seq Scan on category
 -   ->  Hash
 -         ->  Seq Scan on document
 -               Filter: ((cid <> 44) AND (cid <> 44) AND (cid < 50) AND (dlevel <= $0) AND f_leak(dtitle))
 -(9 rows)
 +                                     QUERY PLAN                                      
 +-------------------------------------------------------------------------------------
 + Remote Subquery Scan on all (datanode_1,datanode_2)
 +   ->  Hash Join
 +         Hash Cond: (category.cid = document.cid)
 +         ->  Seq Scan on category
 +         ->  Hash
 +               ->  Subquery Scan on document
 +                     Filter: f_leak(document.dtitle)
 +                     ->  Seq Scan on document document_1
 +                           Filter: (dlevel <= $0)
 +                           InitPlan 1 (returns $0)
 +                             ->  Remote Subquery Scan on all (datanode_1,datanode_2)
 +                                   ->  Index Scan using uaccount_pkey on uaccount
 +                                         Index Cond: (pguser = "current_user"())
 +(13 rows)
  
+ -- 44 would technically fail for both p2r and p1r, but we should get an error
+ -- back from p1r for this because it sorts first
+ INSERT INTO document VALUES (100, 44, 1, 'regress_rls_dave', 'testing sorting of policies'); -- fail
+ ERROR:  new row violates row-level security policy "p1r" for table "document"
+ -- Just to see a p2r error
+ INSERT INTO document VALUES (100, 55, 1, 'regress_rls_dave', 'testing sorting of policies'); -- fail
+ ERROR:  new row violates row-level security policy "p2r" for table "document"
  -- only owner can change policies
  ALTER POLICY p1 ON document USING (true);    --fail
  ERROR:  must be owner of relation document
@@@ -254,15 -448,15 +339,18 @@@ CREATE POLICY p2 ON categor
  ALTER TABLE category ENABLE ROW LEVEL SECURITY;
  -- cannot delete PK referenced by invisible FK
  SET SESSION AUTHORIZATION regress_rls_bob;
- SELECT * FROM document d FULL OUTER JOIN category c on d.cid = c.cid;
+ SELECT * FROM document d FULL OUTER JOIN category c on d.cid = c.cid ORDER BY d.did, c.cid;
   did | cid | dlevel |     dauthor     |       dtitle       | cid |   cname    
  -----+-----+--------+-----------------+--------------------+-----+------------
 -   1 |  11 |      1 | regress_rls_bob | my first novel     |  11 | novel
 +   4 |  44 |      1 | regress_rls_bob | my first manga     |     | 
 +   5 |  44 |      2 | regress_rls_bob | my second manga    |     | 
     2 |  11 |      2 | regress_rls_bob | my second novel    |  11 | novel
 +   1 |  11 |      1 | regress_rls_bob | my first novel     |  11 | novel
 +     |     |        |                 |                    |  33 | technology
     3 |  22 |      2 | regress_rls_bob | my science fiction |     | 
+    4 |  44 |      1 | regress_rls_bob | my first manga     |     | 
+    5 |  44 |      2 | regress_rls_bob | my second manga    |     | 
+      |     |        |                 |                    |  33 | technology
  (6 rows)
  
  DELETE FROM category WHERE cid = 33;    -- fails with FK violation
@@@ -270,15 -464,15 +358,16 @@@ ERROR:  update or delete on table "cate
  DETAIL:  Key is still referenced from table "document".
  -- can insert FK referencing invisible PK
  SET SESSION AUTHORIZATION regress_rls_carol;
- SELECT * FROM document d FULL OUTER JOIN category c on d.cid = c.cid;
+ SELECT * FROM document d FULL OUTER JOIN category c on d.cid = c.cid ORDER BY d.did, c.cid;
   did | cid | dlevel |      dauthor      |        dtitle         | cid |      cname      
  -----+-----+--------+-------------------+-----------------------+-----+-----------------
 +   8 |  44 |      1 | regress_rls_carol | great manga           |  44 | manga
     6 |  22 |      1 | regress_rls_carol | great science fiction |  22 | science fiction
     7 |  33 |      2 | regress_rls_carol | great technology book |     | 
+    8 |  44 |      1 | regress_rls_carol | great manga           |  44 | manga
  (3 rows)
  
- INSERT INTO document VALUES (10, 33, 1, current_user, 'hoge');
+ INSERT INTO document VALUES (11, 33, 1, current_user, 'hoge');
  -- UNIQUE or PRIMARY KEY constraint violation DOES reveal presence of row
  SET SESSION AUTHORIZATION regress_rls_bob;
  INSERT INTO document VALUES (8, 44, 1, 'regress_rls_bob', 'my third manga'); -- Must fail with unique violation, revealing presence of did we can't see
@@@ -298,18 -491,20 +387,18 @@@ DETAIL:  correlated UPDATE or updating 
  RESET SESSION AUTHORIZATION;
  SET row_security TO ON;
  SELECT * FROM document;
-  did | cid | dlevel |      dauthor      |        dtitle         
- -----+-----+--------+-------------------+-----------------------
+  did | cid | dlevel |      dauthor      |         dtitle          
+ -----+-----+--------+-------------------+-------------------------
     1 |  11 |      1 | regress_rls_bob   | my first novel
     2 |  11 |      2 | regress_rls_bob   | my second novel
 -   3 |  22 |      2 | regress_rls_bob   | my science fiction
 -   4 |  44 |      1 | regress_rls_bob   | my first manga
     5 |  44 |      2 | regress_rls_bob   | my second manga
     6 |  22 |      1 | regress_rls_carol | great science fiction
 -   7 |  33 |      2 | regress_rls_carol | great technology book
     8 |  44 |      1 | regress_rls_carol | great manga
 -   9 |  22 |      1 | regress_rls_dave  | awesome science fiction
 -  10 |  33 |      2 | regress_rls_dave  | awesome technology book
 -  11 |  33 |      1 | regress_rls_carol | hoge
 -(11 rows)
 +   3 |  22 |      2 | regress_rls_bob   | my science fiction
 +   4 |  44 |      1 | regress_rls_bob   | my first manga
 +   7 |  33 |      2 | regress_rls_carol | great technology book
 +  10 |  33 |      1 | regress_rls_carol | hoge
 +(9 rows)
  
  SELECT * FROM category;
   cid |      cname      
  RESET SESSION AUTHORIZATION;
  SET row_security TO OFF;
  SELECT * FROM document;
-  did | cid | dlevel |      dauthor      |        dtitle         
- -----+-----+--------+-------------------+-----------------------
+  did | cid | dlevel |      dauthor      |         dtitle          
+ -----+-----+--------+-------------------+-------------------------
     1 |  11 |      1 | regress_rls_bob   | my first novel
     2 |  11 |      2 | regress_rls_bob   | my second novel
 -   3 |  22 |      2 | regress_rls_bob   | my science fiction
 -   4 |  44 |      1 | regress_rls_bob   | my first manga
     5 |  44 |      2 | regress_rls_bob   | my second manga
     6 |  22 |      1 | regress_rls_carol | great science fiction
 -   7 |  33 |      2 | regress_rls_carol | great technology book
     8 |  44 |      1 | regress_rls_carol | great manga
 -   9 |  22 |      1 | regress_rls_dave  | awesome science fiction
 -  10 |  33 |      2 | regress_rls_dave  | awesome technology book
 -  11 |  33 |      1 | regress_rls_carol | hoge
 -(11 rows)
 +   3 |  22 |      2 | regress_rls_bob   | my science fiction
 +   4 |  44 |      1 | regress_rls_bob   | my first manga
 +   7 |  33 |      2 | regress_rls_carol | great technology book
 +  10 |  33 |      1 | regress_rls_carol | hoge
 +(9 rows)
  
  SELECT * FROM category;
   cid |      cname      
  SET SESSION AUTHORIZATION regress_rls_exempt_user;
  SET row_security TO OFF;
  SELECT * FROM document;
-  did | cid | dlevel |      dauthor      |        dtitle         
- -----+-----+--------+-------------------+-----------------------
+  did | cid | dlevel |      dauthor      |         dtitle          
+ -----+-----+--------+-------------------+-------------------------
     1 |  11 |      1 | regress_rls_bob   | my first novel
     2 |  11 |      2 | regress_rls_bob   | my second novel
 -   3 |  22 |      2 | regress_rls_bob   | my science fiction
 -   4 |  44 |      1 | regress_rls_bob   | my first manga
     5 |  44 |      2 | regress_rls_bob   | my second manga
     6 |  22 |      1 | regress_rls_carol | great science fiction
 -   7 |  33 |      2 | regress_rls_carol | great technology book
     8 |  44 |      1 | regress_rls_carol | great manga
 -   9 |  22 |      1 | regress_rls_dave  | awesome science fiction
 -  10 |  33 |      2 | regress_rls_dave  | awesome technology book
 -  11 |  33 |      1 | regress_rls_carol | hoge
 -(11 rows)
 +   3 |  22 |      2 | regress_rls_bob   | my science fiction
 +   4 |  44 |      1 | regress_rls_bob   | my first manga
 +   7 |  33 |      2 | regress_rls_carol | great technology book
 +  10 |  33 |      1 | regress_rls_carol | hoge
 +(9 rows)
  
  SELECT * FROM category;
   cid |      cname      
  SET SESSION AUTHORIZATION regress_rls_alice;
  SET row_security TO ON;
  SELECT * FROM document;
-  did | cid | dlevel |      dauthor      |        dtitle         
- -----+-----+--------+-------------------+-----------------------
+  did | cid | dlevel |      dauthor      |         dtitle          
+ -----+-----+--------+-------------------+-------------------------
     1 |  11 |      1 | regress_rls_bob   | my first novel
     2 |  11 |      2 | regress_rls_bob   | my second novel
 -   3 |  22 |      2 | regress_rls_bob   | my science fiction
 -   4 |  44 |      1 | regress_rls_bob   | my first manga
     5 |  44 |      2 | regress_rls_bob   | my second manga
     6 |  22 |      1 | regress_rls_carol | great science fiction
 -   7 |  33 |      2 | regress_rls_carol | great technology book
     8 |  44 |      1 | regress_rls_carol | great manga
 -   9 |  22 |      1 | regress_rls_dave  | awesome science fiction
 -  10 |  33 |      2 | regress_rls_dave  | awesome technology book
 -  11 |  33 |      1 | regress_rls_carol | hoge
 -(11 rows)
 +   3 |  22 |      2 | regress_rls_bob   | my science fiction
 +   4 |  44 |      1 | regress_rls_bob   | my first manga
 +   7 |  33 |      2 | regress_rls_carol | great technology book
 +  10 |  33 |      1 | regress_rls_carol | hoge
 +(9 rows)
  
  SELECT * FROM category;
   cid |      cname      
  SET SESSION AUTHORIZATION regress_rls_alice;
  SET row_security TO OFF;
  SELECT * FROM document;
-  did | cid | dlevel |      dauthor      |        dtitle         
- -----+-----+--------+-------------------+-----------------------
+  did | cid | dlevel |      dauthor      |         dtitle          
+ -----+-----+--------+-------------------+-------------------------
     1 |  11 |      1 | regress_rls_bob   | my first novel
     2 |  11 |      2 | regress_rls_bob   | my second novel
 -   3 |  22 |      2 | regress_rls_bob   | my science fiction
 -   4 |  44 |      1 | regress_rls_bob   | my first manga
     5 |  44 |      2 | regress_rls_bob   | my second manga
     6 |  22 |      1 | regress_rls_carol | great science fiction
 -   7 |  33 |      2 | regress_rls_carol | great technology book
     8 |  44 |      1 | regress_rls_carol | great manga
 -   9 |  22 |      1 | regress_rls_dave  | awesome science fiction
 -  10 |  33 |      2 | regress_rls_dave  | awesome technology book
 -  11 |  33 |      1 | regress_rls_carol | hoge
 -(11 rows)
 +   3 |  22 |      2 | regress_rls_bob   | my science fiction
 +   4 |  44 |      1 | regress_rls_bob   | my first manga
 +   7 |  33 |      2 | regress_rls_carol | great technology book
 +  10 |  33 |      1 | regress_rls_carol | hoge
 +(9 rows)
  
  SELECT * FROM category;
   cid |      cname      
@@@ -1769,15 -1982,12 +1859,14 @@@ SELECT * FROM rls_view
  (2 rows)
  
  EXPLAIN (COSTS OFF) SELECT * FROM rls_view;
 -               QUERY PLAN                
 ------------------------------------------
 - Seq Scan on z1
 -   Filter: (((a % 2) = 0) AND f_leak(b))
 -(2 rows)
 -
 +                     QUERY PLAN                      
 +-----------------------------------------------------
 + Remote Subquery Scan on all (datanode_1,datanode_2)
 +   ->  Subquery Scan on z1
 +         Filter: f_leak(z1.b)
 +         ->  Seq Scan on z1 z1_1
 +               Filter: ((a % 2) = 0)
 +(5 rows)
  -- Query as role that is not owner of table but is owner of view.
  -- Should return records based on view owner policies.
  SET SESSION AUTHORIZATION regress_rls_bob;
index 40d6060eafd926150f32c76dce053ca1bfda299b,912360d70af0bf63796666ce1b1b2da6970b86b6..8f733b02425134f501ca0d78331eb626ec812937
@@@ -947,10 -938,10 +947,10 @@@ CREATE TABLE shoe_data 
        shoename   char(10),      -- primary key
        sh_avail   integer,       -- available # of pairs
        slcolor    char(10),      -- preferred shoelace color
-       slminlen   float,         -- miminum shoelace length
+       slminlen   float,         -- minimum shoelace length
        slmaxlen   float,         -- maximum shoelace length
        slunit     char(8)        -- length unit
 -);
 +) distribute by roundrobin;
  CREATE TABLE shoelace_data (
        sl_name    char(10),      -- primary key
        sl_avail   integer,       -- available # of pairs
@@@ -2734,13 -2792,12 +2818,13 @@@ select * from rules_log
  (12 rows)
  
  create rule r3 as on delete to rules_src do notify rules_src_deletion;
 +ERROR:  Rule may not use NOTIFY, it is not yet supported
  \d+ rules_src
-                       Table "public.rules_src"
-  Column |  Type   | Modifiers | Storage | Stats target | Description 
- --------+---------+-----------+---------+--------------+-------------
-  f1     | integer |           | plain   |              | 
-  f2     | integer |           | plain   |              | 
+                                  Table "public.rules_src"
+  Column |  Type   | Collation | Nullable | Default | Storage | Stats target | Description 
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+  f1     | integer |           |          |         | plain   |              | 
+  f2     | integer |           |          |         | plain   |              | 
  Rules:
      r1 AS
      ON UPDATE TO rules_src DO  INSERT INTO rules_log (f1, f2, tag) VALUES (old.f1,old.f2,'old'::text), (new.f1,new.f2,'new'::text)
index 80d63fd8c3c31180e5a43157259d71ed56d59e68,6750152e0f4908bcddc8bbe7cbc997ed77606722..5e541e294f07edf83bd1602d71cf0b28f7ffba2d
@@@ -67,7 -69,14 +69,14 @@@ kd_point_tbl|
  line_tbl|f
  log_table|f
  lseg_tbl|f
 -main_table|f
 +main_table|t
+ mlparted|f
+ mlparted1|f
+ mlparted11|f
+ mlparted12|f
+ mlparted2|f
+ mlparted3|f
+ mlparted4|f
  money_data|f
  num_data|f
  num_exp_add|t
@@@ -139,11 -158,9 +158,12 @@@ pg_ts_parser|
  pg_ts_template|t
  pg_type|t
  pg_user_mapping|t
 +pgxc_class|t
 +pgxc_group|t
 +pgxc_node|t
  point_tbl|t
  polygon_tbl|t
+ quad_box_tbl|t
  quad_point_tbl|t
  radix_text_tbl|t
  ramp|f
index 174d3b655f7b103dbc41a5853e551a15255b331a,1fab5136d29f6c92605407ffc8a39d16a1a739df..ef9e1c51291c6a2cdb868bd7328a8a1f6a3de08b
@@@ -759,16 -752,24 +759,26 @@@ select * from onek2 where unique2 = 11 
       494 |      11 |   0 |    2 |   4 |     14 |       4 |       94 |          94 |       494 |      494 |   8 |    9 | ATAAAA   | LAAAAA   | VVVVxx
  (1 row)
  
+ -- actually run the query with an analyze to use the partial index
+ explain (costs off, analyze on, timing off, summary off)
+ select * from onek2 where unique2 = 11 and stringu1 = 'ATAAAA';
+                            QUERY PLAN                            
+ -----------------------------------------------------------------
+  Index Scan using onek2_u2_prtl on onek2 (actual rows=1 loops=1)
+    Index Cond: (unique2 = 11)
+    Filter: (stringu1 = 'ATAAAA'::name)
+ (3 rows)
  explain (costs off)
  select unique2 from onek2 where unique2 = 11 and stringu1 = 'ATAAAA';
 -               QUERY PLAN                
 ------------------------------------------
 - Index Scan using onek2_u2_prtl on onek2
 -   Index Cond: (unique2 = 11)
 -   Filter: (stringu1 = 'ATAAAA'::name)
 -(3 rows)
 +                  QUERY PLAN                   
 +-----------------------------------------------
 + Remote Fast Query Execution
 +   Node/s: datanode_1, datanode_2
 +   ->  Index Scan using onek2_u2_prtl on onek2
 +         Index Cond: (unique2 = 11)
 +         Filter: (stringu1 = 'ATAAAA'::name)
 +(5 rows)
  
  select unique2 from onek2 where unique2 = 11 and stringu1 = 'ATAAAA';
   unique2 
index 33f3ee6f8de25ee271b17c134bcd47df16223101,3e35e96c4b3a8f88a03fedaee9783bee224d5776..580b33af5e41eece57001c2e515d028e03283a85
@@@ -98,36 -92,223 +98,244 @@@ explain (costs off
  explain (costs off)
        select  sum(parallel_restricted(unique1)) from tenk1
        group by(parallel_restricted(unique1));
 +                        QUERY PLAN                         
 +-----------------------------------------------------------
 + HashAggregate
 +   Group Key: parallel_restricted(unique1)
 +   ->  Remote Subquery Scan on all (datanode_1,datanode_2)
 +         ->  Index Only Scan using tenk1_unique1 on tenk1
 +(4 rows)
 +
+                             QUERY PLAN                             
+ -------------------------------------------------------------------
+  HashAggregate
+    Group Key: parallel_restricted(unique1)
+    ->  Gather
+          Workers Planned: 4
+          ->  Parallel Index Only Scan using tenk1_unique1 on tenk1
+ (5 rows)
+ -- test parallel plans for queries containing un-correlated subplans.
+ alter table tenk2 set (parallel_workers = 0);
+ explain (costs off)
+       select count(*) from tenk1 where (two, four) not in
+       (select hundred, thousand from tenk2 where thousand > 100);
+                       QUERY PLAN                      
+ ------------------------------------------------------
+  Finalize Aggregate
+    ->  Gather
+          Workers Planned: 4
+          ->  Partial Aggregate
+                ->  Parallel Seq Scan on tenk1
+                      Filter: (NOT (hashed SubPlan 1))
+                      SubPlan 1
+                        ->  Seq Scan on tenk2
+                              Filter: (thousand > 100)
+ (9 rows)
+ select count(*) from tenk1 where (two, four) not in
+       (select hundred, thousand from tenk2 where thousand > 100);
+  count 
+ -------
+  10000
+ (1 row)
+ -- this is not parallel-safe due to use of random() within SubLink's testexpr:
+ explain (costs off)
+       select * from tenk1 where (unique1 + random())::integer not in
+       (select ten from tenk2);
+              QUERY PLAN             
+ ------------------------------------
+  Seq Scan on tenk1
+    Filter: (NOT (hashed SubPlan 1))
+    SubPlan 1
+      ->  Seq Scan on tenk2
+ (4 rows)
+ alter table tenk2 reset (parallel_workers);
+ -- test parallel index scans.
+ set enable_seqscan to off;
+ set enable_bitmapscan to off;
+ explain (costs off)
+       select  count((unique1)) from tenk1 where hundred > 1;
+                              QUERY PLAN                             
+ --------------------------------------------------------------------
+  Finalize Aggregate
+    ->  Gather
+          Workers Planned: 4
+          ->  Partial Aggregate
+                ->  Parallel Index Scan using tenk1_hundred on tenk1
+                      Index Cond: (hundred > 1)
+ (6 rows)
+ select  count((unique1)) from tenk1 where hundred > 1;
+  count 
+ -------
+   9800
+ (1 row)
+ -- test parallel index-only scans.
+ explain (costs off)
+       select  count(*) from tenk1 where thousand > 95;
+                                    QUERY PLAN                                   
+ --------------------------------------------------------------------------------
+  Finalize Aggregate
+    ->  Gather
+          Workers Planned: 4
+          ->  Partial Aggregate
+                ->  Parallel Index Only Scan using tenk1_thous_tenthous on tenk1
+                      Index Cond: (thousand > 95)
+ (6 rows)
+ select  count(*) from tenk1 where thousand > 95;
+  count 
+ -------
+   9040
+ (1 row)
+ reset enable_seqscan;
+ reset enable_bitmapscan;
+ -- test parallel bitmap heap scan.
+ set enable_seqscan to off;
+ set enable_indexscan to off;
+ set enable_hashjoin to off;
+ set enable_mergejoin to off;
+ set enable_material to off;
+ -- test prefetching, if the platform allows it
+ DO $$
+ BEGIN
+  SET effective_io_concurrency = 50;
+ EXCEPTION WHEN invalid_parameter_value THEN
+ END $$;
+ set work_mem='64kB';  --set small work mem to force lossy pages
+ explain (costs off)
+       select count(*) from tenk1, tenk2 where tenk1.hundred > 1 and tenk2.thousand=0;
+                          QUERY PLAN                         
+ ------------------------------------------------------------
+  Aggregate
+    ->  Nested Loop
+          ->  Seq Scan on tenk2
+                Filter: (thousand = 0)
+          ->  Gather
+                Workers Planned: 4
+                ->  Parallel Bitmap Heap Scan on tenk1
+                      Recheck Cond: (hundred > 1)
+                      ->  Bitmap Index Scan on tenk1_hundred
+                            Index Cond: (hundred > 1)
+ (10 rows)
+ select count(*) from tenk1, tenk2 where tenk1.hundred > 1 and tenk2.thousand=0;
+  count 
+ -------
+  98000
+ (1 row)
+ create table bmscantest (a int, t text);
+ insert into bmscantest select r, 'fooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' FROM generate_series(1,100000) r;
+ create index i_bmtest ON bmscantest(a);
+ select count(*) from bmscantest where a>1;
+  count 
+ -------
+  99999
+ (1 row)
+ reset enable_seqscan;
+ reset enable_indexscan;
+ reset enable_hashjoin;
+ reset enable_mergejoin;
+ reset enable_material;
+ reset effective_io_concurrency;
+ reset work_mem;
+ drop table bmscantest;
+ -- test parallel merge join path.
+ set enable_hashjoin to off;
+ set enable_nestloop to off;
+ explain (costs off)
+       select  count(*) from tenk1, tenk2 where tenk1.unique1 = tenk2.unique1;
+                                   QUERY PLAN                                   
+ -------------------------------------------------------------------------------
+  Finalize Aggregate
+    ->  Gather
+          Workers Planned: 4
+          ->  Partial Aggregate
+                ->  Merge Join
+                      Merge Cond: (tenk1.unique1 = tenk2.unique1)
+                      ->  Parallel Index Only Scan using tenk1_unique1 on tenk1
+                      ->  Index Only Scan using tenk2_unique1 on tenk2
+ (8 rows)
+ select  count(*) from tenk1, tenk2 where tenk1.unique1 = tenk2.unique1;
+  count 
+ -------
+  10000
+ (1 row)
+ reset enable_hashjoin;
+ reset enable_nestloop;
+ --test gather merge
+ set enable_hashagg to off;
+ explain (costs off)
+    select  string4, count((unique2)) from tenk1 group by string4 order by string4;
+                      QUERY PLAN                     
+ ----------------------------------------------------
+  Finalize GroupAggregate
+    Group Key: string4
+    ->  Gather Merge
+          Workers Planned: 4
+          ->  Partial GroupAggregate
+                Group Key: string4
+                ->  Sort
+                      Sort Key: string4
+                      ->  Parallel Seq Scan on tenk1
+ (9 rows)
+ select  string4, count((unique2)) from tenk1 group by string4 order by string4;
+  string4 | count 
+ ---------+-------
+  AAAAxx  |  2500
+  HHHHxx  |  2500
+  OOOOxx  |  2500
+  VVVVxx  |  2500
+ (4 rows)
+ reset enable_hashagg;
  set force_parallel_mode=1;
  explain (costs off)
    select stringu1::int2 from tenk1 where unique1 = 1;
 -                  QUERY PLAN                   
 ------------------------------------------------
 - Gather
 -   Workers Planned: 1
 -   Single Copy: true
 -   ->  Index Scan using tenk1_unique1 on tenk1
 -         Index Cond: (unique1 = 1)
 -(5 rows)
 +                     QUERY PLAN                      
 +-----------------------------------------------------
 + Remote Fast Query Execution
 +   Node/s: datanode_1
 +   ->  Gather
 +         Workers Planned: 1
 +         Single Copy: true
 +         ->  Index Scan using tenk1_unique1 on tenk1
 +               Index Cond: (unique1 = 1)
 +(7 rows)
 +
 +do $$begin
 +  -- Provoke error, possibly in worker.  If this error happens to occur in
 +  -- the worker, there will be a CONTEXT line which must be hidden.
 +  perform stringu1::int2 from tenk1 where unique1 = 1;
 +  exception
 +      when others then
 +              raise 'SQLERRM: %', sqlerrm;
 +end$$;
 +ERROR:  Internal subtransactions not supported in Postgres-XL
 +CONTEXT:  PL/pgSQL function inline_code_block line 1 during statement block entry
+ -- to increase the parallel query test coverage
+ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1;
+                          QUERY PLAN                          
+ -------------------------------------------------------------
+  Gather (actual rows=10000 loops=1)
+    Workers Planned: 4
+    Workers Launched: 4
+    ->  Parallel Seq Scan on tenk1 (actual rows=2000 loops=5)
+ (4 rows)
+ -- provoke error in worker
+ select stringu1::int2 from tenk1 where unique1 = 1;
+ ERROR:  invalid input syntax for integer: "BAAAAA"
+ CONTEXT:  parallel worker
  rollback;
index e632ed89d23192e84f186ea35c4bd6c2c8d0d79f,bf003adf243255d5d9418ba1b7c90a575749c5aa..33b880d72b94b51d9574c9a1400b2d636177389a
@@@ -335,15 -322,924 +335,11 @@@ SELECT * FROM street ORDER BY name,cnam
   Whitlock Creek                     | [(-121.74683,37.91276),(-121.733107,37)]                                                                                                                                                                                                                                                                                                                                                                                                     | Oakland
   Whitlock Creek                     | [(-121.74683,37.91276),(-121.733107,37)]                                                                                                                                                                                                                                                                                                                                                                                                     | Oakland
   Willimet                      Way  | [(-122.0964,37.517),(-122.0949,37.493)]                                                                                                                                                                                                                                                                                                                                                                                                      | Oakland
 - Wisconsin                     St   | [(-122.1994,37.017),(-122.1975,37.998),(-122.1971,37.994)]                                                                                                                                                                                                                                                                                                                                                                                   | Oakland
   Wisconsin                     St   | [(-122.1994,37.017),(-122.1975,37.998),(-122.1971,37.994)]                                                                                                                                                                                                                                                                                                                                                                                   | Berkeley
 + Wisconsin                     St   | [(-122.1994,37.017),(-122.1975,37.998),(-122.1971,37.994)]                                                                                                                                                                                                                                                                                                                                                                                   | Oakland
   Wp Railroad                        | [(-122.254,37.902),(-122.2506,37.891)]                                                                                                                                                                                                                                                                                                                                                                                                       | Berkeley
 - 100th                         Ave  | [(-122.1657,37.429),(-122.1647,37.432)]                                                                                                                                                                                                                                                                                                                                                                                                      | Oakland
 - 107th                         Ave  | [(-122.1555,37.403),(-122.1531,37.41)]                                                                                                                                                                                                                                                                                                                                                                                                       | Oakland
 - 14th                          St   | [(-122.299,37.147),(-122.3,37.148)]                                                                                                                                                                                                                                                                                                                                                                                                          | Lafayette
 - 19th                          Ave  | [(-122.2366,37.897),(-122.2359,37.905)]                                                                                                                                                                                                                                                                                                                                                                                                      | Berkeley
 - 1st                           St   | [(-121.75508,37.89294),(-121.753581,37.90031)]                                                                                                                                                                                                                                                                                                                                                                                               | Oakland
 - 5th                           St   | [(-122.278,37),(-122.2792,37.005),(-122.2803,37.009)]                                                                                                                                                                                                                                                                                                                                                                                        | Lafayette
 - 5th                           St   | [(-122.296,37.615),(-122.2953,37.598)]                                                                                                                                                                                                                                                                                                                                                                                                       | Berkeley
 - 82nd                          Ave  | [(-122.1695,37.596),(-122.1681,37.603)]                                                                                                                                                                                                                                                                                                                                                                                                      | Berkeley
 - 85th                          Ave  | [(-122.1877,37.466),(-122.186,37.476)]                                                                                                                                                                                                                                                                                                                                                                                                       | Oakland
 - 89th                          Ave  | [(-122.1822,37.459),(-122.1803,37.471)]                                                                                                                                                                                                                                                                                                                                                                                                      | Oakland
 - 98th                          Ave  | [(-122.1568,37.498),(-122.1558,37.502)]                                                                                                                                                                                                                                                                                                                                                                                                      | Oakland
 - 98th                          Ave  | [(-122.1693,37.438),(-122.1682,37.444)]                                                                                                                                                                                                                                                                                                                                                                                                      | Oakland
 - 98th                          Ave  | [(-122.2001,37.258),(-122.1974,37.27)]                                                                                                                                                                                                                                                                                                                                                                                                       | Lafayette
  (333 rows)
  
- SELECT name, #thepath FROM iexit ORDER BY 1, 2;
-  name | ?column? 
- ------+----------
- (0 rows)
 -SELECT name, #thepath FROM iexit ORDER BY name COLLATE "C", 2;
 -                name                | ?column? 
 -------------------------------------+----------
 - I- 580                             |        2
 - I- 580                             |        2
 - I- 580                             |        2
 - I- 580                             |        2
 - I- 580                             |        2
 - I- 580                             |        2
 - I- 580                             |        2
 - I- 580                             |        2
 - I- 580                             |        2
 - I- 580                             |        2
 - I- 580                             |        2
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        3
 - I- 580                             |        4
 - I- 580                             |        4
 - I- 580                             |        4
 - I- 580                             |        4
 - I- 580                             |        5
 - I- 580                             |        5
 - I- 580                             |        5
 - I- 580                             |        5
 - I- 580                             |        5
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        6
 - I- 580                             |        7
 - I- 580                             |        7
 - I- 580                             |        7
 - I- 580                             |        7
 - I- 580                             |        7
 - I- 580                             |        7
 - I- 580                             |        7
 - I- 580                             |        8
 - I- 580                             |        8
 - I- 580                             |        8
 - I- 580                             |        8
 - I- 580                             |        8
 - I- 580                             |        8
 - I- 580                             |        8
 - I- 580                             |        8
 - I- 580                             |        8
 - I- 580                             |        9
 - I- 580                             |        9
 - I- 580                             |        9
 - I- 580                             |        9
 - I- 580                             |        9
 - I- 580                             |       12
 - I- 580                             |       12
 - I- 580                             |       12
 - I- 580                             |       12
 - I- 580                             |       12
 - I- 580                             |       12
 - I- 580                             |       12
 - I- 580                             |       12
 - I- 580                             |       12
 - I- 580                             |       12
 - I- 580                             |       13
 - I- 580                             |       13
 - I- 580                             |       13
 - I- 580                             |       13
 - I- 580                             |       13
 - I- 580                             |       13
 - I- 580                             |       14
 - I- 580                             |       14
 - I- 580                             |       14
 - I- 580                             |       14
 - I- 580                             |       14
 - I- 580                             |       14
 - I- 580                             |       14
 - I- 580                             |       14
 - I- 580                             |       18
 - I- 580                             |       18
 - I- 580                             |       18
 - I- 580                             |       18
 - I- 580                             |       18
 - I- 580                             |       18
 - I- 580                             |       21
 - I- 580                             |       21
 - I- 580                             |       21
 - I- 580                             |       21
 - I- 580                             |       21
 - I- 580                             |       21
 - I- 580                             |       21
 - I- 580                             |       21
 - I- 580                             |       21
 - I- 580                             |       21
 - I- 580                             |       22
 - I- 580                             |       22
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        2
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        3
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        4
 - I- 580                        Ramp |        5
 - I- 580                        Ramp |        5
 - I- 580                        Ramp |        5
 - I- 580                        Ramp |        5
 - I- 580                        Ramp |        5
 - I- 580                        Ramp |        5
 - I- 580                        Ramp |        5
 - I- 580                        Ramp |        5
 - I- 580                        Ramp |        5
 - I- 580                        Ramp |        5
 - I- 580                        Ramp |        5
 - I- 580                        Ramp |        5
 - I- 580                        Ramp |        5
 - I- 580                        Ramp |        5
 - I- 580                        Ramp |        6
 - I- 580                        Ramp |        6
 - I- 580                        Ramp |        6
 - I- 580                        Ramp |        7
 - I- 580                        Ramp |        8
 - I- 580                        Ramp |        8
 - I- 580                        Ramp |        8
 - I- 580                        Ramp |        8
 - I- 580                        Ramp |        8
 - I- 580                        Ramp |        8
 - I- 580/I-680                  Ramp |        2
 - I- 580/I-680                  Ramp |        2
 - I- 580/I-680                  Ramp |        2
 - I- 580/I-680                  Ramp |        2
 - I- 580/I-680                  Ramp |        2
 - I- 580/I-680                  Ramp |        2
 - I- 580/I-680                  Ramp |        4
 - I- 580/I-680                  Ramp |        4
 - I- 580/I-680                  Ramp |        4
 - I- 580/I-680                  Ramp |        4
 - I- 580/I-680                  Ramp |        5
 - I- 580/I-680                  Ramp |        6
 - I- 580/I-680                  Ramp |        6
 - I- 580/I-680                  Ramp |        6
 - I- 680                             |        2
 - I- 680                             |        2
 - I- 680                             |        2
 - I- 680                             |        2
 - I- 680                             |        2
 - I- 680                             |        2
 - I- 680                             |        2
 - I- 680                             |        3
 - I- 680                             |        3
 - I- 680                             |        3
 - I- 680                             |        4
 - I- 680                             |        4
 - I- 680                             |        4
 - I- 680                             |        5
 - I- 680                             |        5
 - I- 680                             |        5
 - I- 680                             |        7
 - I- 680                             |        7
 - I- 680                             |        7
 - I- 680                             |        7
 - I- 680                             |        8
 - I- 680                             |        8
 - I- 680                             |        8
 - I- 680                             |        8
 - I- 680                             |       10
 - I- 680                             |       10
 - I- 680                             |       10
 - I- 680                             |       10
 - I- 680                             |       10
 - I- 680                             |       10
 - I- 680                             |       10
 - I- 680                             |       16
 - I- 680                             |       16
 - I- 680                             |       16
 - I- 680                             |       16
 - I- 680                             |       16
 - I- 680                             |       16
 - I- 680                             |       16
 - I- 680                             |       16
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        2
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        3
 - I- 680                        Ramp |        4
 - I- 680                        Ramp |        4
 - I- 680                        Ramp |        4
 - I- 680                        Ramp |        5
 - I- 680                        Ramp |        5
 - I- 680                        Ramp |        5
 - I- 680                        Ramp |        5
 - I- 680                        Ramp |        5
 - I- 680                        Ramp |        5
 - I- 680                        Ramp |        6
 - I- 680                        Ramp |        6
 - I- 680                        Ramp |        6
 - I- 680                        Ramp |        6
 - I- 680                        Ramp |        7
 - I- 680                        Ramp |        7
 - I- 680                        Ramp |        7
 - I- 680                        Ramp |        7
 - I- 680                        Ramp |        8
 - I- 680                        Ramp |        8
 - I- 680                        Ramp |        8
 - I- 680                        Ramp |        8
 - I- 80                              |        2
 - I- 80                              |        2
 - I- 80                              |        2
 - I- 80                              |        2
 - I- 80                              |        2
 - I- 80                              |        2
 - I- 80                              |        2
 - I- 80                              |        2
 - I- 80                              |        2
 - I- 80                              |        2
 - I- 80                              |        2
 - I- 80                              |        2
 - I- 80                              |        2
 - I- 80                              |        2
 - I- 80                              |        3
 - I- 80                              |        3
 - I- 80                              |        3
 - I- 80                              |        4
 - I- 80                              |        4
 - I- 80                              |        4
 - I- 80                              |        4
 - I- 80                              |        4
 - I- 80                              |        5
 - I- 80                              |        5
 - I- 80                              |        5
 - I- 80                              |        5
 - I- 80                              |        5
 - I- 80                              |        5
 - I- 80                              |        5
 - I- 80                              |        5
 - I- 80                              |        5
 - I- 80                              |       11
 - I- 80                              |       11
 - I- 80                              |       11
 - I- 80                              |       11
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        2
 - I- 80                         Ramp |        3
 - I- 80                         Ramp |        3
 - I- 80                         Ramp |        3
 - I- 80                         Ramp |        3
 - I- 80                         Ramp |        3
 - I- 80                         Ramp |        3
 - I- 80                         Ramp |        3
 - I- 80                         Ramp |        3
 - I- 80                         Ramp |        3
 - I- 80                         Ramp |        4
 - I- 80                         Ramp |        4
 - I- 80                         Ramp |        4
 - I- 80                         Ramp |        4
 - I- 80                         Ramp |        5
 - I- 80                         Ramp |        5
 - I- 80                         Ramp |        5
 - I- 80                         Ramp |        5
 - I- 80                         Ramp |        5
 - I- 80                         Ramp |        5
 - I- 80                         Ramp |        5
 - I- 80                         Ramp |        7
 - I- 80                         Ramp |        7
 - I- 80                         Ramp |        7
 - I- 80                         Ramp |        7
 - I- 880                             |        2
 - I- 880                             |        2
 - I- 880                             |        2
 - I- 880                             |        2
 - I- 880                             |        2
 - I- 880                             |        5
 - I- 880                             |        5
 - I- 880                             |        5
 - I- 880                             |        5
 - I- 880                             |        5
 - I- 880                             |        5
 - I- 880                             |        6
 - I- 880                             |        6
 - I- 880                             |        6
 - I- 880                             |        6
 - I- 880                             |        6
 - I- 880                             |        6
 - I- 880                             |        6
 - I- 880                             |        6
 - I- 880                             |        6
 - I- 880                             |        6
 - I- 880                             |        6
 - I- 880                             |        6
 - I- 880                             |        6
 - I- 880                             |        6
 - I- 880                             |        7
 - I- 880                             |        7
 - I- 880                             |        7
 - I- 880                             |        7
 - I- 880                             |        7
 - I- 880                             |        7
 - I- 880                             |        7
 - I- 880                             |        9
 - I- 880                             |        9
 - I- 880                             |        9
 - I- 880                             |        9
 - I- 880                             |        9
 - I- 880                             |        9
 - I- 880                             |        9
 - I- 880                             |       10
 - I- 880                             |       10
 - I- 880                             |       10
 - I- 880                             |       10
 - I- 880                             |       10
 - I- 880                             |       10
 - I- 880                             |       10
 - I- 880                             |       10
 - I- 880                             |       10
 - I- 880                             |       10
 - I- 880                             |       10
 - I- 880                             |       10
 - I- 880                             |       12
 - I- 880                             |       12
 - I- 880                             |       12
 - I- 880                             |       12
 - I- 880                             |       12
 - I- 880                             |       12
 - I- 880                             |       12
 - I- 880                             |       12
 - I- 880                             |       12
 - I- 880                             |       12
 - I- 880                             |       12
 - I- 880                             |       13
 - I- 880                             |       13
 - I- 880                             |       13
 - I- 880                             |       13
 - I- 880                             |       13
 - I- 880                             |       13
 - I- 880                             |       13
 - I- 880                             |       13
 - I- 880                             |       13
 - I- 880                             |       13
 - I- 880                             |       13
 - I- 880                             |       13
 - I- 880                             |       14
 - I- 880                             |       14
 - I- 880                             |       14
 - I- 880                             |       14
 - I- 880                             |       14
 - I- 880                             |       14
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       17
 - I- 880                             |       19
 - I- 880                             |       19
 - I- 880                             |       19
 - I- 880                             |       19
 - I- 880                             |       19
 - I- 880                             |       19
 - I- 880                             |       19
 - I- 880                             |       19
 - I- 880                             |       19
 - I- 880                             |       19
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        2
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        3
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        4
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        5
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        6
 - I- 880                        Ramp |        8
 - I- 880                        Ramp |        8
 - I- 880                        Ramp |        8
 - I- 980                             |        2
 - I- 980                             |        2
 - I- 980                             |        2
 - I- 980                             |        2
 - I- 980                             |        2
 - I- 980                             |        2
 - I- 980                             |        2
 - I- 980                             |        2
 - I- 980                             |        3
 - I- 980                             |        3
 - I- 980                             |        3
 - I- 980                             |        3
 - I- 980                             |        3
 - I- 980                             |        3
 - I- 980                             |        3
 - I- 980                             |        3
 - I- 980                             |        3
 - I- 980                             |        4
 - I- 980                             |        4
 - I- 980                             |        5
 - I- 980                             |        5
 - I- 980                             |        7
 - I- 980                             |        7
 - I- 980                             |        7
 - I- 980                             |        7
 - I- 980                             |       12
 - I- 980                        Ramp |        3
 - I- 980                        Ramp |        3
 - I- 980                        Ramp |        3
 - I- 980                        Ramp |        7
 -(896 rows)
  
  SELECT * FROM toyemp WHERE name = 'sharon';
    name  | age | location | annualsal 
index d637f051cc8ac86a162038e96a28efb738a61d33,16c12f3434a05429d3e3619603da2fcadaee4b56..b57471ca5c189901a88ce2131e019760c8f2fa99
@@@ -518,23 -782,24 +783,44 @@@ SELECT * FROM information_schema.sequen
  
  DROP USER regress_seq_user;
  DROP SEQUENCE seq;
 +create table test_seqtab (unique1 int, unique2 int);
 +insert into test_seqtab select i, i from generate_series(1,1000) s(i);
 +create temp sequence testseq;
 +select distinct(nextval('testseq'))
 +  from test_seqtab order by 1 limit 10;
 + nextval 
 +---------
 +       1
 +       2
 +       3
 +       4
 +       5
 +       6
 +       7
 +       8
 +       9
 +      10
 +(10 rows)
 +
 +drop table test_seqtab;
+ -- cache tests
+ CREATE SEQUENCE test_seq1 CACHE 10;
+ SELECT nextval('test_seq1');
+  nextval 
+ ---------
+        1
+ (1 row)
+ SELECT nextval('test_seq1');
+  nextval 
+ ---------
+        2
+ (1 row)
+ SELECT nextval('test_seq1');
+  nextval 
+ ---------
+        3
+ (1 row)
+ DROP SEQUENCE test_seq1;
Simple merge
Simple merge
index 86ab675051061d9270b7628e177e51b82cd8801b,ed7d6d8034e302362d12c0d9d00926ba0d3ba94f..c355f10c4c09b806c4976f018b2bcadee3ecc69f
@@@ -898,86 -938,106 +952,189 @@@ select nextval('ts1')
        11
  (1 row)
  
 +SELECT setseed(0);
 + setseed 
 +---------
 + 
 +(1 row)
 +
 +-- DROP TABLE IF EXISTS asd ;
 +CREATE TABLE IF NOT EXISTS asd  AS
 +SELECT clientid::numeric(20),
 + (clientid / 20 )::integer::numeric(20) as userid,
 + cts + ((random()* 3600 *24 )||'sec')::interval as cts,
 + (ARRAY['A','B','C','D','E','F'])[(random()*5+1)::integer] as state,
 + 0 as dim,
 + ((ARRAY['Cat','Dog','Duck'])[(clientid / 10  )% 3  +1 ]) ::text as app_name,
 + ((ARRAY['A','B'])[(clientid / 10  )% 2  +1 ]) ::text as platform
 + FROM generate_series('2016-01-01'::timestamp,'2016-10-01'::timestamp,interval '15 day') cts , generate_series( 1000,2000,10) clientid , generate_series(1,6) t
 +;
 +SELECT dates::timestamp as dates ,B.platform,B.app_name, B.clientid, B.userid,
 +      B.state as state
 +FROM ( VALUES
 +('2016.08.30. 08:52:43') ,('2016.08.29. 04:57:12') ,('2016.08.26. 08:15:05') ,
 +('2016.08.24. 11:49:51') ,('2016.08.22. 08:45:29') ,('2016.08.21. 04:53:47') ,('2016.08.20. 08:44:03')
 +) AS D (dates)
 +JOIN
 +( SELECT DISTINCT clientid FROM asd
 +      WHERE userid=74 ) C ON True
 +INNER JOIN LATERAL (
 +      SELECT DISTINCT ON (clientid,app_name,platform,state,dim) x.*
 +      FROM asd x
 +      INNER JOIN (SELECT p.clientid,p.app_name,p.platform , p.state, p.dim ,
 +           MAX(p.cts) AS selected_cts
 +              FROM asd p
 +              where cts<D.dates::timestamp and state in
 +              ('A','B')
 +      GROUP BY p.clientid,p.app_name,p.platform,p.state,p.dim) y
 +      ON y.clientid = x.clientid
 +      AND y.selected_cts = x.cts
 +      AND y.platform = x.platform
 +      AND y.app_name=x.app_name
 +      AND y.state=x.state
 +      AND y.dim = x.dim
 +      and x.clientid = C.clientid
 +) B ON True
 +ORDER BY dates desc, state;
 +          dates           | platform | app_name | clientid | userid | state 
 +--------------------------+----------+----------+----------+--------+-------
 + Tue Aug 30 08:52:43 2016 | A        | Dog      |     1480 |     74 | A
 + Tue Aug 30 08:52:43 2016 | B        | Duck     |     1490 |     74 | A
 + Tue Aug 30 08:52:43 2016 | A        | Dog      |     1480 |     74 | B
 + Tue Aug 30 08:52:43 2016 | B        | Duck     |     1490 |     74 | B
 + Mon Aug 29 04:57:12 2016 | A        | Dog      |     1480 |     74 | A
 + Mon Aug 29 04:57:12 2016 | B        | Duck     |     1490 |     74 | A
 + Mon Aug 29 04:57:12 2016 | A        | Dog      |     1480 |     74 | B
 + Mon Aug 29 04:57:12 2016 | B        | Duck     |     1490 |     74 | B
 + Fri Aug 26 08:15:05 2016 | B        | Duck     |     1490 |     74 | A
 + Fri Aug 26 08:15:05 2016 | A        | Dog      |     1480 |     74 | A
 + Fri Aug 26 08:15:05 2016 | B        | Duck     |     1490 |     74 | B
 + Fri Aug 26 08:15:05 2016 | A        | Dog      |     1480 |     74 | B
 + Wed Aug 24 11:49:51 2016 | A        | Dog      |     1480 |     74 | A
 + Wed Aug 24 11:49:51 2016 | B        | Duck     |     1490 |     74 | A
 + Wed Aug 24 11:49:51 2016 | A        | Dog      |     1480 |     74 | B
 + Wed Aug 24 11:49:51 2016 | B        | Duck     |     1490 |     74 | B
 + Mon Aug 22 08:45:29 2016 | B        | Duck     |     1490 |     74 | A
 + Mon Aug 22 08:45:29 2016 | A        | Dog      |     1480 |     74 | A
 + Mon Aug 22 08:45:29 2016 | B        | Duck     |     1490 |     74 | B
 + Mon Aug 22 08:45:29 2016 | A        | Dog      |     1480 |     74 | B
 + Sun Aug 21 04:53:47 2016 | B        | Duck     |     1490 |     74 | A
 + Sun Aug 21 04:53:47 2016 | A        | Dog      |     1480 |     74 | A
 + Sun Aug 21 04:53:47 2016 | B        | Duck     |     1490 |     74 | B
 + Sun Aug 21 04:53:47 2016 | A        | Dog      |     1480 |     74 | B
 + Sat Aug 20 08:44:03 2016 | A        | Dog      |     1480 |     74 | A
 + Sat Aug 20 08:44:03 2016 | B        | Duck     |     1490 |     74 | A
 + Sat Aug 20 08:44:03 2016 | B        | Duck     |     1490 |     74 | B
 + Sat Aug 20 08:44:03 2016 | A        | Dog      |     1480 |     74 | B
 +(28 rows)
 +
 +DROP TABLE asd;
 +SELECT setseed(0);
 + setseed 
 +---------
 + 
 +(1 row)
 +
+ --
+ -- Check that volatile quals aren't pushed down past a set-returning function;
+ -- while a nonvolatile qual can be, if it doesn't reference the SRF.
+ --
+ create function tattle(x int, y int) returns bool
+ volatile language plpgsql as $$
+ begin
+   raise notice 'x = %, y = %', x, y;
+   return x > y;
+ end$$;
+ explain (verbose, costs off)
+ select * from
+   (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+   where tattle(x, 8);
+                         QUERY PLAN                        
+ ----------------------------------------------------------
+  Subquery Scan on ss
+    Output: x, u
+    Filter: tattle(ss.x, 8)
+    ->  ProjectSet
+          Output: 9, unnest('{1,2,3,11,12,13}'::integer[])
+          ->  Result
+ (6 rows)
+ select * from
+   (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+   where tattle(x, 8);
+ NOTICE:  x = 9, y = 8
+ NOTICE:  x = 9, y = 8
+ NOTICE:  x = 9, y = 8
+ NOTICE:  x = 9, y = 8
+ NOTICE:  x = 9, y = 8
+ NOTICE:  x = 9, y = 8
+  x | u  
+ ---+----
+  9 |  1
+  9 |  2
+  9 |  3
+  9 | 11
+  9 | 12
+  9 | 13
+ (6 rows)
+ -- if we pretend it's stable, we get different results:
+ alter function tattle(x int, y int) stable;
+ explain (verbose, costs off)
+ select * from
+   (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+   where tattle(x, 8);
+                      QUERY PLAN                     
+ ----------------------------------------------------
+  ProjectSet
+    Output: 9, unnest('{1,2,3,11,12,13}'::integer[])
+    ->  Result
+          One-Time Filter: tattle(9, 8)
+ (4 rows)
+ select * from
+   (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+   where tattle(x, 8);
+ NOTICE:  x = 9, y = 8
+  x | u  
+ ---+----
+  9 |  1
+  9 |  2
+  9 |  3
+  9 | 11
+  9 | 12
+  9 | 13
+ (6 rows)
+ -- although even a stable qual should not be pushed down if it references SRF
+ explain (verbose, costs off)
+ select * from
+   (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+   where tattle(x, u);
+                         QUERY PLAN                        
+ ----------------------------------------------------------
+  Subquery Scan on ss
+    Output: x, u
+    Filter: tattle(ss.x, ss.u)
+    ->  ProjectSet
+          Output: 9, unnest('{1,2,3,11,12,13}'::integer[])
+          ->  Result
+ (6 rows)
+ select * from
+   (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+   where tattle(x, u);
+ NOTICE:  x = 9, y = 1
+ NOTICE:  x = 9, y = 2
+ NOTICE:  x = 9, y = 3
+ NOTICE:  x = 9, y = 11
+ NOTICE:  x = 9, y = 12
+ NOTICE:  x = 9, y = 13
+  x | u 
+ ---+---
+  9 | 1
+  9 | 2
+  9 | 3
+ (3 rows)
+ drop function tattle(x int, y int);
Simple merge
Simple merge
index 7751fdaa5a25caec1bff08a04e00454a4d23ac91,5c4edd1c16645b30daf530e8c8f05839dc46d2ee..7e56416dc76c7f57dc365581c94ed1dd05e534c3
@@@ -363,16 -385,16 +388,17 @@@ SELECT q1 FROM int8_tbl INTERSECT (((SE
   4567890123456789
  (2 rows)
  
 -(((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl ORDER BY 1))) UNION ALL SELECT q2 FROM int8_tbl;
 +(((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl))) UNION ALL SELECT q2 FROM int8_tbl ORDER BY 1;
          q1         
  -------------------
 + -4567890123456789
 +               123
                 123
+   4567890123456789
                 456
    4567890123456789
 -               123
    4567890123456789
 - -4567890123456789
 +  4567890123456789
  (7 rows)
  
  SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index caf598689073131d78904a7e9a013464cab7222d,aaedf5f248665405fab2862fde6a4b57f0e72809..d5ef1715f173f9ba4e651acb95744fe4a8abb171
@@@ -101,9 -241,11 +241,10 @@@ NOTICE:  no matching relations in table
  -- Should succeed
  DROP TABLESPACE regress_tblspace_renamed;
  DROP SCHEMA testschema CASCADE;
 -NOTICE:  drop cascades to 5 other objects
 +NOTICE:  drop cascades to 3 other objects
  DETAIL:  drop cascades to table testschema.foo
  drop cascades to table testschema.asselect
 -drop cascades to table testschema.asexecute
  drop cascades to table testschema.atable
+ drop cascades to table testschema.tablespace_acl
  DROP ROLE regress_tablespace_user1;
  DROP ROLE regress_tablespace_user2;
index d1a33b97a6aa7881c3abe8352bcf7ddce161720e,1f8f0987e380f856ae1ae5ee951c5e20fe314052..4e8f4c7bdd6b25bc200e9eceff12ef29c2e8472b
@@@ -113,8 -109,13 +119,13 @@@ test: rangefunc
  # NB: temp.sql does a reconnect which transiently uses 2 connections,
  # so keep this parallel group to at most 19 tests
  # ----------
 -test: plancache limit plpgsql copy2 temp domain rangefuncs prepare without_oid conversion truncate alter_table sequence polymorphism rowtypes returning largeobject with xml
 +test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion truncate alter_table sequence polymorphism rowtypes returning largeobject with xml
  
+ # ----------
+ # Another group of parallel tests
+ # ----------
+ test: identity
  # event triggers cannot run concurrently with any test that runs DDL
  test: event_trigger
  
index 8e190bc73b73c047671febb727152f4c5398e370,b685aeb1fa6922f654fa89a0db033a3dba59e81d..72899c1ae6052cbe2e5fea668121680bf11709dc
@@@ -3050,16 -2245,8 +3080,16 @@@ regression_main(int argc, char *argv[]
  
                /* initdb */
                header(_("initializing database system"));
 +#ifdef PGXC
 +              /* Initialize nodes and GTM */
 +              initdb_node(PGXC_GTM);
 +              initdb_node(PGXC_COORD_1);
 +              initdb_node(PGXC_COORD_2);
 +              initdb_node(PGXC_DATANODE_1);
 +              initdb_node(PGXC_DATANODE_2);
 +#else
                snprintf(buf, sizeof(buf),
-                                "\"%s%sinitdb\" -D \"%s/data\" --noclean --nosync%s%s > \"%s/log/initdb.log\" 2>&1",
+                                "\"%s%sinitdb\" -D \"%s/data\" --no-clean --no-sync%s%s > \"%s/log/initdb.log\" 2>&1",
                                 bindir ? bindir : "",
                                 bindir ? "/" : "",
                                 temp_instance,
Simple merge
Simple merge
Simple merge
index 0b52fe9ad8e53384f83bb33ec2c2e5793851cdef,25dd4e2c6dedd0ea89ad1ecdaf9fd3e9184bcd4d..943e755c2a982a30ab498277c6b6058bb2364df7
@@@ -338,14 -372,11 +378,16 @@@ select 33 = all (null::int[])
  select null::int = all ('{1,2,3}');
  select 33 = all ('{1,null,3}');
  select 33 = all ('{33,null,33}');
+ -- nulls later in the bitmap
+ SELECT -1 != ALL(ARRAY(SELECT NULLIF(g.i, 900) FROM generate_series(1,1000) g(i)));
  
  -- test indexes on arrays
 -create temp table arr_tbl (f1 int[] unique);
 +-- PGXCTODO: related to feature request 3520520, this distribution type is changed
 +-- to replication. As integer arrays are no available distribution types, this table
 +-- should use roundrobin distribution if nothing is specified but roundrobin
 +-- distribution cannot be safely used to check constraints on remote nodes.
 +-- When global constraints are supported, this replication distribution should be removed.
 +create temp table arr_tbl (f1 int[] unique) distribute by replication;
  insert into arr_tbl values ('{1,2,3}');
  insert into arr_tbl values ('{1,2}');
  -- failure expected:
Simple merge
Simple merge
Simple merge
index 709ca4d5b00f73e103ff6a9778cd960ff820adac,4faea36f41a45906c49a9bebfe73ff81e3d3ad8e..c2cdee72351049ac4e0f4b491f475484022e033d
@@@ -90,4 -90,22 +90,22 @@@ SELECT ctid,cmin,* FROM combocidtest OR
  
  COMMIT;
  
 -SELECT ctid,cmin,* FROM combocidtest;
 +SELECT ctid,cmin,* FROM combocidtest ORDER BY ctid;
+ -- test for bug reported in
+ -- CABRT9RC81YUf1=jsmWopcKJEro=VoeG2ou6sPwyOUTx_qteRsg@mail.gmail.com
+ CREATE TABLE IF NOT EXISTS testcase(
+       id int PRIMARY KEY,
+       balance numeric
+ );
+ INSERT INTO testcase VALUES (1, 0);
+ BEGIN;
+ SELECT * FROM testcase WHERE testcase.id = 1 FOR UPDATE;
+ UPDATE testcase SET balance = balance + 400 WHERE id=1;
+ SAVEPOINT subxact;
+ UPDATE testcase SET balance = balance - 100 WHERE id=1;
+ ROLLBACK TO SAVEPOINT subxact;
+ -- should return one tuple
+ SELECT * FROM testcase WHERE id = 1 FOR UPDATE;
+ ROLLBACK;
+ DROP TABLE testcase;
Simple merge
Simple merge
Simple merge
index 3069b2bd066cc1a3befb942f90cfbdcdbcf467cb,557040bbe7d374883300775fdff22b8e31743931..070794fd5e6568c8dc0f9f7626c905bad6103eee
@@@ -37,7 -37,21 +37,21 @@@ INSERT INTO inhg VALUES ('x', 'foo',  '
  SELECT * FROM inhg; /* Two records with three columns in order x=x, xx=text, y=y */
  DROP TABLE inhg;
  
 -CREATE TABLE inhg (x text, LIKE inhx INCLUDING INDEXES, y text); /* copies indexes */
+ CREATE TABLE test_like_id_1 (a int GENERATED ALWAYS AS IDENTITY, b text);
+ \d test_like_id_1
+ INSERT INTO test_like_id_1 (b) VALUES ('b1');
+ SELECT * FROM test_like_id_1;
+ CREATE TABLE test_like_id_2 (LIKE test_like_id_1);
+ \d test_like_id_2
+ INSERT INTO test_like_id_2 (b) VALUES ('b2');
+ SELECT * FROM test_like_id_2;  -- identity was not copied
+ CREATE TABLE test_like_id_3 (LIKE test_like_id_1 INCLUDING IDENTITY);
+ \d test_like_id_3
+ INSERT INTO test_like_id_3 (b) VALUES ('b3');
+ SELECT * FROM test_like_id_3;  -- identity was copied and applied
+ DROP TABLE test_like_id_1, test_like_id_2, test_like_id_3;
 +CREATE TABLE inhg (x text, LIKE inhx INCLUDING INDEXES, y text) DISTRIBUTE BY REPLICATION; /* copies indexes */
  INSERT INTO inhg VALUES (5, 10);
  INSERT INTO inhg VALUES (20, 10); -- should fail
  DROP TABLE inhg;
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index b708b5d5ec095e5d9f488650d3cf2b96c9dde61e,5f19dad03cd7c3122268345ef1480988a27e33f0..f8f0b566fb3f6f249376de33b29a67d4be3a9bb8
@@@ -1019,7 -1020,38 +1019,42 @@@ create rule r1 as on delete to t1 do de
  explain (costs off) delete from t1 where a = 1;
  delete from t1 where a = 1;
  
 +drop rule r1 on t1;
 +
 +explain (costs off, nodes off) delete from t1 where a = 1;
 +delete from t1 where a = 1;
+ --
+ -- Test deferred FK check on a tuple deleted by a rolled-back subtransaction
+ --
+ create table pktable2(f1 int primary key);
+ create table fktable2(f1 int references pktable2 deferrable initially deferred);
+ insert into pktable2 values(1);
+ begin;
+ insert into fktable2 values(1);
+ savepoint x;
+ delete from fktable2;
+ rollback to x;
+ commit;
+ begin;
+ insert into fktable2 values(2);
+ savepoint x;
+ delete from fktable2;
+ rollback to x;
+ commit; -- fail
+ --
+ -- Test that we prevent dropping FK constraint with pending trigger events
+ --
+ begin;
+ insert into fktable2 values(2);
+ alter table fktable2 drop constraint fktable2_f1_fkey;
+ commit;
+ begin;
+ delete from pktable2 where f1 = 1;
+ alter table fktable2 drop constraint fktable2_f1_fkey;
+ commit;
+ drop table pktable2, fktable2;
Simple merge
index e3b8b3bc9ab483416c1983f3dea9caccad1f6dec,880e115360d98356add13e2949d428ed4ddee035..ca6dcc416ad9b32fc93c3895bc268299234cb3fa
@@@ -93,13 -93,36 +93,36 @@@ SELECT i FROM inet_tbl WHERE i << '192.
  SET enable_seqscan TO on;
  DROP INDEX inet_idx2;
  
+ -- check that spgist index works correctly
+ CREATE INDEX inet_idx3 ON inet_tbl using spgist (i);
+ SET enable_seqscan TO off;
+ SELECT * FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i <<= '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i && '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i >>= '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i >> '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i < '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i <= '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i = '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i >= '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i > '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i <> '192.168.1.0/24'::cidr ORDER BY i;
+ -- test index-only scans
+ EXPLAIN (COSTS OFF)
+ SELECT i FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT i FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+ SET enable_seqscan TO on;
+ DROP INDEX inet_idx3;
  -- simple tests of inet boolean and arithmetic operators
 -SELECT i, ~i AS "~i" FROM inet_tbl;
 -SELECT i, c, i & c AS "and" FROM inet_tbl;
 -SELECT i, c, i | c AS "or" FROM inet_tbl;
 -SELECT i, i + 500 AS "i+500" FROM inet_tbl;
 -SELECT i, i - 500 AS "i-500" FROM inet_tbl;
 -SELECT i, c, i - c AS "minus" FROM inet_tbl;
 +SELECT i, ~i AS "~i" FROM inet_tbl ORDER BY i;
 +SELECT i, c, i & c AS "and" FROM inet_tbl ORDER BY i, c;
 +SELECT i, c, i | c AS "or" FROM inet_tbl ORDER BY i, c;
 +SELECT i, i + 500 AS "i+500" FROM inet_tbl ORDER BY i;
 +SELECT i, i - 500 AS "i-500" FROM inet_tbl ORDER BY i;
 +SELECT i, c, i - c AS "minus" FROM inet_tbl ORDER BY i, c;
  SELECT '127.0.0.1'::inet + 257;
  SELECT ('127.0.0.1'::inet + 257) - 257;
  SELECT '127::1'::inet + 257;
index e28632a33e0f860301082001c7627ac7ac6dccb4,70fe971d51f8c03db8a8b00fcf716a3ccf798a75..39e81dc966b2c02c20de120ad6a8837fd36e3b02
@@@ -195,8 -126,36 +195,36 @@@ fro
    ( select f1 from foo union all select f1+3 from foo ) ss
  where bar.f1 = ss.f1;
  
 -select tableoid::regclass::text as relname, bar.* from bar order by 1,2;
 +--select tableoid::regclass::text as relname, bar.* from bar order by 1,2;
  
+ -- Check UPDATE with *partitioned* inherited target and an appendrel subquery
+ create table some_tab (a int);
+ insert into some_tab values (0);
+ create table some_tab_child () inherits (some_tab);
+ insert into some_tab_child values (1);
+ create table parted_tab (a int, b char) partition by list (a);
+ create table parted_tab_part1 partition of parted_tab for values in (1);
+ create table parted_tab_part2 partition of parted_tab for values in (2);
+ create table parted_tab_part3 partition of parted_tab for values in (3);
+ insert into parted_tab values (1, 'a'), (2, 'a'), (3, 'a');
+ update parted_tab set b = 'b'
+ from
+   (select a from some_tab union all select a+1 from some_tab) ss (a)
+ where parted_tab.a = ss.a;
+ select tableoid::regclass::text as relname, parted_tab.* from parted_tab order by 1,2;
+ truncate parted_tab;
+ insert into parted_tab values (1, 'a'), (2, 'a'), (3, 'a');
+ update parted_tab set b = 'b'
+ from
+   (select 0 from parted_tab union all select 1 from parted_tab) ss (a)
+ where parted_tab.a = ss.a;
+ select tableoid::regclass::text as relname, parted_tab.* from parted_tab order by 1,2;
+ drop table parted_tab;
+ drop table some_tab cascade;
  /* Test multiple inheritance of column defaults */
  
  CREATE TABLE firstparent (tomorrow date default now()::date + 1);
Simple merge
Simple merge
index a28e44c844d902ec6402645fa41120209f28be0e,c3994ea531ce9a5492709362fd342b9565cf9cb1..83c2b5f5e3685d24703ad2c1de2d4b2f7b6e7447
@@@ -1576,13 -1534,15 +1576,15 @@@ select count(*) from tenk1 a
    tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x;
  
  -- lateral injecting a strange outer join condition
 -explain (costs off)
 +explain (num_nodes off, nodes off, costs off)
    select * from int8_tbl a,
      int8_tbl x left join lateral (select a.q1 from int4_tbl y) ss(z)
-       on x.q2 = ss.z;
+       on x.q2 = ss.z
+   order by a.q1, a.q2, x.q1, x.q2, ss.z;
  select * from int8_tbl a,
    int8_tbl x left join lateral (select a.q1 from int4_tbl y) ss(z)
-     on x.q2 = ss.z;
+     on x.q2 = ss.z
+   order by a.q1, a.q2, x.q1, x.q2, ss.z;
  
  -- lateral reference to a join alias variable
  select * from (select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1,
@@@ -1776,12 -1733,148 +1778,158 @@@ delete from xx1 using (select * from in
  delete from xx1 using (select * from int4_tbl where f1 = xx1.x1) ss;
  delete from xx1 using lateral (select * from int4_tbl where f1 = x1) ss;
  
 +-- demonstrate problem with extrememly slow join
 +CREATE TABLE testr (a int, b int) DISTRIBUTE BY REPLICATION;
 +INSERT INTO testr SELECT generate_series(1, 10000), generate_series(5001, 15000);
 +CREATE TABLE testh (a int, b int);
 +INSERT INTO testh SELECT generate_series(1, 10000), generate_series(8001, 18000);
 +set enable_mergejoin TO false;
 +set enable_hashjoin TO false;
 +EXPLAIN (VERBOSE, COSTS OFF) SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b);
 +SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b);
++
+ --
+ -- test planner's ability to mark joins as unique
+ --
+ create table j1 (id int primary key);
+ create table j2 (id int primary key);
+ create table j3 (id int);
+ insert into j1 values(1),(2),(3);
+ insert into j2 values(1),(2),(3);
+ insert into j3 values(1),(1);
+ analyze j1;
+ analyze j2;
+ analyze j3;
+ -- ensure join is properly marked as unique
+ explain (verbose, costs off)
+ select * from j1 inner join j2 on j1.id = j2.id;
+ -- ensure join is not unique when not an equi-join
+ explain (verbose, costs off)
+ select * from j1 inner join j2 on j1.id > j2.id;
+ -- ensure non-unique rel is not chosen as inner
+ explain (verbose, costs off)
+ select * from j1 inner join j3 on j1.id = j3.id;
+ -- ensure left join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 left join j2 on j1.id = j2.id;
+ -- ensure right join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 right join j2 on j1.id = j2.id;
+ -- ensure full join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 full join j2 on j1.id = j2.id;
+ -- a clauseless (cross) join can't be unique
+ explain (verbose, costs off)
+ select * from j1 cross join j2;
+ -- ensure a natural join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 natural join j2;
+ -- ensure a distinct clause allows the inner to become unique
+ explain (verbose, costs off)
+ select * from j1
+ inner join (select distinct id from j3) j3 on j1.id = j3.id;
+ -- ensure group by clause allows the inner to become unique
+ explain (verbose, costs off)
+ select * from j1
+ inner join (select id from j3 group by id) j3 on j1.id = j3.id;
+ drop table j1;
+ drop table j2;
+ drop table j3;
+ -- test more complex permutations of unique joins
+ create table j1 (id1 int, id2 int, primary key(id1,id2));
+ create table j2 (id1 int, id2 int, primary key(id1,id2));
+ create table j3 (id1 int, id2 int, primary key(id1,id2));
+ insert into j1 values(1,1),(1,2);
+ insert into j2 values(1,1);
+ insert into j3 values(1,1);
+ analyze j1;
+ analyze j2;
+ analyze j3;
+ -- ensure there's no unique join when not all columns which are part of the
+ -- unique index are seen in the join clause
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1;
+ -- ensure proper unique detection with multiple join quals
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2;
+ -- ensure we don't detect the join to be unique when quals are not part of the
+ -- join condition
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1 where j1.id2 = 1;
+ -- as above, but for left joins.
+ explain (verbose, costs off)
+ select * from j1
+ left join j2 on j1.id1 = j2.id1 where j1.id2 = 1;
+ -- validate logic in merge joins which skips mark and restore.
+ -- it should only do this if all quals which were used to detect the unique
+ -- are present as join quals, and not plain quals.
+ set enable_nestloop to 0;
+ set enable_hashjoin to 0;
+ set enable_sort to 0;
+ -- create an index that will be preferred over the PK to perform the join
+ create index j1_id1_idx on j1 (id1) where id1 % 1000 = 1;
+ explain (costs off) select * from j1 j1
+ inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
+ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
+ select * from j1 j1
+ inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
+ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
+ reset enable_nestloop;
+ reset enable_hashjoin;
+ reset enable_sort;
+ drop table j1;
+ drop table j2;
+ drop table j3;
+ -- check that semijoin inner is not seen as unique for a portion of the outerrel
+ explain (verbose, costs off)
+ select t1.unique1, t2.hundred
+ from onek t1, tenk1 t2
+ where exists (select 1 from tenk1 t3
+               where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred)
+       and t1.unique1 < 1;
+ -- ... unless it actually is unique
+ create table j3 as select unique1, tenthous from onek;
+ vacuum analyze j3;
+ create unique index on j3(unique1, tenthous);
+ explain (verbose, costs off)
+ select t1.unique1, t2.hundred
+ from onek t1, tenk1 t2
+ where exists (select 1 from j3
+               where j3.unique1 = t1.unique1 and j3.tenthous = t2.hundred)
+       and t1.unique1 < 1;
+ drop table j3;
Simple merge
Simple merge
Simple merge
index 1aad4f54b019abbf80b2e0a268cdc6f9cb1363c3,37b9ecce1fc4fea57bfc79d41ef99e7d80892d9a..2e96592e1a4e95100752fe00dc4edd8a3dc830d3
@@@ -63,26 -115,13 +115,29 @@@ SELECT 12345678901234567::money
  SELECT (-12345)::money;
  SELECT (-1234567890)::money;
  SELECT (-12345678901234567)::money;
- SELECT (-123456789012345678)::money;
- SELECT (-9223372036854775808)::money;
  SELECT 1234567890::int4::money;
  SELECT 12345678901234567::int8::money;
+ SELECT 12345678901234567::numeric::money;
  SELECT (-1234567890)::int4::money;
  SELECT (-12345678901234567)::int8::money;
 +
 +INSERT INTO money_data VALUES ('$223.459');
 +INSERT INTO money_data VALUES ('$323.459');
 +INSERT INTO money_data VALUES ('$423.459');
 +INSERT INTO money_data VALUES ('$523.459');
 +SELECT sum(m) FROM money_data;
 +
 +CREATE TABLE money_data2 (a int, m money);
 +INSERT INTO money_data2 VALUES (1, '$123.459');
 +INSERT INTO money_data2 VALUES (2, '$223.459');
 +INSERT INTO money_data2 VALUES (3, '$323.459');
 +INSERT INTO money_data2 VALUES (4, '$423.459');
 +INSERT INTO money_data2 VALUES (5, '$523.459');
 +SELECT sum(m) FROM money_data2;
 +DROP TABLE money_data2;
++
+ SELECT (-12345678901234567)::numeric::money;
+ -- Cast from money
+ SELECT '12345678901234567'::money::numeric;
+ SELECT '-12345678901234567'::money::numeric;
Simple merge
Simple merge
index c6c0bac9cae1177911a66790bba5788ad4111d1f,60d1d38e346023a8e30cb6fcefe890588d4a955e..7bc74fbebfdc00f9cac5ce6732b17a2f10cb4760
@@@ -4516,24 -4431,338 +4518,360 @@@ exception when others the
  end;
  $$;
  
 +
 +-- Check parameter handling
 +BEGIN;
 +DROP TABLE IF EXISTS testcase_13;
 +CREATE TABLE testcase_13 (patient_id integer);
 +INSERT INTO testcase_13 VALUES (1);
 +DO $$
 +DECLARE
 + r RECORD;
 +BEGIN
 +FOR r IN SELECT * FROM testcase_13 LOOP
 +    RAISE INFO 'r.patient_id=%', r.patient_id;
 +    IF   (SELECT EXISTS (
 +            SELECT FROM testcase_13 WHERE patient_id = r.patient_id
 +        ))
 +       THEN
 +          RAISE INFO 'condition true';
 +    END IF;
 +  END LOOP;
 +END $$;
 +ROLLBACK;
++
+ -- Test use of plpgsql in a domain check constraint (cf. bug #14414)
+ create function plpgsql_domain_check(val int) returns boolean as $$
+ begin return val > 0; end
+ $$ language plpgsql immutable;
+ create domain plpgsql_domain as integer check(plpgsql_domain_check(value));
+ do $$
+ declare v_test plpgsql_domain;
+ begin
+   v_test := 1;
+ end;
+ $$;
+ do $$
+ declare v_test plpgsql_domain := 1;
+ begin
+   v_test := 0;  -- fail
+ end;
+ $$;
+ -- Test handling of expanded array passed to a domain constraint (bug #14472)
+ create function plpgsql_arr_domain_check(val int[]) returns boolean as $$
+ begin return val[1] > 0; end
+ $$ language plpgsql immutable;
+ create domain plpgsql_arr_domain as int[] check(plpgsql_arr_domain_check(value));
+ do $$
+ declare v_test plpgsql_arr_domain;
+ begin
+   v_test := array[1];
+   v_test := v_test || 2;
+ end;
+ $$;
+ do $$
+ declare v_test plpgsql_arr_domain := array[1];
+ begin
+   v_test := 0 || v_test;  -- fail
+ end;
+ $$;
+ --
+ -- test usage of transition tables in AFTER triggers
+ --
+ CREATE TABLE transition_table_base (id int PRIMARY KEY, val text);
+ CREATE FUNCTION transition_table_base_ins_func()
+   RETURNS trigger
+   LANGUAGE plpgsql
+ AS $$
+ DECLARE
+   t text;
+   l text;
+ BEGIN
+   t = '';
+   FOR l IN EXECUTE
+            $q$
+              EXPLAIN (TIMING off, COSTS off, VERBOSE on)
+              SELECT * FROM newtable
+            $q$ LOOP
+     t = t || l || E'\n';
+   END LOOP;
+   RAISE INFO '%', t;
+   RETURN new;
+ END;
+ $$;
+ CREATE TRIGGER transition_table_base_ins_trig
+   AFTER INSERT ON transition_table_base
+   REFERENCING OLD TABLE AS oldtable NEW TABLE AS newtable
+   FOR EACH STATEMENT
+   EXECUTE PROCEDURE transition_table_base_ins_func();
+ CREATE TRIGGER transition_table_base_ins_trig
+   AFTER INSERT ON transition_table_base
+   REFERENCING NEW TABLE AS newtable
+   FOR EACH STATEMENT
+   EXECUTE PROCEDURE transition_table_base_ins_func();
+ INSERT INTO transition_table_base VALUES (1, 'One'), (2, 'Two');
+ INSERT INTO transition_table_base VALUES (3, 'Three'), (4, 'Four');
+ CREATE OR REPLACE FUNCTION transition_table_base_upd_func()
+   RETURNS trigger
+   LANGUAGE plpgsql
+ AS $$
+ DECLARE
+   t text;
+   l text;
+ BEGIN
+   t = '';
+   FOR l IN EXECUTE
+            $q$
+              EXPLAIN (TIMING off, COSTS off, VERBOSE on)
+              SELECT * FROM oldtable ot FULL JOIN newtable nt USING (id)
+            $q$ LOOP
+     t = t || l || E'\n';
+   END LOOP;
+   RAISE INFO '%', t;
+   RETURN new;
+ END;
+ $$;
+ CREATE TRIGGER transition_table_base_upd_trig
+   AFTER UPDATE ON transition_table_base
+   REFERENCING OLD TABLE AS oldtable NEW TABLE AS newtable
+   FOR EACH STATEMENT
+   EXECUTE PROCEDURE transition_table_base_upd_func();
+ UPDATE transition_table_base
+   SET val = '*' || val || '*'
+   WHERE id BETWEEN 2 AND 3;
+ CREATE TABLE transition_table_level1
+ (
+       level1_no serial NOT NULL ,
+       level1_node_name varchar(255),
+        PRIMARY KEY (level1_no)
+ ) WITHOUT OIDS;
+ CREATE TABLE transition_table_level2
+ (
+       level2_no serial NOT NULL ,
+       parent_no int NOT NULL,
+       level1_node_name varchar(255),
+        PRIMARY KEY (level2_no)
+ ) WITHOUT OIDS;
+ CREATE TABLE transition_table_status
+ (
+       level int NOT NULL,
+       node_no int NOT NULL,
+       status int,
+        PRIMARY KEY (level, node_no)
+ ) WITHOUT OIDS;
+ CREATE FUNCTION transition_table_level1_ri_parent_del_func()
+   RETURNS TRIGGER
+   LANGUAGE plpgsql
+ AS $$
+   DECLARE n bigint;
+   BEGIN
+     PERFORM FROM p JOIN transition_table_level2 c ON c.parent_no = p.level1_no;
+     IF FOUND THEN
+       RAISE EXCEPTION 'RI error';
+     END IF;
+     RETURN NULL;
+   END;
+ $$;
+ CREATE TRIGGER transition_table_level1_ri_parent_del_trigger
+   AFTER DELETE ON transition_table_level1
+   REFERENCING OLD TABLE AS p
+   FOR EACH STATEMENT EXECUTE PROCEDURE
+     transition_table_level1_ri_parent_del_func();
+ CREATE FUNCTION transition_table_level1_ri_parent_upd_func()
+   RETURNS TRIGGER
+   LANGUAGE plpgsql
+ AS $$
+   DECLARE
+     x int;
+   BEGIN
+     WITH p AS (SELECT level1_no, sum(delta) cnt
+                  FROM (SELECT level1_no, 1 AS delta FROM i
+                        UNION ALL
+                        SELECT level1_no, -1 AS delta FROM d) w
+                  GROUP BY level1_no
+                  HAVING sum(delta) < 0)
+     SELECT level1_no
+       FROM p JOIN transition_table_level2 c ON c.parent_no = p.level1_no
+       INTO x;
+     IF FOUND THEN
+       RAISE EXCEPTION 'RI error';
+     END IF;
+     RETURN NULL;
+   END;
+ $$;
+ CREATE TRIGGER transition_table_level1_ri_parent_upd_trigger
+   AFTER UPDATE ON transition_table_level1
+   REFERENCING OLD TABLE AS d NEW TABLE AS i
+   FOR EACH STATEMENT EXECUTE PROCEDURE
+     transition_table_level1_ri_parent_upd_func();
+ CREATE FUNCTION transition_table_level2_ri_child_insupd_func()
+   RETURNS TRIGGER
+   LANGUAGE plpgsql
+ AS $$
+   BEGIN
+     PERFORM FROM i
+       LEFT JOIN transition_table_level1 p
+         ON p.level1_no IS NOT NULL AND p.level1_no = i.parent_no
+       WHERE p.level1_no IS NULL;
+     IF FOUND THEN
+       RAISE EXCEPTION 'RI error';
+     END IF;
+     RETURN NULL;
+   END;
+ $$;
+ CREATE TRIGGER transition_table_level2_ri_child_insupd_trigger
+   AFTER INSERT OR UPDATE ON transition_table_level2
+   REFERENCING NEW TABLE AS i
+   FOR EACH STATEMENT EXECUTE PROCEDURE
+     transition_table_level2_ri_child_insupd_func();
+ -- create initial test data
+ INSERT INTO transition_table_level1 (level1_no)
+   SELECT generate_series(1,200);
+ ANALYZE transition_table_level1;
+ INSERT INTO transition_table_level2 (level2_no, parent_no)
+   SELECT level2_no, level2_no / 50 + 1 AS parent_no
+     FROM generate_series(1,9999) level2_no;
+ ANALYZE transition_table_level2;
+ INSERT INTO transition_table_status (level, node_no, status)
+   SELECT 1, level1_no, 0 FROM transition_table_level1;
+ INSERT INTO transition_table_status (level, node_no, status)
+   SELECT 2, level2_no, 0 FROM transition_table_level2;
+ ANALYZE transition_table_status;
+ INSERT INTO transition_table_level1(level1_no)
+   SELECT generate_series(201,1000);
+ ANALYZE transition_table_level1;
+ -- behave reasonably if someone tries to modify a transition table
+ CREATE FUNCTION transition_table_level2_bad_usage_func()
+   RETURNS TRIGGER
+   LANGUAGE plpgsql
+ AS $$
+   BEGIN
+     INSERT INTO d VALUES (1000000, 1000000, 'x');
+     RETURN NULL;
+   END;
+ $$;
+ CREATE TRIGGER transition_table_level2_bad_usage_trigger
+   AFTER DELETE ON transition_table_level2
+   REFERENCING OLD TABLE AS d
+   FOR EACH STATEMENT EXECUTE PROCEDURE
+     transition_table_level2_bad_usage_func();
+ DELETE FROM transition_table_level2
+   WHERE level2_no BETWEEN 301 AND 305;
+ DROP TRIGGER transition_table_level2_bad_usage_trigger
+   ON transition_table_level2;
+ -- attempt modifications which would break RI (should all fail)
+ DELETE FROM transition_table_level1
+   WHERE level1_no = 25;
+ UPDATE transition_table_level1 SET level1_no = -1
+   WHERE level1_no = 30;
+ INSERT INTO transition_table_level2 (level2_no, parent_no)
+   VALUES (10000, 10000);
+ UPDATE transition_table_level2 SET parent_no = 2000
+   WHERE level2_no = 40;
+ -- attempt modifications which would not break RI (should all succeed)
+ DELETE FROM transition_table_level1
+   WHERE level1_no BETWEEN 201 AND 1000;
+ DELETE FROM transition_table_level1
+   WHERE level1_no BETWEEN 100000000 AND 100000010;
+ SELECT count(*) FROM transition_table_level1;
+ DELETE FROM transition_table_level2
+   WHERE level2_no BETWEEN 211 AND 220;
+ SELECT count(*) FROM transition_table_level2;
+ CREATE TABLE alter_table_under_transition_tables
+ (
+   id int PRIMARY KEY,
+   name text
+ );
+ CREATE FUNCTION alter_table_under_transition_tables_upd_func()
+   RETURNS TRIGGER
+   LANGUAGE plpgsql
+ AS $$
+ BEGIN
+   RAISE WARNING 'old table = %, new table = %',
+                   (SELECT string_agg(id || '=' || name, ',') FROM d),
+                   (SELECT string_agg(id || '=' || name, ',') FROM i);
+   RAISE NOTICE 'one = %', (SELECT 1 FROM alter_table_under_transition_tables LIMIT 1);
+   RETURN NULL;
+ END;
+ $$;
+ -- should fail, TRUNCATE is not compatible with transition tables
+ CREATE TRIGGER alter_table_under_transition_tables_upd_trigger
+   AFTER TRUNCATE OR UPDATE ON alter_table_under_transition_tables
+   REFERENCING OLD TABLE AS d NEW TABLE AS i
+   FOR EACH STATEMENT EXECUTE PROCEDURE
+     alter_table_under_transition_tables_upd_func();
+ -- should work
+ CREATE TRIGGER alter_table_under_transition_tables_upd_trigger
+   AFTER UPDATE ON alter_table_under_transition_tables
+   REFERENCING OLD TABLE AS d NEW TABLE AS i
+   FOR EACH STATEMENT EXECUTE PROCEDURE
+     alter_table_under_transition_tables_upd_func();
+ INSERT INTO alter_table_under_transition_tables
+   VALUES (1, '1'), (2, '2'), (3, '3');
+ UPDATE alter_table_under_transition_tables
+   SET name = name || name;
+ -- now change 'name' to an integer to see what happens...
+ ALTER TABLE alter_table_under_transition_tables
+   ALTER COLUMN name TYPE int USING name::integer;
+ UPDATE alter_table_under_transition_tables
+   SET name = (name::text || name::text)::integer;
+ -- now drop column 'name'
+ ALTER TABLE alter_table_under_transition_tables
+   DROP column name;
+ UPDATE alter_table_under_transition_tables
+   SET id = id;
Simple merge
index fe8357d9d95e1683060f8429cab1189af6494893,fe83709e1b6e0351c09ddd020f386145985040de..e2222bdc6654d937a1c7e265e1bda631439c4b1c
@@@ -124,9 -124,70 +124,70 @@@ SET SESSION AUTHORIZATION regress_user4
  COPY atest2 FROM stdin; -- ok
  bar   true
  \.
 -SELECT * FROM atest1; -- ok
 +SELECT * FROM atest1 ORDER BY 1; -- ok
  
  
+ -- test leaky-function protections in selfuncs
+ -- regress_user1 will own a table and provide a view for it.
+ SET SESSION AUTHORIZATION regress_user1;
+ CREATE TABLE atest12 as
+   SELECT x AS a, 10001 - x AS b FROM generate_series(1,10000) x;
+ CREATE INDEX ON atest12 (a);
+ CREATE INDEX ON atest12 (abs(a));
+ VACUUM ANALYZE atest12;
+ CREATE FUNCTION leak(integer,integer) RETURNS boolean
+   AS $$begin return $1 < $2; end$$
+   LANGUAGE plpgsql immutable;
+ CREATE OPERATOR <<< (procedure = leak, leftarg = integer, rightarg = integer,
+                      restrict = scalarltsel);
+ -- view with leaky operator
+ CREATE VIEW atest12v AS
+   SELECT * FROM atest12 WHERE b <<< 5;
+ GRANT SELECT ON atest12v TO PUBLIC;
+ -- This plan should use nestloop, knowing that few rows will be selected.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b;
+ -- And this one.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12 x, atest12 y
+   WHERE x.a = y.b and abs(y.a) <<< 5;
+ -- Check if regress_user2 can break security.
+ SET SESSION AUTHORIZATION regress_user2;
+ CREATE FUNCTION leak2(integer,integer) RETURNS boolean
+   AS $$begin raise notice 'leak % %', $1, $2; return $1 > $2; end$$
+   LANGUAGE plpgsql immutable;
+ CREATE OPERATOR >>> (procedure = leak2, leftarg = integer, rightarg = integer,
+                      restrict = scalargtsel);
+ -- This should not show any "leak" notices before failing.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12 WHERE a >>> 0;
+ -- This plan should use hashjoin, as it will expect many rows to be selected.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b;
+ -- Now regress_user1 grants sufficient access to regress_user2.
+ SET SESSION AUTHORIZATION regress_user1;
+ GRANT SELECT (a, b) ON atest12 TO PUBLIC;
+ SET SESSION AUTHORIZATION regress_user2;
+ -- Now regress_user2 will also get a good row estimate.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b;
+ -- But not for this, due to lack of table-wide permissions needed
+ -- to make use of the expression index's statistics.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12 x, atest12 y
+   WHERE x.a = y.b and abs(y.a) <<< 5;
+ -- clean up (regress_user1's objects are all dropped later)
+ DROP FUNCTION leak2(integer, integer) CASCADE;
  -- groups
  
  SET SESSION AUTHORIZATION regress_user3;
Simple merge
Simple merge
Simple merge
index 449f28064be7d18f2fa37ff739b9a941972e6d1c,aada114ab2b821839feeb404e94336c5db66c1b1..742134e97170f0b9471bfbbd78ab45826baad81c
@@@ -523,10 -522,10 +523,10 @@@ CREATE TABLE shoe_data 
        shoename   char(10),      -- primary key
        sh_avail   integer,       -- available # of pairs
        slcolor    char(10),      -- preferred shoelace color
-       slminlen   float,         -- miminum shoelace length
+       slminlen   float,         -- minimum shoelace length
        slmaxlen   float,         -- maximum shoelace length
        slunit     char(8)        -- length unit
 -);
 +) distribute by roundrobin;
  
  CREATE TABLE shoelace_data (
        sl_name    char(10),      -- primary key
Simple merge
index a27b0faade0aeb23771795ba07c8af7a39aaf701,e742f136990b9b79e8cec83947ea08afdd8c5f9d..1b175469125a3c54a992e5638450f6f683c096a9
@@@ -3,9 -3,9 +3,9 @@@
  -- test the views defined in CREATE_VIEWS
  --
  
 -SELECT * FROM street;
 +SELECT * FROM street ORDER BY name,cname,thepath::text;
  
- SELECT name, #thepath FROM iexit ORDER BY 1, 2;
+ SELECT name, #thepath FROM iexit ORDER BY name COLLATE "C", 2;
  
  SELECT * FROM toyemp WHERE name = 'sharon';
  
index b55bf5857e6003403c6e327f5ad30f3fb0b3a902,d53e33d779479d0f0a6a1e1d85915cd81ab83e6b..769760de3c276313320c74881d0580b57a967d86
@@@ -1,15 -51,15 +51,15 @@@ ALTER SEQUENCE sequence_test14 AS int
  ---
  --- test creation of SERIAL column
  ---
 -
 +SET sequence_range = 1;
- CREATE TABLE serialTest (f1 text, f2 serial);
+ CREATE TABLE serialTest1 (f1 text, f2 serial);
  
- INSERT INTO serialTest VALUES ('foo');
- INSERT INTO serialTest VALUES ('bar');
- INSERT INTO serialTest VALUES ('force', 100);
- INSERT INTO serialTest VALUES ('wrong', NULL);
+ INSERT INTO serialTest1 VALUES ('foo');
+ INSERT INTO serialTest1 VALUES ('bar');
+ INSERT INTO serialTest1 VALUES ('force', 100);
+ INSERT INTO serialTest1 VALUES ('wrong', NULL);
  
- SELECT * FROM serialTest ORDER BY f1, f2;
 -SELECT * FROM serialTest1;
++SELECT * FROM serialTest1 ORDER BY f1, f2;
  
  -- test smallserial / bigserial
  CREATE TABLE serialTest2 (f1 text, f2 serial, f3 smallserial, f4 serial2,
@@@ -88,9 -140,9 +140,9 @@@ SELECT last_value, log_cnt IN (31, 32) 
  DROP SEQUENCE foo_seq_new;
  
  -- renaming serial sequences
- ALTER TABLE serialtest_f2_seq RENAME TO serialtest_f2_foo;
- INSERT INTO serialTest VALUES ('more');
- SELECT * FROM serialTest ORDER BY f1, f2;
+ ALTER TABLE serialtest1_f2_seq RENAME TO serialtest1_f2_foo;
+ INSERT INTO serialTest1 VALUES ('more');
 -SELECT * FROM serialTest1;
++SELECT * FROM serialTest1 ORDER BY f1, f2;
  
  --
  -- Check dependencies of serial and ordinary sequences
@@@ -264,10 -393,10 +394,18 @@@ SELECT * FROM information_schema.sequen
  DROP USER regress_seq_user;
  DROP SEQUENCE seq;
  
 +create table test_seqtab (unique1 int, unique2 int);
 +insert into test_seqtab select i, i from generate_series(1,1000) s(i);
 +
 +create temp sequence testseq;
 +select distinct(nextval('testseq'))
 +  from test_seqtab order by 1 limit 10;
 +drop table test_seqtab;
++
+ -- cache tests
+ CREATE SEQUENCE test_seq1 CACHE 10;
+ SELECT nextval('test_seq1');
+ SELECT nextval('test_seq1');
+ SELECT nextval('test_seq1');
+ DROP SEQUENCE test_seq1;
index ce4019df9f4c96df0aa5f550dee50c21318193d6,2fc0e26ca066a97c67aea259231b8f5bf7d7cb19..2aeee3a54423acf9d64b0995ccdd16bc2ca02ae3
@@@ -495,48 -497,46 +510,91 @@@ select * fro
  
  select nextval('ts1');
  
 +SELECT setseed(0);
 +
 +-- DROP TABLE IF EXISTS asd ;
 +
 +CREATE TABLE IF NOT EXISTS asd  AS
 +SELECT clientid::numeric(20),
 + (clientid / 20 )::integer::numeric(20) as userid,
 + cts + ((random()* 3600 *24 )||'sec')::interval as cts,
 + (ARRAY['A','B','C','D','E','F'])[(random()*5+1)::integer] as state,
 + 0 as dim,
 + ((ARRAY['Cat','Dog','Duck'])[(clientid / 10  )% 3  +1 ]) ::text as app_name,
 + ((ARRAY['A','B'])[(clientid / 10  )% 2  +1 ]) ::text as platform
 + FROM generate_series('2016-01-01'::timestamp,'2016-10-01'::timestamp,interval '15 day') cts , generate_series( 1000,2000,10) clientid , generate_series(1,6) t
 +;
 +
 +SELECT dates::timestamp as dates ,B.platform,B.app_name, B.clientid, B.userid,
 +      B.state as state
 +FROM ( VALUES
 +('2016.08.30. 08:52:43') ,('2016.08.29. 04:57:12') ,('2016.08.26. 08:15:05') ,
 +('2016.08.24. 11:49:51') ,('2016.08.22. 08:45:29') ,('2016.08.21. 04:53:47') ,('2016.08.20. 08:44:03')
 +) AS D (dates)
 +JOIN
 +( SELECT DISTINCT clientid FROM asd
 +      WHERE userid=74 ) C ON True
 +INNER JOIN LATERAL (
 +      SELECT DISTINCT ON (clientid,app_name,platform,state,dim) x.*
 +      FROM asd x
 +      INNER JOIN (SELECT p.clientid,p.app_name,p.platform , p.state, p.dim ,
 +           MAX(p.cts) AS selected_cts
 +              FROM asd p
 +              where cts<D.dates::timestamp and state in
 +              ('A','B')
 +      GROUP BY p.clientid,p.app_name,p.platform,p.state,p.dim) y
 +      ON y.clientid = x.clientid
 +      AND y.selected_cts = x.cts
 +      AND y.platform = x.platform
 +      AND y.app_name=x.app_name
 +      AND y.state=x.state
 +      AND y.dim = x.dim
 +      and x.clientid = C.clientid
 +) B ON True
 +ORDER BY dates desc, state;
 +
 +DROP TABLE asd;
 +SELECT setseed(0);
+ --
+ -- Check that volatile quals aren't pushed down past a set-returning function;
+ -- while a nonvolatile qual can be, if it doesn't reference the SRF.
+ --
+ create function tattle(x int, y int) returns bool
+ volatile language plpgsql as $$
+ begin
+   raise notice 'x = %, y = %', x, y;
+   return x > y;
+ end$$;
+ explain (verbose, costs off)
+ select * from
+   (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+   where tattle(x, 8);
+ select * from
+   (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+   where tattle(x, 8);
+ -- if we pretend it's stable, we get different results:
+ alter function tattle(x int, y int) stable;
+ explain (verbose, costs off)
+ select * from
+   (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+   where tattle(x, 8);
+ select * from
+   (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+   where tattle(x, 8);
+ -- although even a stable qual should not be pushed down if it references SRF
+ explain (verbose, costs off)
+ select * from
+   (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+   where tattle(x, u);
+ select * from
+   (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+   where tattle(x, u);
+ drop function tattle(x int, y int);
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index 1f353632be50ae2e8911d5e5a89ce43c6f14f484,663711997b00952e1e9f0f3e7e54ac1d77f100a1..15bee1f50720efeb2f662896e6a65a4f41905123
@@@ -38,8 -38,12 +38,12 @@@ SELECT * FROM update_test  ORDER BY a, 
  UPDATE update_test SET a=v.i FROM (VALUES(100, 20)) AS v(i, j)
    WHERE update_test.b = v.j;
  
 -SELECT * FROM update_test;
 +SELECT * FROM update_test  ORDER BY a, b, c;
  
+ -- fail, wrong data type:
+ UPDATE update_test SET a = v.* FROM (VALUES(100, 20)) AS v(i, j)
+   WHERE update_test.b = v.j;
  --
  -- Test multiple-set-clause syntax
  --
Simple merge
index 11988acab52a86037a9f272fa868dd15ee840113,8ae5184d0f17cc4c2e805aad0280ae90134fe41f..e7db4f0e7bc2495a04b8a220856b4803a7fa45a5
@@@ -76,8 -76,17 +76,17 @@@ WITH RECURSIVE t(n) AS 
  UNION ALL
      SELECT n || ' bar' FROM t WHERE length(n) < 20
  )
 -SELECT n, n IS OF (text) AS is_text FROM t;
 +SELECT n, n IS OF (text) as is_text FROM t ORDER BY n;
  
+ -- In a perfect world, this would work and resolve the literal as int ...
+ -- but for now, we have to be content with resolving to text too soon.
+ WITH RECURSIVE t(n) AS (
+     SELECT '7'
+ UNION ALL
+     SELECT n+1 FROM t WHERE n < 10
+ )
+ SELECT n, n IS OF (int) AS is_int FROM t;
  --
  -- Some examples with a tree
  --
Simple merge