#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
- # Generated by GNU Autoconf 2.69 for PostgreSQL 9.5alpha1 (Postgres-XL 9.5alpha1).
-# Generated by GNU Autoconf 2.69 for PostgreSQL 9.5beta1.
++# Generated by GNU Autoconf 2.69 for PostgreSQL 9.5beta1 (Postgres-XL 9.5beta1).
#
-# Report bugs to <pgsql-bugs@postgresql.org>.
+# Report bugs to <bugs@postgres-xl.org>.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
# Identity of this package.
PACKAGE_NAME='PostgreSQL'
PACKAGE_TARNAME='postgresql'
- PACKAGE_VERSION='9.5alpha1 (Postgres-XL 9.5alpha1)'
- PACKAGE_XC_VERSION='9.5alpha1'
- PACKAGE_STRING='PostgreSQL 9.5alpha1 (Postgres-XL 9.5alpha1)'
-PACKAGE_VERSION='9.5beta1'
-PACKAGE_STRING='PostgreSQL 9.5beta1'
++PACKAGE_VERSION='9.5beta1 (Postgres-XL 9.5beta1)'
++PACKAGE_XC_VERSION='9.5beta1'
++PACKAGE_STRING='PostgreSQL 9.5beta1 (Postgres-XL 9.5beta1)'
PACKAGE_URL=''
ac_unique_file="src/backend/access/common/heaptuple.c"
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
- \`configure' configures PostgreSQL 9.5alpha1 (Postgres-XL 9.5alpha1) to adapt to many kinds of systems.
-\`configure' configures PostgreSQL 9.5beta1 to adapt to many kinds of systems.
++\`configure' configures PostgreSQL 9.5beta1 (Postgres-XL 9.5beta1) to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of PostgreSQL 9.5alpha1 (Postgres-XL 9.5alpha1):";;
- short | recursive ) echo "Configuration of PostgreSQL 9.5beta1:";;
++ short | recursive ) echo "Configuration of PostgreSQL 9.5beta1 (Postgres-XL 9.5beta1):";;
esac
cat <<\_ACEOF
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
- PostgreSQL configure 9.5alpha1 (Postgres-XL 9.5alpha1)
-PostgreSQL configure 9.5beta1
++PostgreSQL configure 9.5beta1 (Postgres-XL 9.5beta1)
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
- It was created by PostgreSQL $as_me 9.5alpha1 (Postgres-XL 9.5alpha1), which was
-It was created by PostgreSQL $as_me 9.5beta1, which was
++It was created by PostgreSQL $as_me 9.5beta1 (Postgres-XL 9.5beta1), which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
_ACEOF
+# For PGXC, set -DPGXC by default. This can be overridden with -UPGXC if the user sets it.
+# For Postgres-XL, set both -DPGXC and -DXCP
+CFLAGS="-DPGXC -DXCP $CFLAGS"
+
# Begin output steps
{ $as_echo "$as_me:${as_lineno-$LINENO}: using compiler=$cc_string" >&5
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
- This file was extended by PostgreSQL $as_me 9.5alpha1 (Postgres-XL 9.5alpha1), which was
-This file was extended by PostgreSQL $as_me 9.5beta1, which was
++This file was extended by PostgreSQL $as_me 9.5beta1 (Postgres-XL 9.5beta1), which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
- PostgreSQL config.status 9.5alpha1 (Postgres-XL 9.5alpha1)
-PostgreSQL config.status 9.5beta1
++PostgreSQL config.status 9.5beta1 (Postgres-XL 9.5beta1)
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
dnl
m4_pattern_forbid(^PGAC_)dnl to catch undefined macros
m4_if(m4_defn([m4_PACKAGE_VERSION]), [2.69], [], [m4_fatal([Autoconf version 2.69 is required.
Untested combinations of 'autoconf' and PostgreSQL versions are not
Operating System (example: Linux 2.4.18) :
- PostgreSQL version (example: PostgreSQL 9.5alpha1): Postgres-XL 9.5alpha1
- PostgreSQL version (example: PostgreSQL 9.5beta1): PostgreSQL 9.5beta1
++ PostgreSQL version (example: PostgreSQL 9.5beta1): Postgres-XL 9.5beta1
Compiler used (example: gcc 3.3.5) :
pg_ts_config.h pg_ts_config_map.h pg_ts_dict.h \
pg_ts_parser.h pg_ts_template.h pg_extension.h \
pg_foreign_data_wrapper.h pg_foreign_server.h pg_user_mapping.h \
+ pgxc_class.h pgxc_node.h pgxc_group.h \
pg_foreign_table.h pg_policy.h pg_replication_origin.h \
- pg_tablesample_method.h pg_default_acl.h pg_seclabel.h pg_shseclabel.h \
- pg_collation.h pg_range.h pg_transform.h toasting.h indexing.h \
+ pg_default_acl.h pg_seclabel.h pg_shseclabel.h \
+ pg_collation.h pg_range.h pg_transform.h \
+ toasting.h indexing.h \
)
# location of Catalog.pm
UserMappingRelationId, /* OCLASS_USER_MAPPING */
DefaultAclRelationId, /* OCLASS_DEFACL */
ExtensionRelationId, /* OCLASS_EXTENSION */
+#ifdef PGXC
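+ /* pgxc_class stores the distribution information of each distributed table */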
+ PgxcClassRelationId, /* OCLASS_PGXCCLASS */
+#endif
EventTriggerRelationId, /* OCLASS_EVENT_TRIGGER */
- PolicyRelationId /* OCLASS_POLICY */
+ PolicyRelationId, /* OCLASS_POLICY */
+ TransformRelationId /* OCLASS_TRANSFORM */
};
case OCLASS_USER_MAPPING:
case OCLASS_DEFACL:
case OCLASS_EXTENSION:
- case OCLASS_PGXC_NODE:
- case OCLASS_PGXC_GROUP:
+#ifdef PGXC
+ case OCLASS_PGXC_CLASS:
+#endif
case OCLASS_POLICY:
return true;
-
- case MAX_OCLASS:
-
- /*
- * This shouldn't ever happen, but we keep the case to avoid a
- * compiler warning without a "default" clause in the switch.
- */
- Assert(false);
- break;
}
return true;
if (((Scan *) plan)->scanrelid > 0)
ExplainScanTarget((Scan *) plan, es);
break;
- case T_SampleScan:
- ExplainScanTarget((Scan *) plan, es);
- break;
+#ifdef PGXC
+ case T_RemoteQuery:
+ /* Emit node execution list */
+ ExplainExecNodes(((RemoteQuery *)plan)->exec_nodes, es);
+ ExplainScanTarget((Scan *) plan, es);
+ break;
+#endif
+#ifdef XCP
+ case T_RemoteSubplan:
+ {
+ RemoteSubplan *rsubplan = (RemoteSubplan *) plan;
+ List *nodeNameList = NIL;
+ ListCell *lc;
+
+ foreach(lc, rsubplan->nodeList)
+ {
+ char *nodename = get_pgxc_nodename(
+ PGXCNodeGetNodeOid(lfirst_int(lc),
+ PGXC_NODE_DATANODE));
+ nodeNameList = lappend(nodeNameList, nodename);
+ }
+
+ /* print out destination nodes */
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ {
+ if (nodeNameList)
+ {
+ if (es->nodes)
+ {
+ bool first = true;
+ ListCell *lc;
+ foreach(lc, nodeNameList)
+ {
+ char *nodename = (char *) lfirst(lc);
+ if (first)
+ {
+ appendStringInfo(es->str, " on %s (%s",
+ rsubplan->execOnAll ? "all" : "any",
+ nodename);
+ first = false;
+ }
+ else
+ appendStringInfo(es->str, ",%s", nodename);
+ }
+ appendStringInfoChar(es->str, ')');
+ }
+ else
+ {
+ appendStringInfo(es->str, " on %s",
+ rsubplan->execOnAll ? "all" : "any");
+ }
+ }
+ else
+ {
+ appendStringInfo(es->str, " on local node");
+ }
+ }
+ else
+ {
+ ExplainPropertyText("Replicated",
+ rsubplan->execOnAll ? "no" : "yes",
+ es);
+ ExplainPropertyList("Node List", nodeNameList, es);
+ }
+ }
+ break;
+#endif /* XCP */
case T_IndexScan:
{
IndexScan *indexscan = (IndexScan *) plan;
static void ATExecDropOf(Relation rel, LOCKMODE lockmode);
static void ATExecReplicaIdentity(Relation rel, ReplicaIdentityStmt *stmt, LOCKMODE lockmode);
static void ATExecGenericOptions(Relation rel, List *options);
+#ifdef PGXC
+static void AtExecDistributeBy(Relation rel, DistributeBy *options);
+static void AtExecSubCluster(Relation rel, PGXCSubCluster *options);
+static void AtExecAddNode(Relation rel, List *options);
+static void AtExecDeleteNode(Relation rel, List *options);
+static void ATCheckCmd(Relation rel, AlterTableCmd *cmd);
+static RedistribState *BuildRedistribCommands(Oid relid, List *subCmds);
+static Oid *delete_node_list(Oid *old_oids, int old_num, Oid *del_oids, int del_num, int *new_num);
+static Oid *add_node_list(Oid *old_oids, int old_num, Oid *add_oids, int add_num, int *new_num);
+#endif
static void ATExecEnableRowSecurity(Relation rel);
static void ATExecDisableRowSecurity(Relation rel);
+ static void ATExecForceNoForceRowSecurity(Relation rel, bool force_rls);
static void copy_relation_data(SMgrRelation rel, SMgrRelation dst,
ForkNumber forkNum, char relpersistence);
switch (rte->rtekind)
{
case RTE_RELATION:
- if (rte->tablesample)
- {
- CHECKFLATCOPY(newrte->tablesample, rte->tablesample,
- TableSampleClause);
- MUTATE(newrte->tablesample->args,
- newrte->tablesample->args,
- List *);
- MUTATE(newrte->tablesample->repeatable,
- newrte->tablesample->repeatable,
- Node *);
- }
+ MUTATE(newrte->tablesample, rte->tablesample,
+ TableSampleClause *);
+ /* we don't bother to copy eref, aliases, etc; OK? */
break;
case RTE_CTE:
- #endif /* PGXC */
- /* we don't bother to copy eref, aliases, etc; OK? */
+ /* nothing to do */
+ break;
+#ifdef PGXC
+ case RTE_REMOTE_DUMMY:
++ /* nothing to do */
+ break;
++#endif /* PGXC */
case RTE_SUBQUERY:
if (!(flags & QTW_IGNORE_RT_SUBQUERIES))
{
WRITE_NODE_FIELD(rowMarks);
WRITE_INT_FIELD(epqParam);
WRITE_ENUM_FIELD(onConflictAction, OnConflictAction);
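+ /*
+ * Plans can be serialized on one node and read on another whose catalog
+ * OIDs differ, so with portable_output the arbiter indexes are written
+ * by name rather than as raw OIDs.
+ */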
+#ifdef XCP
+ if (portable_output)
+ WRITE_RELID_LIST_FIELD(arbiterIndexes);
+ else
+ {
+#endif
WRITE_NODE_FIELD(arbiterIndexes);
+#ifdef XCP
+ }
+#endif
WRITE_NODE_FIELD(onConflictSet);
WRITE_NODE_FIELD(onConflictWhere);
- WRITE_INT_FIELD(exclRelRTI);
+ WRITE_UINT_FIELD(exclRelRTI);
WRITE_NODE_FIELD(exclRelTlist);
}
WRITE_BITMAPSET_FIELD(funcparams);
}
+ static void
+ _outTableSampleClause(StringInfo str, const TableSampleClause *node)
+ {
+ WRITE_NODE_TYPE("TABLESAMPLECLAUSE");
+
++#ifdef XCP
++ if (portable_output)
++ {
++ WRITE_FUNCID_FIELD(tsmhandler);
++ }
++ else
++ {
++#endif
+ WRITE_OID_FIELD(tsmhandler);
++#ifdef XCP
++ }
++#endif
+ WRITE_NODE_FIELD(args);
+ WRITE_NODE_FIELD(repeatable);
+ }
+
static void
_outAExpr(StringInfo str, const A_Expr *node)
{
case T_SeqScan:
_outSeqScan(str, obj);
break;
+#ifdef PGXC
+ case T_RemoteQuery:
+ _outRemoteQuery(str, obj);
+ break;
+#endif
+ case T_SampleScan:
+ _outSampleScan(str, obj);
+ break;
case T_IndexScan:
_outIndexScan(str, obj);
break;
READ_DONE();
}
+ /*
+ * _readTableSampleClause
+ */
+ static TableSampleClause *
+ _readTableSampleClause(void)
+ {
+ READ_LOCALS(TableSampleClause);
+
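++ /*
++ * With portable_input the tablesample handler function is identified
++ * by name and resolved to the local OID, since OIDs are not stable
++ * across nodes.
++ */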
++#ifdef XCP
++ if (portable_input)
++ {
++ READ_FUNCID_FIELD(tsmhandler);
++ }
++ else
++ {
++#endif
+ READ_OID_FIELD(tsmhandler);
++#ifdef XCP
++ }
++#endif
+ READ_NODE_FIELD(args);
+ READ_NODE_FIELD(repeatable);
+
+ READ_DONE();
+ }
+
+#ifdef XCP
/*
- * parseNodeString
- *
- * Given a character string representing a node tree, parseNodeString creates
- * the internal node structure.
- *
- * The string to be read must already have been loaded into pg_strtok().
+ * _readPlan
*/
-Node *
-parseNodeString(void)
+static Plan *
+_readPlan(void)
{
- void *return_value;
+ READ_PLAN_FIELDS(Plan);
- READ_TEMP_LOCALS();
+ READ_DONE();
+}
- token = pg_strtok(&length);
-
-#define MATCH(tokname, namelen) \
- (length == namelen && memcmp(token, tokname, namelen) == 0)
+/*
+ * _readResult
+ */
+static Result *
+_readResult(void)
+{
+ READ_PLAN_FIELDS(Result);
- if (MATCH("QUERY", 5))
- return_value = _readQuery();
- else if (MATCH("WITHCHECKOPTION", 15))
- return_value = _readWithCheckOption();
- else if (MATCH("SORTGROUPCLAUSE", 15))
- return_value = _readSortGroupClause();
- else if (MATCH("GROUPINGSET", 11))
- return_value = _readGroupingSet();
- else if (MATCH("WINDOWCLAUSE", 12))
- return_value = _readWindowClause();
- else if (MATCH("ROWMARKCLAUSE", 13))
- return_value = _readRowMarkClause();
- else if (MATCH("COMMONTABLEEXPR", 15))
- return_value = _readCommonTableExpr();
- else if (MATCH("SETOPERATIONSTMT", 16))
- return_value = _readSetOperationStmt();
- else if (MATCH("ALIAS", 5))
- return_value = _readAlias();
- else if (MATCH("RANGEVAR", 8))
- return_value = _readRangeVar();
- else if (MATCH("INTOCLAUSE", 10))
- return_value = _readIntoClause();
- else if (MATCH("VAR", 3))
- return_value = _readVar();
- else if (MATCH("CONST", 5))
- return_value = _readConst();
- else if (MATCH("PARAM", 5))
- return_value = _readParam();
- else if (MATCH("AGGREF", 6))
- return_value = _readAggref();
- else if (MATCH("GROUPINGFUNC", 12))
- return_value = _readGroupingFunc();
- else if (MATCH("WINDOWFUNC", 10))
- return_value = _readWindowFunc();
- else if (MATCH("ARRAYREF", 8))
- return_value = _readArrayRef();
- else if (MATCH("FUNCEXPR", 8))
- return_value = _readFuncExpr();
- else if (MATCH("NAMEDARGEXPR", 12))
- return_value = _readNamedArgExpr();
- else if (MATCH("OPEXPR", 6))
- return_value = _readOpExpr();
- else if (MATCH("DISTINCTEXPR", 12))
- return_value = _readDistinctExpr();
- else if (MATCH("NULLIFEXPR", 10))
- return_value = _readNullIfExpr();
- else if (MATCH("SCALARARRAYOPEXPR", 17))
- return_value = _readScalarArrayOpExpr();
- else if (MATCH("BOOLEXPR", 8))
- return_value = _readBoolExpr();
- else if (MATCH("SUBLINK", 7))
- return_value = _readSubLink();
- else if (MATCH("FIELDSELECT", 11))
- return_value = _readFieldSelect();
- else if (MATCH("FIELDSTORE", 10))
- return_value = _readFieldStore();
- else if (MATCH("RELABELTYPE", 11))
- return_value = _readRelabelType();
- else if (MATCH("COERCEVIAIO", 11))
- return_value = _readCoerceViaIO();
- else if (MATCH("ARRAYCOERCEEXPR", 15))
- return_value = _readArrayCoerceExpr();
- else if (MATCH("CONVERTROWTYPEEXPR", 18))
- return_value = _readConvertRowtypeExpr();
- else if (MATCH("COLLATE", 7))
- return_value = _readCollateExpr();
- else if (MATCH("CASE", 4))
- return_value = _readCaseExpr();
- else if (MATCH("WHEN", 4))
- return_value = _readCaseWhen();
- else if (MATCH("CASETESTEXPR", 12))
- return_value = _readCaseTestExpr();
- else if (MATCH("ARRAY", 5))
- return_value = _readArrayExpr();
- else if (MATCH("ROW", 3))
- return_value = _readRowExpr();
- else if (MATCH("ROWCOMPARE", 10))
- return_value = _readRowCompareExpr();
- else if (MATCH("COALESCE", 8))
- return_value = _readCoalesceExpr();
- else if (MATCH("MINMAX", 6))
- return_value = _readMinMaxExpr();
- else if (MATCH("XMLEXPR", 7))
- return_value = _readXmlExpr();
- else if (MATCH("NULLTEST", 8))
- return_value = _readNullTest();
- else if (MATCH("BOOLEANTEST", 11))
- return_value = _readBooleanTest();
- else if (MATCH("COERCETODOMAIN", 14))
- return_value = _readCoerceToDomain();
- else if (MATCH("COERCETODOMAINVALUE", 19))
- return_value = _readCoerceToDomainValue();
- else if (MATCH("SETTODEFAULT", 12))
- return_value = _readSetToDefault();
- else if (MATCH("CURRENTOFEXPR", 13))
- return_value = _readCurrentOfExpr();
- else if (MATCH("INFERENCEELEM", 13))
- return_value = _readInferenceElem();
+ READ_NODE_FIELD(resconstantqual);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readModifyTable
+ */
+static ModifyTable *
+_readModifyTable(void)
+{
+ READ_PLAN_FIELDS(ModifyTable);
+
+ READ_ENUM_FIELD(operation, CmdType);
+ READ_BOOL_FIELD(canSetTag);
+ READ_UINT_FIELD(nominalRelation);
+ READ_NODE_FIELD(resultRelations);
+ READ_INT_FIELD(resultRelIndex);
+ READ_NODE_FIELD(plans);
+ READ_NODE_FIELD(withCheckOptionLists);
+ READ_NODE_FIELD(returningLists);
+ READ_NODE_FIELD(fdwPrivLists);
+ READ_NODE_FIELD(rowMarks);
+ READ_INT_FIELD(epqParam);
+ READ_ENUM_FIELD(onConflictAction, OnConflictAction);
+#ifdef XCP
+ if (portable_input)
+ READ_RELID_LIST_FIELD(arbiterIndexes);
+ else
+#endif
+ READ_NODE_FIELD(arbiterIndexes);
+ READ_NODE_FIELD(onConflictSet);
+ READ_NODE_FIELD(onConflictWhere);
+	READ_UINT_FIELD(exclRelRTI);
+ READ_NODE_FIELD(exclRelTlist);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readAppend
+ */
+static Append *
+_readAppend(void)
+{
+ READ_PLAN_FIELDS(Append);
+
+ READ_NODE_FIELD(appendplans);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readMergeAppend
+ */
+static MergeAppend *
+_readMergeAppend(void)
+{
+ int i;
+ READ_PLAN_FIELDS(MergeAppend);
+
+ READ_NODE_FIELD(mergeplans);
+ READ_INT_FIELD(numCols);
+
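+ /*
+ * The per-column arrays are emitted as flat token lists, so they are
+ * read back token by token with pg_strtok rather than through the
+ * READ_*_FIELD macros.
+ */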
+ token = pg_strtok(&length); /* skip :sortColIdx */
+ local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->sortColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :sortOperators */
+ local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
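+ /*
+ * In portable mode each sort operator arrives as namespace, operator
+ * name and left/right argument type names, resolved here to a local OID.
+ */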
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->sortOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->sortOperators[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :collations */
+ local_node->collations = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *collname; /* collation name */
+ int collencoding; /* collation encoding */
+ /* the token is already read */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collname */
+ collname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collation encoding */
+ collencoding = atoi(token);
+ if (collname)
+ local_node->collations[i] = get_collid(collname,
+ collencoding,
+ NSP_OID(nspname));
+ else
+ local_node->collations[i] = InvalidOid;
+ }
+ else
+ local_node->collations[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :nullsFirst */
+ local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->nullsFirst[i] = strtobool(token);
+ }
+
+ READ_DONE();
+}
+
+
+/*
+ * _readRecursiveUnion
+ */
+static RecursiveUnion *
+_readRecursiveUnion(void)
+{
+ int i;
+ READ_PLAN_FIELDS(RecursiveUnion);
+
+ READ_INT_FIELD(wtParam);
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :dupColIdx */
+ local_node->dupColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->dupColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :dupOperators */
+ local_node->dupOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->dupOperators[i] = atooid(token);
+ }
+
+ READ_LONG_FIELD(numGroups);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readBitmapAnd
+ */
+static BitmapAnd *
+_readBitmapAnd(void)
+{
+ READ_PLAN_FIELDS(BitmapAnd);
+
+ READ_NODE_FIELD(bitmapplans);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readBitmapOr
+ */
+static BitmapOr *
+_readBitmapOr(void)
+{
+ READ_PLAN_FIELDS(BitmapOr);
+
+ READ_NODE_FIELD(bitmapplans);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readScan
+ */
+static Scan *
+_readScan(void)
+{
+ READ_SCAN_FIELDS(Scan);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readSeqScan
+ */
+static SeqScan *
+_readSeqScan(void)
+{
+ READ_SCAN_FIELDS(SeqScan);
+
+ READ_DONE();
+}
+
+/*
+ * _readSampleScan
+ */
+static SampleScan *
+_readSampleScan(void)
+{
+ READ_SCAN_FIELDS(SampleScan);
++ READ_NODE_FIELD(tablesample);
+
+ READ_DONE();
+}
+
+/*
+ * _readIndexScan
+ */
+static IndexScan *
+_readIndexScan(void)
+{
+ READ_SCAN_FIELDS(IndexScan);
+
+ if (portable_input)
+ READ_RELID_FIELD(indexid);
+ else
+ READ_OID_FIELD(indexid);
+ READ_NODE_FIELD(indexqual);
+ READ_NODE_FIELD(indexqualorig);
+ READ_NODE_FIELD(indexorderby);
+ READ_NODE_FIELD(indexorderbyorig);
+ READ_NODE_FIELD(indexorderbyops);
+ READ_ENUM_FIELD(indexorderdir, ScanDirection);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readIndexOnlyScan
+ */
+static IndexOnlyScan *
+_readIndexOnlyScan(void)
+{
+ READ_SCAN_FIELDS(IndexOnlyScan);
+
+ if (portable_input)
+ READ_RELID_FIELD(indexid);
+ else
+ READ_OID_FIELD(indexid);
+ READ_NODE_FIELD(indexqual);
+ READ_NODE_FIELD(indexorderby);
+ READ_NODE_FIELD(indextlist);
+ READ_ENUM_FIELD(indexorderdir, ScanDirection);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readBitmapIndexScan
+ */
+static BitmapIndexScan *
+_readBitmapIndexScan(void)
+{
+ READ_SCAN_FIELDS(BitmapIndexScan);
+
+ if (portable_input)
+ READ_RELID_FIELD(indexid);
+ else
+ READ_OID_FIELD(indexid);
+ READ_NODE_FIELD(indexqual);
+ READ_NODE_FIELD(indexqualorig);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readBitmapHeapScan
+ */
+static BitmapHeapScan *
+_readBitmapHeapScan(void)
+{
+ READ_SCAN_FIELDS(BitmapHeapScan);
+
+ READ_NODE_FIELD(bitmapqualorig);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readTidScan
+ */
+static TidScan *
+_readTidScan(void)
+{
+ READ_SCAN_FIELDS(TidScan);
+
+ READ_NODE_FIELD(tidquals);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readSubqueryScan
+ */
+static SubqueryScan *
+_readSubqueryScan(void)
+{
+ READ_SCAN_FIELDS(SubqueryScan);
+
+ READ_NODE_FIELD(subplan);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readFunctionScan
+ */
+static FunctionScan *
+_readFunctionScan(void)
+{
+ READ_SCAN_FIELDS(FunctionScan);
+
+ READ_NODE_FIELD(functions);
+ READ_BOOL_FIELD(funcordinality);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readValuesScan
+ */
+static ValuesScan *
+_readValuesScan(void)
+{
+ READ_SCAN_FIELDS(ValuesScan);
+
+ READ_NODE_FIELD(values_lists);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readCteScan
+ */
+static CteScan *
+_readCteScan(void)
+{
+ READ_SCAN_FIELDS(CteScan);
+
+ READ_INT_FIELD(ctePlanId);
+ READ_INT_FIELD(cteParam);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readWorkTableScan
+ */
+static WorkTableScan *
+_readWorkTableScan(void)
+{
+ READ_SCAN_FIELDS(WorkTableScan);
+
+ READ_INT_FIELD(wtParam);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readJoin
+ */
+static Join *
+_readJoin(void)
+{
+ READ_JOIN_FIELDS(Join);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readNestLoop
+ */
+static NestLoop *
+_readNestLoop(void)
+{
+ READ_JOIN_FIELDS(NestLoop);
+
+ READ_NODE_FIELD(nestParams);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readMergeJoin
+ */
+static MergeJoin *
+_readMergeJoin(void)
+{
+ int numCols;
+ int i;
+ READ_JOIN_FIELDS(MergeJoin);
+
+ READ_NODE_FIELD(mergeclauses);
+ numCols = list_length(local_node->mergeclauses);
+
+ token = pg_strtok(&length); /* skip :mergeFamilies */
+ local_node->mergeFamilies = (Oid *) palloc(numCols * sizeof(Oid));
+ for (i = 0; i < numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->mergeFamilies[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :mergeCollations */
+ local_node->mergeCollations = (Oid *) palloc(numCols * sizeof(Oid));
+ for (i = 0; i < numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *collname; /* collation name */
+ int collencoding; /* collation encoding */
+ /* the token is already read */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collname */
+ collname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collation encoding */
+ collencoding = atoi(token);
+ if (collname)
+ local_node->mergeCollations[i] = get_collid(collname,
+ collencoding,
+ NSP_OID(nspname));
+ else
+ local_node->mergeCollations[i] = InvalidOid;
+ }
+ else
+ local_node->mergeCollations[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :mergeStrategies */
+ local_node->mergeStrategies = (int *) palloc(numCols * sizeof(int));
+ for (i = 0; i < numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->mergeStrategies[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :mergeNullsFirst */
+ local_node->mergeNullsFirst = (bool *) palloc(numCols * sizeof(bool));
+ for (i = 0; i < numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->mergeNullsFirst[i] = strtobool(token);
+ }
+
+ READ_DONE();
+}
+
+
+/*
+ * _readHashJoin
+ */
+static HashJoin *
+_readHashJoin(void)
+{
+ READ_JOIN_FIELDS(HashJoin);
+
+ READ_NODE_FIELD(hashclauses);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readMaterial
+ */
+static Material *
+_readMaterial(void)
+{
+ READ_PLAN_FIELDS(Material);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readSort
+ */
+static Sort *
+_readSort(void)
+{
+ int i;
+ READ_PLAN_FIELDS(Sort);
+
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :sortColIdx */
+ local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->sortColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :sortOperators */
+ local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->sortOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->sortOperators[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :collations */
+ local_node->collations = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *collname; /* collation name */
+ int collencoding; /* collation encoding */
+ /* the token is already read */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collname */
+ collname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collation encoding */
+ collencoding = atoi(token);
+ if (collname)
+ local_node->collations[i] = get_collid(collname,
+ collencoding,
+ NSP_OID(nspname));
+ else
+ local_node->collations[i] = InvalidOid;
+ }
+ else
+ local_node->collations[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :nullsFirst */
+ local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->nullsFirst[i] = strtobool(token);
+ }
+
+ READ_DONE();
+}
+
+
+/*
+ * _readGroup
+ */
+static Group *
+_readGroup(void)
+{
+ int i;
+ READ_PLAN_FIELDS(Group);
+
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :grpColIdx */
+ local_node->grpColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->grpColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :grpOperators */
+ local_node->grpOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->grpOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->grpOperators[i] = atooid(token);
+ }
+
+ READ_DONE();
+}
+
+
+/*
+ * _readAgg
+ */
+static Agg *
+_readAgg(void)
+{
+ int i;
+ READ_PLAN_FIELDS(Agg);
+
+ READ_ENUM_FIELD(aggstrategy, AggStrategy);
+ READ_ENUM_FIELD(aggdistribution, AggDistribution);
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :grpColIdx */
+ local_node->grpColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->grpColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :grpOperators */
+ local_node->grpOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->grpOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->grpOperators[i] = atooid(token);
+ }
+
+ READ_LONG_FIELD(numGroups);
+
+ READ_NODE_FIELD(groupingSets);
+ READ_NODE_FIELD(chain);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readWindowAgg
+ */
+static WindowAgg *
+_readWindowAgg(void)
+{
+ int i;
+ READ_PLAN_FIELDS(WindowAgg);
+
+ READ_INT_FIELD(winref);
+ READ_INT_FIELD(partNumCols);
+
+ token = pg_strtok(&length); /* skip :partColIdx */
+ local_node->partColIdx = (AttrNumber *) palloc(local_node->partNumCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->partNumCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->partColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :partOperators */
+ local_node->partOperators = (Oid *) palloc(local_node->partNumCols * sizeof(Oid));
+ for (i = 0; i < local_node->partNumCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->partOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->partOperators[i] = atooid(token);
+ }
+
+ READ_INT_FIELD(ordNumCols);
+
+ token = pg_strtok(&length); /* skip :ordColIdx */
+ local_node->ordColIdx = (AttrNumber *) palloc(local_node->ordNumCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->ordNumCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->ordColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :ordOperators */
+ local_node->ordOperators = (Oid *) palloc(local_node->ordNumCols * sizeof(Oid));
+ for (i = 0; i < local_node->ordNumCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->ordOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->ordOperators[i] = atooid(token);
+ }
+
+ READ_INT_FIELD(frameOptions);
+ READ_NODE_FIELD(startOffset);
+ READ_NODE_FIELD(endOffset);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readUnique
+ */
+static Unique *
+_readUnique(void)
+{
+ int i;
+ READ_PLAN_FIELDS(Unique);
+
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :uniqColIdx */
+ local_node->uniqColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->uniqColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :uniqOperators */
+ local_node->uniqOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->uniqOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->uniqOperators[i] = atooid(token);
+ }
+
+ READ_DONE();
+}
+
+
+/*
+ * _readHash
+ */
+static Hash *
+_readHash(void)
+{
+ READ_PLAN_FIELDS(Hash);
+
+ if (portable_input)
+ READ_RELID_FIELD(skewTable);
+ else
+ READ_OID_FIELD(skewTable);
+ READ_INT_FIELD(skewColumn);
+ READ_BOOL_FIELD(skewInherit);
+ if (portable_input)
+ READ_TYPID_FIELD(skewColType);
+ else
+ READ_OID_FIELD(skewColType);
+ READ_INT_FIELD(skewColTypmod);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readSetOp
+ */
+static SetOp *
+_readSetOp(void)
+{
+ int i;
+ READ_PLAN_FIELDS(SetOp);
+
+ READ_ENUM_FIELD(cmd, SetOpCmd);
+ READ_ENUM_FIELD(strategy, SetOpStrategy);
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :dupColIdx */
+ local_node->dupColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->dupColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :dupOperators */
+ local_node->dupOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->dupOperators[i] = atooid(token);
+ }
+
+ READ_INT_FIELD(flagColIdx);
+ READ_INT_FIELD(firstFlag);
+ READ_LONG_FIELD(numGroups);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readLimit
+ */
+static Limit *
+_readLimit(void)
+{
+ READ_PLAN_FIELDS(Limit);
+
+ READ_NODE_FIELD(limitOffset);
+ READ_NODE_FIELD(limitCount);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readRemoteSubplan
+ */
+static RemoteSubplan *
+_readRemoteSubplan(void)
+{
+ READ_SCAN_FIELDS(RemoteSubplan);
+
+ READ_CHAR_FIELD(distributionType);
+ READ_INT_FIELD(distributionKey);
+ READ_NODE_FIELD(distributionNodes);
+ READ_NODE_FIELD(distributionRestrict);
+ READ_NODE_FIELD(nodeList);
+ READ_BOOL_FIELD(execOnAll);
+ READ_NODE_FIELD(sort);
+ READ_STRING_FIELD(cursor);
+ READ_INT_FIELD(unique);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readRemoteStmt
+ */
+static RemoteStmt *
+_readRemoteStmt(void)
+{
+ int i;
+ READ_LOCALS(RemoteStmt);
+
+ READ_ENUM_FIELD(commandType, CmdType);
+ READ_BOOL_FIELD(hasReturning);
+ READ_NODE_FIELD(planTree);
+ READ_NODE_FIELD(rtable);
+ READ_NODE_FIELD(resultRelations);
+ READ_NODE_FIELD(subplans);
+ READ_INT_FIELD(nParamExec);
+ READ_INT_FIELD(nParamRemote);
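+ /*
+ * Each RemoteParam was written as :paramkind, :paramid and :paramtype
+ * tokens; in portable mode the type is sent by name rather than by OID.
+ */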
+ if (local_node->nParamRemote > 0)
+ {
+ local_node->remoteparams = (RemoteParam *) palloc(
+ local_node->nParamRemote * sizeof(RemoteParam));
+ for (i = 0; i < local_node->nParamRemote; i++)
+ {
+ RemoteParam *rparam = &(local_node->remoteparams[i]);
+ token = pg_strtok(&length); /* skip :paramkind */
+ token = pg_strtok(&length);
+ rparam->paramkind = (ParamKind) atoi(token);
+
+ token = pg_strtok(&length); /* skip :paramid */
+ token = pg_strtok(&length);
+ rparam->paramid = atoi(token);
+
+ token = pg_strtok(&length); /* skip :paramtype */
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *typname; /* data type name */
+ token = pg_strtok(&length); /* get nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get typname */
+ typname = nullable_string(token, length);
+ if (typname)
+ rparam->paramtype = get_typname_typid(typname,
+ NSP_OID(nspname));
+ else
+ rparam->paramtype = InvalidOid;
+ }
+ else
+ {
+ token = pg_strtok(&length);
+ rparam->paramtype = atooid(token);
+ }
+ }
+ }
+ else
+ local_node->remoteparams = NULL;
+
+ READ_NODE_FIELD(rowMarks);
+ READ_CHAR_FIELD(distributionType);
+ READ_INT_FIELD(distributionKey);
+ READ_NODE_FIELD(distributionNodes);
+ READ_NODE_FIELD(distributionRestrict);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readSimpleSort
+ */
+static SimpleSort *
+_readSimpleSort(void)
+{
+ int i;
+ READ_LOCALS(SimpleSort);
+
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :sortColIdx */
+ local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->sortColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :sortOperators */
+ local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->sortOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->sortOperators[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :sortCollations */
+ local_node->sortCollations = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *collname; /* collation name */
+ int collencoding; /* collation encoding */
+ /* the token is already read */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collname */
+ collname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collation encoding */
+ collencoding = atoi(token);
+ if (collname)
+ local_node->sortCollations[i] = get_collid(collname,
+ collencoding,
+ NSP_OID(nspname));
+ else
+ local_node->sortCollations[i] = InvalidOid;
+ }
+ else
+ local_node->sortCollations[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :nullsFirst */
+ local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->nullsFirst[i] = strtobool(token);
+ }
+
+ READ_DONE();
+}
+
+
+/*
+ * _readNestLoopParam
+ */
+static NestLoopParam *
+_readNestLoopParam(void)
+{
+ READ_LOCALS(NestLoopParam);
+
+ READ_INT_FIELD(paramno);
+ READ_NODE_FIELD(paramval);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readPlanRowMark
+ */
+static PlanRowMark *
+_readPlanRowMark(void)
+{
+ READ_LOCALS(PlanRowMark);
+
+ READ_UINT_FIELD(rti);
+ READ_UINT_FIELD(prti);
+ READ_UINT_FIELD(rowmarkId);
+ READ_ENUM_FIELD(markType, RowMarkType);
+ READ_INT_FIELD(allMarkTypes);
+ READ_ENUM_FIELD(strength, LockClauseStrength);
+ READ_ENUM_FIELD(waitPolicy, LockWaitPolicy);
+ READ_BOOL_FIELD(isParent);
+
+ READ_DONE();
+}
+
+/*
+ * _readLockRows
+ */
+static LockRows *
+_readLockRows(void)
+{
+ READ_PLAN_FIELDS(LockRows);
+
+ READ_NODE_FIELD(rowMarks);
+ READ_INT_FIELD(epqParam);
+
+ READ_DONE();
+}
+
+#endif /* XCP */
+
+
+/*
+ * parseNodeString
+ *
+ * Given a character string representing a node tree, parseNodeString creates
+ * the internal node structure.
+ *
+ * The string to be read must already have been loaded into pg_strtok().
+ */
+Node *
+parseNodeString(void)
+{
+ void *return_value;
+
+ READ_TEMP_LOCALS();
+
+ token = pg_strtok(&length);
+
+#define MATCH(tokname, namelen) \
+ (length == namelen && memcmp(token, tokname, namelen) == 0)
+
+ if (MATCH("QUERY", 5))
+ return_value = _readQuery();
+ else if (MATCH("WITHCHECKOPTION", 15))
+ return_value = _readWithCheckOption();
+ else if (MATCH("SORTGROUPCLAUSE", 15))
+ return_value = _readSortGroupClause();
+ else if (MATCH("GROUPINGSET", 11))
+ return_value = _readGroupingSet();
+ else if (MATCH("WINDOWCLAUSE", 12))
+ return_value = _readWindowClause();
+ else if (MATCH("ROWMARKCLAUSE", 13))
+ return_value = _readRowMarkClause();
+ else if (MATCH("COMMONTABLEEXPR", 15))
+ return_value = _readCommonTableExpr();
- else if (MATCH("RANGETABLESAMPLE", 16))
- return_value = _readRangeTableSample();
- else if (MATCH("TABLESAMPLECLAUSE", 17))
- return_value = _readTableSampleClause();
+ else if (MATCH("SETOPERATIONSTMT", 16))
+ return_value = _readSetOperationStmt();
+ else if (MATCH("ALIAS", 5))
+ return_value = _readAlias();
+ else if (MATCH("RANGEVAR", 8))
+ return_value = _readRangeVar();
+ else if (MATCH("INTOCLAUSE", 10))
+ return_value = _readIntoClause();
+ else if (MATCH("VAR", 3))
+ return_value = _readVar();
+ else if (MATCH("CONST", 5))
+ return_value = _readConst();
+ else if (MATCH("PARAM", 5))
+ return_value = _readParam();
+ else if (MATCH("AGGREF", 6))
+ return_value = _readAggref();
+ else if (MATCH("GROUPINGFUNC", 12))
+ return_value = _readGroupingFunc();
+ else if (MATCH("WINDOWFUNC", 10))
+ return_value = _readWindowFunc();
+ else if (MATCH("ARRAYREF", 8))
+ return_value = _readArrayRef();
+ else if (MATCH("FUNCEXPR", 8))
+ return_value = _readFuncExpr();
+ else if (MATCH("NAMEDARGEXPR", 12))
+ return_value = _readNamedArgExpr();
+ else if (MATCH("OPEXPR", 6))
+ return_value = _readOpExpr();
+ else if (MATCH("DISTINCTEXPR", 12))
+ return_value = _readDistinctExpr();
+ else if (MATCH("NULLIFEXPR", 10))
+ return_value = _readNullIfExpr();
+ else if (MATCH("SCALARARRAYOPEXPR", 17))
+ return_value = _readScalarArrayOpExpr();
+ else if (MATCH("BOOLEXPR", 8))
+ return_value = _readBoolExpr();
+ else if (MATCH("SUBLINK", 7))
+ return_value = _readSubLink();
+#ifdef XCP
+ else if (MATCH("SUBPLAN", 7))
+ return_value = _readSubPlan();
+#endif
+ else if (MATCH("FIELDSELECT", 11))
+ return_value = _readFieldSelect();
+ else if (MATCH("FIELDSTORE", 10))
+ return_value = _readFieldStore();
+ else if (MATCH("RELABELTYPE", 11))
+ return_value = _readRelabelType();
+ else if (MATCH("COERCEVIAIO", 11))
+ return_value = _readCoerceViaIO();
+ else if (MATCH("ARRAYCOERCEEXPR", 15))
+ return_value = _readArrayCoerceExpr();
+ else if (MATCH("CONVERTROWTYPEEXPR", 18))
+ return_value = _readConvertRowtypeExpr();
+ else if (MATCH("COLLATE", 7))
+ return_value = _readCollateExpr();
+ else if (MATCH("CASE", 4))
+ return_value = _readCaseExpr();
+ else if (MATCH("WHEN", 4))
+ return_value = _readCaseWhen();
+ else if (MATCH("CASETESTEXPR", 12))
+ return_value = _readCaseTestExpr();
+ else if (MATCH("ARRAY", 5))
+ return_value = _readArrayExpr();
+ else if (MATCH("ROW", 3))
+ return_value = _readRowExpr();
+ else if (MATCH("ROWCOMPARE", 10))
+ return_value = _readRowCompareExpr();
+ else if (MATCH("COALESCE", 8))
+ return_value = _readCoalesceExpr();
+ else if (MATCH("MINMAX", 6))
+ return_value = _readMinMaxExpr();
+ else if (MATCH("XMLEXPR", 7))
+ return_value = _readXmlExpr();
+ else if (MATCH("NULLTEST", 8))
+ return_value = _readNullTest();
+ else if (MATCH("BOOLEANTEST", 11))
+ return_value = _readBooleanTest();
+ else if (MATCH("COERCETODOMAIN", 14))
+ return_value = _readCoerceToDomain();
+ else if (MATCH("COERCETODOMAINVALUE", 19))
+ return_value = _readCoerceToDomainValue();
+ else if (MATCH("SETTODEFAULT", 12))
+ return_value = _readSetToDefault();
+ else if (MATCH("CURRENTOFEXPR", 13))
+ return_value = _readCurrentOfExpr();
+ else if (MATCH("INFERENCEELEM", 13))
+ return_value = _readInferenceElem();
else if (MATCH("TARGETENTRY", 11))
return_value = _readTargetEntry();
else if (MATCH("RANGETBLREF", 11))
#include <math.h>
+#include "catalog/pg_namespace.h"
#include "access/sysattr.h"
+ #include "access/tsmapi.h"
#include "catalog/pg_class.h"
#include "catalog/pg_operator.h"
#include "foreign/fdwapi.h"
case RTE_VALUES:
child_rte->lateral = true;
break;
- case RTE_RELATION:
case RTE_JOIN:
case RTE_CTE:
+#ifdef XCP
+ case RTE_REMOTE_DUMMY:
+#endif
/* these can't contain any lateral references */
break;
}
pullup_replace_vars((Node *) rte->values_lists,
context);
break;
- case RTE_RELATION:
case RTE_JOIN:
case RTE_CTE:
+#ifdef XCP
+ case RTE_REMOTE_DUMMY:
+#endif
/* these shouldn't be marked LATERAL */
Assert(false);
break;
required_outer);
pathnode->pathkeys = NIL; /* samplescan has unordered result */
- cost_samplescan(pathnode, root, rel);
+#ifdef XCP
+ set_scanpath_distribution(root, rel, pathnode);
+ if (rel->baserestrictinfo)
+ {
+ ListCell *lc;
+ foreach (lc, rel->baserestrictinfo)
+ {
+ RestrictInfo *ri = (RestrictInfo *) lfirst(lc);
+ restrict_distribution(root, ri, pathnode);
+ }
+ }
+#endif
+
+ cost_samplescan(pathnode, root, rel, pathnode->param_info);
return pathnode;
}
loop_count);
}
case T_SubqueryScan:
+#ifdef XCP
+ return create_subqueryscan_path(root, rel, path->pathkeys,
+ required_outer, path->distribution);
+#else
return create_subqueryscan_path(root, rel, path->pathkeys,
required_outer);
- case T_SampleScan:
- return (Path *) create_samplescan_path(root, rel, required_outer);
+#endif
default:
break;
}
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * pgxcship.c
+ * Routines to evaluate expression shippability to remote nodes
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012, Postgres-XC Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/optimizer/util/pgxcship.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "catalog/pg_class.h"
+#include "catalog/pg_inherits_fn.h"
+#include "catalog/pg_namespace.h"
+#include "catalog/pg_proc.h"
+#ifdef PGXC
+#include "catalog/pg_trigger.h"
+#endif
+#include "catalog/pg_type.h"
+#include "catalog/pgxc_node.h"
+#include "commands/trigger.h"
+#include "nodes/nodeFuncs.h"
+#include "nodes/relation.h"
+#include "optimizer/clauses.h"
+#include "optimizer/pgxcplan.h"
+#include "optimizer/pgxcship.h"
+#include "optimizer/tlist.h"
+#include "parser/parsetree.h"
+#include "parser/parse_coerce.h"
+#include "parser/parse_type.h"
+#include "pgxc/locator.h"
+#include "pgxc/pgxcnode.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+
+
+/*
+ * Shippability_context
+ * This context structure is used by the Fast Query Shipping walker to gather
+ * information while analysing a query for Fast Query Shipping.
+ */
+typedef struct
+{
+ bool sc_for_expr; /* if false, then we are checking shippability
+ * of the Query; otherwise we are checking
+ * shippability of a stand-alone expression.
+ */
+ Bitmapset *sc_shippability; /* The conditions for (un)shippability of the
+ * query.
+ */
+ Query *sc_query; /* the query being analysed for FQS */
+ int sc_query_level; /* level of the query */
+ int sc_max_varlevelsup; /* maximum upper level referred to by any
+ * variable reference in the query. If this
+ * value is greater than 0, the query is not
+ * shippable on its own.
+ */
+ ExecNodes *sc_exec_nodes; /* nodes where the query should be executed */
+ ExecNodes *sc_subquery_en; /* ExecNodes produced by merging the ExecNodes
+ * for individual subqueries. This gets
+ * ultimately merged with sc_exec_nodes.
+ */
+ bool sc_groupby_has_distcol; /* GROUP BY clause has distribution column */
+} Shippability_context;
+
+/*
+ * ShippabilityStat
+ * List of reasons why a query/expression is not shippable to remote nodes.
+ */
+typedef enum
+{
+ SS_UNSHIPPABLE_EXPR = 0, /* it has unshippable expression */
+ SS_NEED_SINGLENODE, /* Has expressions which can be evaluated when
+ * there is only a single node involved.
+ * Although aggregates also fit in this class, we
+ * have a separate status to report aggregates;
+ * see below.
+ */
+ SS_NEEDS_COORD, /* the query needs Coordinator */
+ SS_VARLEVEL, /* one of its subqueries has a VAR
+ * referencing an upper level query
+ * relation
+ */
+ SS_NO_NODES, /* no suitable nodes can be found to ship
+ * the query
+ */
+ SS_UNSUPPORTED_EXPR, /* it has expressions currently unsupported
+ * by FQS, but such expressions might be
+ * supported by FQS in future
+ */
+ SS_HAS_AGG_EXPR, /* it has aggregate expressions */
+ SS_UNSHIPPABLE_TYPE, /* the type of expression is unshippable */
+ SS_UNSHIPPABLE_TRIGGER, /* the type of trigger is unshippable */
+ SS_UPDATES_DISTRIBUTION_COLUMN /* query updates the distribution column */
+} ShippabilityStat;
+
+/* Manipulation of shippability reason */
+static bool pgxc_test_shippability_reason(Shippability_context *context,
+ ShippabilityStat reason);
+static void pgxc_set_shippability_reason(Shippability_context *context,
+ ShippabilityStat reason);
+static void pgxc_reset_shippability_reason(Shippability_context *context,
+ ShippabilityStat reason);
+
+/* Evaluation of shippability */
+static bool pgxc_shippability_walker(Node *node, Shippability_context *sc_context);
+static void pgxc_set_exprtype_shippability(Oid exprtype, Shippability_context *sc_context);
+
+/* Fast-query shipping (FQS) functions */
+static ExecNodes *pgxc_FQS_get_relation_nodes(RangeTblEntry *rte,
+ Index varno,
+ Query *query);
+static ExecNodes *pgxc_FQS_find_datanodes(Query *query);
+static bool pgxc_query_needs_coord(Query *query);
+static bool pgxc_query_contains_only_pg_catalog(List *rtable);
+static bool pgxc_is_var_distrib_column(Var *var, List *rtable);
+static bool pgxc_distinct_has_distcol(Query *query);
+static bool pgxc_targetlist_has_distcol(Query *query);
+static ExecNodes *pgxc_FQS_find_datanodes_recurse(Node *node, Query *query,
+ Bitmapset **relids);
+static ExecNodes *pgxc_FQS_datanodes_for_rtr(Index varno, Query *query);
+
+/*
+ * pgxc_set_shippability_reason
+ * Set the given reason in Shippability_context indicating why the query
+ * cannot be shipped directly to remote nodes.
+ */
+static void
+pgxc_set_shippability_reason(Shippability_context *context, ShippabilityStat reason)
+{
+ context->sc_shippability = bms_add_member(context->sc_shippability, reason);
+}
+
+/*
+ * pgxc_reset_shippability_reason
+ * Reset reason why the query cannot be shipped to remote nodes
+ */
+static void
+pgxc_reset_shippability_reason(Shippability_context *context, ShippabilityStat reason)
+{
+ context->sc_shippability = bms_del_member(context->sc_shippability, reason);
+ return;
+}
+
+
+/*
+ * pgxc_test_shippability_reason
+ * See if a given reason is why the query cannot be shipped directly
+ * to the remote nodes.
+ */
+static bool
+pgxc_test_shippability_reason(Shippability_context *context, ShippabilityStat reason)
+{
+ return bms_is_member(reason, context->sc_shippability);
+}
+
+
+/*
+ * pgxc_set_exprtype_shippability
+ * Set the expression type shippability. For now, composite types
+ * derived from sequences, views and foreign tables are not shippable.
+ */
+static void
+pgxc_set_exprtype_shippability(Oid exprtype, Shippability_context *sc_context)
+{
+ char typerelkind;
+
+ typerelkind = get_rel_relkind(typeidTypeRelid(exprtype));
+
+ if (typerelkind == RELKIND_SEQUENCE ||
+ typerelkind == RELKIND_VIEW ||
+ typerelkind == RELKIND_FOREIGN_TABLE)
+ pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_TYPE);
+}
+
+/*
+ * pgxc_FQS_datanodes_for_rtr
+ * For a given RangeTblRef, find the datanodes where the corresponding
+ * data is located.
+ */
+static ExecNodes *
+pgxc_FQS_datanodes_for_rtr(Index varno, Query *query)
+{
+ RangeTblEntry *rte = rt_fetch(varno, query->rtable);
+ switch (rte->rtekind)
+ {
+ case RTE_RELATION:
+ {
+ /* For anything other than a plain table, we can't find the datanodes */
+ if (rte->relkind != RELKIND_RELATION)
+ return NULL;
+ /*
+ * In case of inheritance, child tables can have a completely different
+ * Datanode distribution than the parent. To handle inheritance we would
+ * need to merge the Datanodes of the child tables as well. Inheritance
+ * is resolved during planning, so we may not have the RTEs of the
+ * children here. Also, the exact method of merging the children's
+ * Datanodes is not known yet. So, when inheritance is requested, the
+ * query cannot be shipped.
+ * See the prologue of has_subclass(): we might miss the optimization
+ * because has_subclass() can return true even if there aren't any
+ * subclasses, but that's OK.
+ */
+ if (rte->inh && has_subclass(rte->relid))
+ return NULL;
+
+ return pgxc_FQS_get_relation_nodes(rte, varno, query);
+ }
+ break;
+
+ /* For any other type of RTE, we return NULL for now */
+ case RTE_JOIN:
+ case RTE_CTE:
+ case RTE_SUBQUERY:
+ case RTE_FUNCTION:
+ case RTE_VALUES:
+ default:
+ return NULL;
+ }
+}
+
+/*
+ * pgxc_FQS_find_datanodes_recurse
+ * Recursively determine whether the subtree of the FROM expression rooted
+ * at the given node is pushable, and if so, to which datanodes.
+ */
+static ExecNodes *
+pgxc_FQS_find_datanodes_recurse(Node *node, Query *query, Bitmapset **relids)
+{
+ List *query_rtable = query->rtable;
+
+ if (!node)
+ return NULL;
+
+ switch(nodeTag(node))
+ {
+ case T_FromExpr:
+ {
+ FromExpr *from_expr = (FromExpr *)node;
+ ListCell *lcell;
+ bool first;
+ Bitmapset *from_relids;
+ ExecNodes *result_en;
+
+ /*
+ * For INSERT commands, we won't have any entries in the from list.
+ * Get the datanodes using the resultRelation index.
+ */
+ if (query->commandType != CMD_SELECT && !from_expr->fromlist)
+ {
+ *relids = bms_make_singleton(query->resultRelation);
+ return pgxc_FQS_datanodes_for_rtr(query->resultRelation,
+ query);
+ }
+
+ /*
+ * All the entries in the From list are considered to be INNER
+ * joined with the quals as the JOIN condition. Get the datanodes
+ * for the first entry in the From list. For every subsequent entry
+ * determine whether the join between the relation in that entry and
+ * the cumulative JOIN of previous entries can be pushed down to the
+ * datanodes and the corresponding set of datanodes where the join
+ * can be pushed down.
+ */
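+ /*
+ * For example, a hypothetical "FROM r1, r2, r3 WHERE quals" is treated
+ * as ((r1 INNER JOIN r2) INNER JOIN r3), with the quals as the JOIN
+ * condition at each step.
+ */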
+ first = true;
+ result_en = NULL;
+ from_relids = NULL;
+ foreach (lcell, from_expr->fromlist)
+ {
+ Node *fromlist_entry = lfirst(lcell);
+ Bitmapset *fle_relids = NULL;
+ ExecNodes *tmp_en;
+ ExecNodes *en = pgxc_FQS_find_datanodes_recurse(fromlist_entry,
+ query, &fle_relids);
+ /*
+ * If any entry in fromlist is not shippable, jointree is not
+ * shippable
+ */
+ if (!en)
+ {
+ FreeExecNodes(&result_en);
+ return NULL;
+ }
+
+ /* FQS doesn't ship a DML with more than one relation involved */
+ if (!first && query->commandType != CMD_SELECT)
+ {
+ FreeExecNodes(&result_en);
+ return NULL;
+ }
+
+ if (first)
+ {
+ first = false;
+ result_en = en;
+ from_relids = fle_relids;
+ continue;
+ }
+
+ tmp_en = result_en;
+ /*
+ * Check whether the JOIN is pushable to the datanodes and
+ * find the datanodes where the JOIN can be pushed to
+ */
+ result_en = pgxc_is_join_shippable(result_en, en, from_relids,
+ fle_relids, JOIN_INNER,
+ make_ands_implicit((Expr *)from_expr->quals),
+ query_rtable);
+ from_relids = bms_join(from_relids, fle_relids);
+ FreeExecNodes(&tmp_en);
+ }
+
+ *relids = from_relids;
+ return result_en;
+ }
+ break;
+
+ case T_RangeTblRef:
+ {
+ RangeTblRef *rtr = (RangeTblRef *)node;
+ *relids = bms_make_singleton(rtr->rtindex);
+ return pgxc_FQS_datanodes_for_rtr(rtr->rtindex, query);
+ }
+ break;
+
+ case T_JoinExpr:
+ {
+ JoinExpr *join_expr = (JoinExpr *)node;
+ Bitmapset *l_relids = NULL;
+ Bitmapset *r_relids = NULL;
+ ExecNodes *len;
+ ExecNodes *ren;
+ ExecNodes *result_en;
+
+ /* FQS doesn't ship a DML with more than one relation involved */
+ if (query->commandType != CMD_SELECT)
+ return NULL;
+
+ len = pgxc_FQS_find_datanodes_recurse(join_expr->larg, query,
+ &l_relids);
+ ren = pgxc_FQS_find_datanodes_recurse(join_expr->rarg, query,
+ &r_relids);
+ /* If either side of JOIN is unshippable, JOIN is unshippable */
+ if (!len || !ren)
+ {
+ FreeExecNodes(&len);
+ FreeExecNodes(&ren);
+ return NULL;
+ }
+ /*
+ * Check whether the JOIN is pushable or not, and find the datanodes
+ * where the JOIN can be pushed to.
+ */
+ result_en = pgxc_is_join_shippable(ren, len, r_relids, l_relids,
+ join_expr->jointype,
+ make_ands_implicit((Expr *)join_expr->quals),
+ query_rtable);
+ FreeExecNodes(&len);
+ FreeExecNodes(&ren);
+ *relids = bms_join(l_relids, r_relids);
+ return result_en;
+ }
+ break;
+
+ default:
+ *relids = NULL;
+ return NULL;
+ break;
+ }
+ /* Keep compiler happy */
+ return NULL;
+}
+
+/*
+ * pgxc_FQS_find_datanodes
+ * Find the list of nodes to which the query can be shipped.
+ */
+static ExecNodes *
+pgxc_FQS_find_datanodes(Query *query)
+{
+ Bitmapset *relids = NULL;
+ ExecNodes *exec_nodes;
+
+ /*
+ * For SELECT, the datanodes required to execute the query are obtained
+ * from the join tree of the query.
+ */
+ exec_nodes = pgxc_FQS_find_datanodes_recurse((Node *)query->jointree,
+ query, &relids);
+ bms_free(relids);
+ relids = NULL;
+
+ /* If we found the datanodes to ship, use them */
+ if (exec_nodes && exec_nodes->nodeList)
+ {
+ /*
+ * If relations involved in the query are such that ultimate JOIN is
+ * replicated JOIN, choose only one of them. If one of them is a
+ * preferred node choose that one, otherwise choose the first one.
+ */
+ if (IsLocatorReplicated(exec_nodes->baselocatortype) &&
+ exec_nodes->accesstype == RELATION_ACCESS_READ)
+ {
+ List *tmp_list = exec_nodes->nodeList;
+ exec_nodes->nodeList = GetPreferredReplicationNode(exec_nodes->nodeList);
+ list_free(tmp_list);
+ }
+ return exec_nodes;
+ }
+ /*
+ * If we found an expression which can be used to decide where to ship
+ * the query, use that
+ */
+ else if (exec_nodes && exec_nodes->en_expr)
+ return exec_nodes;
+ /* No way to figure out datanodes to ship the query to */
+ return NULL;
+}
+
+
+/*
+ * pgxc_FQS_get_relation_nodes
+ * Return ExecNodes structure so as to decide which node the query should
+ * execute on. If it is possible to set the node list directly, set it.
+ * Otherwise set the appropriate distribution column expression or relid in
+ * ExecNodes structure.
+ */
+static ExecNodes *
+pgxc_FQS_get_relation_nodes(RangeTblEntry *rte, Index varno, Query *query)
+{
+ CmdType command_type = query->commandType;
+ bool for_update = query->rowMarks ? true : false;
+ ExecNodes *rel_exec_nodes;
+ RelationAccessType rel_access = RELATION_ACCESS_READ;
+ RelationLocInfo *rel_loc_info;
+
+ Assert(rte == rt_fetch(varno, (query->rtable)));
+
+ switch (command_type)
+ {
+ case CMD_SELECT:
+ if (for_update)
+ rel_access = RELATION_ACCESS_READ_FOR_UPDATE;
+ else
+ rel_access = RELATION_ACCESS_READ;
+ break;
+
+ case CMD_UPDATE:
+ case CMD_DELETE:
+ rel_access = RELATION_ACCESS_UPDATE;
+ break;
+
+ case CMD_INSERT:
+ rel_access = RELATION_ACCESS_INSERT;
+ break;
+
+ default:
+ /* should not happen, but be safe */
+ elog(ERROR, "unrecognized command type: %d", command_type);
+ break;
+ }
+
+ rel_loc_info = GetRelationLocInfo(rte->relid);
+ /* If we don't know about the distribution of relation, bail out */
+ if (!rel_loc_info)
+ return NULL;
+
+ /*
+ * Find out the datanodes to execute this query on.
+ * PGXC_FQS_TODO: for now, we apply node reduction only when there is only
+ * one relation involved in the query. If there are multiple distributed
+ * tables in the query and we apply node reduction here, we may fail to ship
+ * the entire join. We should apply node reduction transitively.
+ */
+ if (list_length(query->rtable) == 1)
+ rel_exec_nodes = GetRelationNodesByQuals(rte->relid, varno,
+ query->jointree->quals, rel_access);
+ else
+ rel_exec_nodes = GetRelationNodes(rel_loc_info, (Datum) 0,
+ true, rel_access);
+
+ if (!rel_exec_nodes)
+ return NULL;
+
+ if (rel_access == RELATION_ACCESS_INSERT &&
+ IsRelationDistributedByValue(rel_loc_info))
+ {
+ ListCell *lc;
+ TargetEntry *tle;
+ /*
+ * If the INSERT is happening on a table distributed by the value of a
+ * column, find the expression for the distribution column in the
+ * targetlist, stick it in ExecNodes, and clear the node list. The
+ * executor will then find out where to insert the row.
+ */
+ /* It is a partitioned table, get value by looking in targetList */
+ foreach(lc, query->targetList)
+ {
+ tle = (TargetEntry *) lfirst(lc);
+
+ if (tle->resjunk)
+ continue;
+ if (strcmp(tle->resname, GetRelationDistribColumn(rel_loc_info)) == 0)
+ break;
+ }
+ /* Not found, bail out */
+ if (!lc)
+ return NULL;
+
+ Assert(tle);
+ /* We found the TargetEntry for the partition column */
+ list_free(rel_exec_nodes->primarynodelist);
+ rel_exec_nodes->primarynodelist = NULL;
+ list_free(rel_exec_nodes->nodeList);
+ rel_exec_nodes->nodeList = NULL;
+ rel_exec_nodes->en_expr = tle->expr;
+ rel_exec_nodes->en_relid = rel_loc_info->relid;
+ }
+ return rel_exec_nodes;
+}
+
+bool
+pgxc_query_has_distcolgrouping(Query *query)
+{
+ ListCell *lcell;
+ foreach (lcell, query->groupClause)
+ {
+ SortGroupClause *sgc = lfirst(lcell);
+ Node *sgc_expr;
+ if (!IsA(sgc, SortGroupClause))
+ continue;
+ sgc_expr = get_sortgroupclause_expr(sgc, query->targetList);
+ if (IsA(sgc_expr, Var) &&
+ pgxc_is_var_distrib_column((Var *)sgc_expr, query->rtable))
+ return true;
+ }
+ return false;
+}
+
+static bool
+pgxc_distinct_has_distcol(Query *query)
+{
+ ListCell *lcell;
+ foreach (lcell, query->distinctClause)
+ {
+ SortGroupClause *sgc = lfirst(lcell);
+ Node *sgc_expr;
+ if (!IsA(sgc, SortGroupClause))
+ continue;
+ sgc_expr = get_sortgroupclause_expr(sgc, query->targetList);
+ if (IsA(sgc_expr, Var) &&
+ pgxc_is_var_distrib_column((Var *)sgc_expr, query->rtable))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * pgxc_shippability_walker
+ * walks the query/expression tree rooted at the node passed in, gathering
+ * information which will help decide whether the query to which this node
+ * belongs is shippable to the Datanodes.
+ *
+ * The function should try to walk the entire tree analysing each subquery for
+ * shippability. If a subquery is shippable but not the whole query, we would be
+ * able to create a RemoteQuery node for that subquery, shipping it to the
+ * Datanode.
+ *
+ * Return value of this function is governed by the same rules as
+ * expression_tree_walker(), see prologue of that function for details.
+ */
+static bool
+pgxc_shippability_walker(Node *node, Shippability_context *sc_context)
+{
+ if (node == NULL)
+ return false;
+
+ /*
+ * Below is the list of nodes that can appear in a query. Examine each
+ * kind of node and find out under what conditions a query with this node
+ * can be shippable. For each node, update the context (add fields if
+ * necessary) so that a decision whether to FQS the query or not can be
+ * made. Every node which has a result is checked to see if the result
+ * type of that expression is shippable.
+ */
+ switch(nodeTag(node))
+ {
+ /* Constants are always shippable */
+ case T_Const:
+ pgxc_set_exprtype_shippability(exprType(node), sc_context);
+ break;
+
+ /*
+ * For placeholder nodes the shippability of the node depends upon the
+ * expression which it refers to. That will be checked separately, when
+ * that expression is encountered.
+ */
+ case T_CaseTestExpr:
+ pgxc_set_exprtype_shippability(exprType(node), sc_context);
+ break;
+
+ /*
+ * The record_in() function throws an error, so requesting a result in
+ * the form of an anonymous record from a datanode results in an error.
+ * Hence, if the top expression of a target entry is ROW(), it's not
+ * shippable.
+ */
+ case T_TargetEntry:
+ {
+ TargetEntry *tle = (TargetEntry *)node;
+ if (tle->expr)
+ {
+ char typtype = get_typtype(exprType((Node *)tle->expr));
+ if (!typtype || typtype == TYPTYPE_PSEUDO)
+ pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+ }
+ }
+ break;
+
+ case T_SortGroupClause:
+ if (sc_context->sc_for_expr)
+ pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+ break;
+
+ case T_CoerceViaIO:
+ {
+ CoerceViaIO *cvio = (CoerceViaIO *)node;
+ Oid input_type = exprType((Node *)cvio->arg);
+ Oid output_type = cvio->resulttype;
+ CoercionContext cc;
+
+ cc = cvio->coerceformat == COERCE_IMPLICIT_CAST ? COERCION_IMPLICIT :
+ COERCION_EXPLICIT;
+ /*
+ * Internally we use I/O coercion for types which do not have a cast
+ * defined for them, e.g. cstring::date. Such a cast is deparsed as an
+ * explicit cast, which the datanode won't accept. Hence such casts
+ * are unshippable.
+ */
+ if (!can_coerce_type(1, &input_type, &output_type, cc))
+ pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+ pgxc_set_exprtype_shippability(exprType(node), sc_context);
+ }
+ break;
+ /*
+ * Nodes, which are shippable if the tree rooted under these nodes is
+ * shippable
+ */
+ case T_CoerceToDomainValue:
+ /*
+ * PGXCTODO: mostly, CoerceToDomainValue node appears in DDLs,
+ * do we handle DDLs here?
+ */
+ case T_FieldSelect:
+ case T_NamedArgExpr:
+ case T_RelabelType:
+ case T_BoolExpr:
+ /*
+ * PGXCTODO: we might need to take into account the kind of boolean
+ * operator we have in the quals and see if the corresponding
+ * function is immutable.
+ */
+ case T_ArrayCoerceExpr:
+ case T_ConvertRowtypeExpr:
+ case T_CaseExpr:
+ case T_ArrayExpr:
+ case T_RowExpr:
+ case T_CollateExpr:
+ case T_CoalesceExpr:
+ case T_XmlExpr:
+ case T_NullTest:
+ case T_BooleanTest:
+ case T_CoerceToDomain:
+ pgxc_set_exprtype_shippability(exprType(node), sc_context);
+ break;
+
+ case T_List:
+ case T_RangeTblRef:
+ break;
+
+ case T_ArrayRef:
+ /*
+ * When multiple elements of an array are updated at once, the
+ * FQS planner cannot yet produce a correct SQL representation.
+ * So disable FQS in this case and let the standard planner manage it.
+ */
+ case T_FieldStore:
+ /*
+ * PostgreSQL deparsing logic does not handle a FieldStore
+ * for more than one field (see processIndirection()). So, let's
+ * handle it through the standard planner, where the whole row will
+ * be constructed.
+ */
+ case T_SetToDefault:
+ /*
+ * PGXCTODO: we should actually check whether the default value to
+ * be substituted is shippable to the Datanode. Some cases like
+ * nextval() of a sequence can not be shipped to the Datanode, hence
+ * for now default values can not be shipped to the Datanodes
+ */
+ pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+ pgxc_set_exprtype_shippability(exprType(node), sc_context);
+ break;
+
+ case T_Var:
+ {
+ Var *var = (Var *)node;
+ /*
+ * If a subquery references an upper-level variable, that query is
+ * not shippable if shipped alone.
+ */
+ if (var->varlevelsup > sc_context->sc_max_varlevelsup)
+ sc_context->sc_max_varlevelsup = var->varlevelsup;
+ pgxc_set_exprtype_shippability(exprType(node), sc_context);
+ }
+ break;
+
+ case T_Param:
+ {
+ Param *param = (Param *)node;
+ /* PGXCTODO: Can we handle internally generated parameters? */
+ if (param->paramkind != PARAM_EXTERN)
+ pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+ pgxc_set_exprtype_shippability(exprType(node), sc_context);
+ }
+ break;
+
+ case T_CurrentOfExpr:
+ {
+ /*
+ * Ideally we should not see CurrentOf expression here, it
+ * should have been replaced by the CTID = ? expression. But
+ * still, no harm in shipping it as is.
+ */
+ pgxc_set_exprtype_shippability(exprType(node), sc_context);
+ }
+ break;
+
+ case T_Aggref:
+ {
+ Aggref *aggref = (Aggref *)node;
+ /*
+ * An aggregate is completely shippable to the Datanode if the
+ * whole group resides on that Datanode. This will become clear
+ * when we see the GROUP BY clause.
+ * agglevelsup is the minimum of the variables' varlevelsup, so we
+ * will set sc_max_varlevelsup when we reach the appropriate
+ * Vars in the tree.
+ */
+ pgxc_set_shippability_reason(sc_context, SS_HAS_AGG_EXPR);
+ /*
+ * A stand-alone expression cannot be shipped to the datanodes if it
+ * is
+ * 1. an aggregate with ORDER BY or DISTINCT directives, since it
+ * needs all the qualifying rows,
+ * 2. an aggregate without a collection function, or
+ * 3. (PGXCTODO:) an aggregate with a polymorphic transition type:
+ * the transition type needs to be resolved to correctly interpret
+ * the transition results from the Datanodes.
+ */
+ if (aggref->aggorder ||
+ aggref->aggdistinct ||
+ aggref->agglevelsup ||
+ !aggref->agghas_collectfn ||
+ IsPolymorphicType(aggref->aggtrantype))
+ pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
+
+ pgxc_set_exprtype_shippability(exprType(node), sc_context);
+ }
+ break;
+
+ case T_FuncExpr:
+ {
+ FuncExpr *funcexpr = (FuncExpr *)node;
+ /*
+ * PGXC_FQS_TODO: it's too restrictive not to ship non-immutable
+ * functions to the Datanode. We need a better way to see what
+ * can be shipped to the Datanode and what can not be.
+ */
+ if (!pgxc_is_func_shippable(funcexpr->funcid))
+ pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+
+ /*
+ * If this is a stand-alone expression and the function returns a
+ * set of rows, we need to handle it along with the final result of
+ * other expressions. So, it cannot be shipped.
+ */
+ if (funcexpr->funcretset && sc_context->sc_for_expr)
+ pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+
+ pgxc_set_exprtype_shippability(exprType(node), sc_context);
+ }
+ break;
+
+ case T_OpExpr:
+ case T_DistinctExpr: /* struct-equivalent to OpExpr */
+ case T_NullIfExpr: /* struct-equivalent to OpExpr */
+ {
+ /*
+ * All of these three are structurally equivalent to OpExpr, so
+ * cast the node to OpExpr and check if the operator function is
+ * immutable. See PGXC_FQS_TODO item for FuncExpr.
+ */
+ OpExpr *op_expr = (OpExpr *)node;
+ Oid opfuncid = OidIsValid(op_expr->opfuncid) ?
+ op_expr->opfuncid : get_opcode(op_expr->opno);
+ if (!OidIsValid(opfuncid) ||
+ !pgxc_is_func_shippable(opfuncid))
+ pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+
+ pgxc_set_exprtype_shippability(exprType(node), sc_context);
+ }
+ break;
+
+ case T_ScalarArrayOpExpr:
+ {
+ /*
+ * Check if the operator function is shippable to the Datanode
+ * PGXC_FQS_TODO: see immutability note for FuncExpr above
+ */
+ ScalarArrayOpExpr *sao_expr = (ScalarArrayOpExpr *)node;
+ Oid opfuncid = OidIsValid(sao_expr->opfuncid) ?
+ sao_expr->opfuncid : get_opcode(sao_expr->opno);
+ if (!OidIsValid(opfuncid) ||
+ !pgxc_is_func_shippable(opfuncid))
+ pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+ }
+ break;
+
+ case T_RowCompareExpr:
+ case T_MinMaxExpr:
+ {
+ /*
+ * PGXCTODO: should we be checking the comparison operator
+ * functions as well, as we did for OpExpr, or is that check
+ * unnecessary because operator functions are always shippable?
+ * Otherwise this node should be treated similarly to other
+ * "shell" nodes.
+ */
+ pgxc_set_exprtype_shippability(exprType(node), sc_context);
+ }
+ break;
+
+ case T_Query:
+ {
+ Query *query = (Query *)node;
+
+ /* PGXCTODO: if the query has a RETURNING list, it is not shippable as of now */
+ if (query->returningList)
+ pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+
+ /* A stand-alone expression containing Query is not shippable */
+ if (sc_context->sc_for_expr)
+ {
+ pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+ break;
+ }
+ /*
+ * We are checking shippability of the whole query, so go ahead. The
+ * query in the context should be the same as the query being checked.
+ */
+ Assert(query == sc_context->sc_query);
+
+ /* CREATE TABLE AS is not supported in FQS */
+ if (query->commandType == CMD_UTILITY &&
+ IsA(query->utilityStmt, CreateTableAsStmt))
+ pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+
+ if (query->hasRecursive)
+ pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+ /*
+ * If the query needs Coordinator for evaluation or the query can be
+ * completed on Coordinator itself, we don't ship it to the Datanode
+ */
+ if (pgxc_query_needs_coord(query))
+ pgxc_set_shippability_reason(sc_context, SS_NEEDS_COORD);
+
+ /* PGXCTODO: It should be possible to look at the Query and find out
+ * whether it can be completely evaluated on the Datanode just like SELECT
+ * queries. But we need to be careful while finding out the Datanodes to
+ * execute the query on, esp. for the result relations. If one happens to
+ * remove/change this restriction, make sure you change
+ * pgxc_FQS_get_relation_nodes appropriately.
+ * For now DMLs with single rtable entry are candidates for FQS
+ */
+ if (query->commandType != CMD_SELECT && list_length(query->rtable) > 1)
+ pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+
+ /*
+ * Under the following conditions the query is shippable only when
+ * there is a single Datanode involved:
+ * 1. the query has aggregates without grouping by the distribution
+ * column
+ * 2. the query has window functions
+ * 3. the query has an ORDER BY clause
+ * 4. the query has a DISTINCT clause without the distribution column
+ * in it
+ * 5. the query has LIMIT and OFFSET clauses
+ */
+ if (query->hasWindowFuncs || query->sortClause ||
+ query->limitOffset || query->limitCount)
+ pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
+
+ /*
+ * Presence of aggregates or a HAVING clause implies grouping. In
+ * such cases, the query won't be shippable unless 1. there is only
+ * a single node involved, or 2. the GROUP BY clause has the
+ * distribution column in it. In the latter case the aggregates for
+ * a given group are entirely computable on a single datanode,
+ * because all the rows participating in a particular group reside
+ * on that datanode. The distribution column can be of any relation
+ * participating in the query: all the rows of that relation with
+ * the same value of the distribution column reside on the same node.
+ */
+ if ((query->hasAggs || query->havingQual) &&
+ !pgxc_query_has_distcolgrouping(query))
+ pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
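+
+ /*
+ * For example, for a hypothetical table t(a int, b int) created with
+ * DISTRIBUTE BY HASH(a), "SELECT a, sum(b) FROM t GROUP BY a" groups
+ * by the distribution column, so each group resides entirely on one
+ * Datanode, whereas "SELECT b, sum(a) FROM t GROUP BY b" is shippable
+ * only when a single Datanode is involved.
+ */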
+
+ /*
+ * If the distribution column of any relation is present in the
+ * DISTINCT clause, the values of that column differ across nodes, so
+ * two nodes cannot produce the same result row. Hence in such a case
+ * we can execute the query on many nodes and still obtain a distinct
+ * result.
+ */
+ if (query->distinctClause && !pgxc_distinct_has_distcol(query))
+ pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
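+
+ /*
+ * Continuing the hypothetical table t above, "SELECT DISTINCT a
+ * FROM t" can run on many nodes, while "SELECT DISTINCT b FROM t"
+ * needs a single Datanode.
+ */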
+
+
+ if ((query->commandType == CMD_UPDATE) &&
+ pgxc_targetlist_has_distcol(query))
+ pgxc_set_shippability_reason(sc_context, SS_UPDATES_DISTRIBUTION_COLUMN);
+
+
+ /*
+ * Walk the entire query tree to analyse the query. We will walk the
+ * range table when examining the FROM clause; no need to do it
+ * here.
+ */
+ if (query_tree_walker(query, pgxc_shippability_walker,
+ sc_context, QTW_IGNORE_RANGE_TABLE ))
+ return true;
+
+ /*
+ * PGXC_FQS_TODO:
+ * There is a subquery in this query, which references Vars in the upper
+ * query. For now stop shipping such queries. We should get rid of this
+ * condition.
+ */
+ if (sc_context->sc_max_varlevelsup != 0)
+ pgxc_set_shippability_reason(sc_context, SS_VARLEVEL);
+
+ /*
+ * Walk the join tree of the query and find the
+ * Datanodes needed for evaluating this query
+ */
+ sc_context->sc_exec_nodes = pgxc_FQS_find_datanodes(query);
+ }
+ break;
+
+ case T_FromExpr:
+ {
+ /* We don't expect FromExpr in a stand-alone expression */
+ if (sc_context->sc_for_expr)
+ pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+
+ /*
+ * We will examine the jointree of the query separately to determine
+ * the set of datanodes where to execute the query.
+ * If this is an INSERT query with quals, resulting from, say, a
+ * conditional rule, we cannot handle it in FQS, since there is no
+ * SQL representation for such quals.
+ */
+ if (sc_context->sc_query->commandType == CMD_INSERT &&
+ ((FromExpr *)node)->quals)
+ pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+
+ }
+ break;
+
+ case T_WindowFunc:
+ {
+ WindowFunc *winf = (WindowFunc *)node;
+ /*
+ * A window function can be evaluated on a Datanode if there is
+ * only one Datanode involved.
+ */
+ pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
+
+ /*
+ * A window function is not shippable as part of a stand-alone
+ * expression. If the window function is non-immutable, it can not
+ * be shipped to the datanodes.
+ */
+ if (sc_context->sc_for_expr ||
+ !pgxc_is_func_shippable(winf->winfnoid))
+ pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+
+ pgxc_set_exprtype_shippability(exprType(node), sc_context);
+ }
+ break;
++
++ case T_GroupingFunc:
++ pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
++ break;
+
+ case T_WindowClause:
+ {
+ /*
+ * A window function can be evaluated on a Datanode if there is
+ * only one Datanode involved.
+ */
+ pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
+
+ /*
+ * A window function is not shippable as part of a stand-alone
+ * expression
+ */
+ if (sc_context->sc_for_expr)
+ pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+ }
+ break;
+
+ case T_JoinExpr:
+ /* We don't expect JoinExpr in a stand-alone expression */
+ if (sc_context->sc_for_expr)
+ pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+
+ /*
+ * The shippability of join will be deduced while
+ * examining the jointree of the query. Nothing to do here
+ */
+ break;
+
+ case T_SubLink:
+ {
+ /*
+ * We need to walk the tree in sublink to check for its
+ * shippability. We need to call pgxc_is_query_shippable() on Query
+ * instead of this function so that every subquery gets a different
+ * context for itself. We should avoid the default expression walker
+ * getting called on the subquery. At the same time we don't want to
+ * miss any other member (current or future) of this structure, from
+ * being scanned. So, copy the SubLink structure with subselect
+ * being NULL and call expression_tree_walker on the copied
+ * structure.
+ */
+ SubLink sublink = *(SubLink *)node;
+ ExecNodes *sublink_en;
+ /*
+ * Walk the query and find the nodes where the query should be
+ * executed and node distribution. Merge this with the existing
+ * node list obtained for other subqueries. If merging fails, we
+ * can not ship the whole query.
+ */
+ if (IsA(sublink.subselect, Query))
+ sublink_en = pgxc_is_query_shippable((Query *)(sublink.subselect),
+ sc_context->sc_query_level);
+ else
+ sublink_en = NULL;
+
+ /* PGXCTODO free the old sc_subquery_en. */
+ /* If we already know that this query does not have a set of nodes
+ * to evaluate on, don't bother to merge again.
+ */
+ if (!pgxc_test_shippability_reason(sc_context, SS_NO_NODES))
+ {
+ /*
+ * If this is the first time we are finding out the nodes for
+ * SubLink, we don't have anything to merge, just assign.
+ */
+ if (!sc_context->sc_subquery_en)
+ sc_context->sc_subquery_en = sublink_en;
+ /*
+ * Merge only if the accumulated SubLink ExecNodes and the
+ * ExecNodes for this subquery are both replicated.
+ */
+ else if (sublink_en && IsExecNodesReplicated(sublink_en) &&
+ IsExecNodesReplicated(sc_context->sc_subquery_en))
+ {
+ sc_context->sc_subquery_en = pgxc_merge_exec_nodes(sublink_en,
+ sc_context->sc_subquery_en);
+ }
+ else
+ sc_context->sc_subquery_en = NULL;
+
+ /*
+ * If we didn't find a cumulative ExecNodes, set shippability
+ * reason, so that we don't bother merging future sublinks.
+ */
+ if (!sc_context->sc_subquery_en)
+ pgxc_set_shippability_reason(sc_context, SS_NO_NODES);
+ }
+ else
+ Assert(!sc_context->sc_subquery_en);
+
+ /* Check if the type of sublink result is shippable */
+ pgxc_set_exprtype_shippability(exprType(node), sc_context);
+
+ /* Wipe out subselect as explained above and walk the copied tree */
+ sublink.subselect = NULL;
+ return expression_tree_walker((Node *)&sublink, pgxc_shippability_walker,
+ sc_context);
+ }
+ break;
+
+ case T_SubPlan:
+ case T_AlternativeSubPlan:
+ case T_CommonTableExpr:
+ case T_SetOperationStmt:
+ case T_PlaceHolderVar:
+ case T_AppendRelInfo:
+ case T_PlaceHolderInfo:
+ case T_OnConflictExpr:
+ case T_WithCheckOption:
+ {
+ /* PGXCTODO: till we exhaust this list */
+ pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+ /*
+ * These expressions are not supported for shippability at all, so
+ * there is no need to walk the trees underneath them. If we did, we
+ * might walk those trees with the wrong context.
+ */
+ return false;
+ }
+ break;
+
+ default:
+ elog(ERROR, "unrecognized node type: %d",
+ (int) nodeTag(node));
+ break;
+ }
+
+ return expression_tree_walker(node, pgxc_shippability_walker, (void *)sc_context);
+}
+
+
+/*
+ * pgxc_query_needs_coord
+ * Check if the query needs Coordinator for evaluation or it can be completely
+ * evaluated on Coordinator. Return true if so, otherwise return false.
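+ * For example, a catalog-only query such as "SELECT relname FROM pg_class"
+ * can be evaluated completely on the Coordinator.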
+ */
+static bool
+pgxc_query_needs_coord(Query *query)
+{
+ /*
+ * If the query involves just the catalog tables, and is not an EXEC DIRECT
+ * statement, it can be evaluated completely on the Coordinator. No need to
+ * involve Datanodes.
+ */
+ if (pgxc_query_contains_only_pg_catalog(query->rtable))
+ return true;
+
+ return false;
+}
+
+
+/*
+ * pgxc_is_var_distrib_column
+ * Check if given var is a distribution key.
+ */
+static bool
+pgxc_is_var_distrib_column(Var *var, List *rtable)
+{
+ RangeTblEntry *rte = rt_fetch(var->varno, rtable);
+ RelationLocInfo *rel_loc_info;
+
+ /* distribution columns apply only to plain relations */
+ if (rte->rtekind != RTE_RELATION ||
+ rte->relkind != RELKIND_RELATION)
+ return false;
+ rel_loc_info = GetRelationLocInfo(rte->relid);
+ if (!rel_loc_info)
+ return false;
+ if (var->varattno == rel_loc_info->partAttrNum)
+ return true;
+ return false;
+}
+
+
+/*
+ * Returns whether or not the rtable (and its subqueries)
+ * contains only pg_catalog entries.
+ */
+static bool
+pgxc_query_contains_only_pg_catalog(List *rtable)
+{
+ ListCell *item;
+
+ /* May be complicated. Before giving up, just check for pg_catalog usage */
+ foreach(item, rtable)
+ {
+ RangeTblEntry *rte = (RangeTblEntry *) lfirst(item);
+
+ if (rte->rtekind == RTE_RELATION)
+ {
+ if (get_rel_namespace(rte->relid) != PG_CATALOG_NAMESPACE)
+ return false;
+ }
+ else if (rte->rtekind == RTE_SUBQUERY &&
+ !pgxc_query_contains_only_pg_catalog(rte->subquery->rtable))
+ return false;
+ }
+ return true;
+}
+
+
+/*
+ * pgxc_is_query_shippable
+ * This function calls the query walker to analyse the query and gather
+ * information like the constraints under which the query can be shipped,
+ * the nodes on which the query is going to be executed, etc.
+ * Based on the information gathered, it decides whether the query can be
+ * executed on Datanodes directly without involving Coordinator.
+ * If the query is shippable this routine also returns the nodes where the query
+ * should be shipped. If the query is not shippable, it returns NULL.
+ */
+ExecNodes *
+pgxc_is_query_shippable(Query *query, int query_level)
+{
+ Shippability_context sc_context;
+ ExecNodes *exec_nodes;
+ bool canShip = true;
+ Bitmapset *shippability;
+
+ memset(&sc_context, 0, sizeof(sc_context));
+ /* let's assume that by default query is shippable */
+ sc_context.sc_query = query;
+ sc_context.sc_query_level = query_level;
+ sc_context.sc_for_expr = false;
+
+ /*
+ * We might have already decided not to ship the query to the Datanodes, but
+ * still walk it anyway to find out if there are any subqueries which can be
+ * shipped.
+ */
+ pgxc_shippability_walker((Node *)query, &sc_context);
+
+ exec_nodes = sc_context.sc_exec_nodes;
+ /*
+ * The shippability context contains two ExecNodes, one for the SubLinks
+ * involved in the Query and the other for the relations involved in the
+ * FROM clause. They are computed at different times while scanning the
+ * query. Merge both of them if they are both replicated. If the query
+ * doesn't have SubLinks, we don't need to consider the corresponding
+ * ExecNodes.
+ * PGXC_FQS_TODO:
+ * Merge the subquery ExecNodes if both of them are replicated.
+ * The logic to merge node lists with other distribution
+ * strategy is not clear yet.
+ */
+ if (query->hasSubLinks)
+ {
+ if (exec_nodes && IsExecNodesReplicated(exec_nodes) &&
+ sc_context.sc_subquery_en &&
+ IsExecNodesReplicated(sc_context.sc_subquery_en))
+ exec_nodes = pgxc_merge_exec_nodes(exec_nodes,
+ sc_context.sc_subquery_en);
+ else
+ exec_nodes = NULL;
+ }
+
+ /*
+ * Look at the information gathered by the walker in Shippability_context and that
+ * in the Query structure to decide whether we should ship this query
+ * directly to the Datanode or not
+ */
+
+ /*
+ * If the planner was not able to find the Datanodes to execute the
+ * query on, the query is not completely shippable. So, return NULL
+ */
+ if (!exec_nodes)
+ return NULL;
+
+ /*
+ * Copy the shippability reasons. We modify the copy for easier handling;
+ * the original is left untouched in the context.
+ */
+ shippability = bms_copy(sc_context.sc_shippability);
+
+ /*
+ * If the query has an expression which renders the shippability to single
+ * node, and query needs to be shipped to more than one node, it can not be
+ * shipped
+ */
+ if (bms_is_member(SS_NEED_SINGLENODE, shippability))
+ {
+ /*
+ * If nodeList has no nodes, the ExecNodes will have other means to
+ * know the nodes where to execute, like a distribution column
+ * expression. We can't tell how many nodes the query will be executed
+ * on, hence treat that as multiple nodes.
+ */
+ if (list_length(exec_nodes->nodeList) != 1)
+ canShip = false;
+
+ /* We handled the reason here, reset it */
+ shippability = bms_del_member(shippability, SS_NEED_SINGLENODE);
+ }
+
+ /*
+ * If HAS_AGG_EXPR is set but NEED_SINGLENODE is not set, it means the
+ * aggregates are entirely shippable, so don't worry about it.
+ */
+ shippability = bms_del_member(shippability, SS_HAS_AGG_EXPR);
+
+ /* Can not ship the query for some reason */
+ if (!bms_is_empty(shippability))
+ canShip = false;
+
+ /* Always keep this at the end before checking canShip and return */
+ if (!canShip && exec_nodes)
+ FreeExecNodes(&exec_nodes);
+ /* If query is to be shipped, we should know where to execute the query */
+ Assert (!canShip || exec_nodes);
+
+ bms_free(shippability);
+ shippability = NULL;
+
+ return exec_nodes;
+}
+
+
+/*
+ * pgxc_is_expr_shippable
+ * Check whether the given expression can be shipped to datanodes.
+ *
+ * Note on has_aggs
+ * Aggregate expressions are not shippable if they cannot be completely
+ * evaluated on a single datanode. But this function does not have enough
+ * context to determine the set of datanodes where the expression will be
+ * evaluated. Hence, if the caller of this function can handle aggregate
+ * expressions, it passes a non-NULL value for has_aggs, and this function
+ * reports whether the expression has any aggregates through that argument.
+ * If a caller passes a NULL value for has_aggs, this function assumes that
+ * the caller cannot handle the aggregates and deems the expression
+ * unshippable.
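+ *
+ * A typical use sketch: a caller that can finalize aggregates itself does
+ *     bool has_aggs;
+ *     if (pgxc_is_expr_shippable(expr, &has_aggs)) ...
+ * while a caller that cannot handle aggregates passes NULL, in which case
+ * an expression containing aggregates is reported as unshippable.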
+ */
+bool
+pgxc_is_expr_shippable(Expr *node, bool *has_aggs)
+{
+ Shippability_context sc_context;
+
+ /* Create the FQS context */
+ memset(&sc_context, 0, sizeof(sc_context));
+ sc_context.sc_query = NULL;
+ sc_context.sc_query_level = 0;
+ sc_context.sc_for_expr = true;
+
+ /* Walk the expression to check its shippability */
+ pgxc_shippability_walker((Node *)node, &sc_context);
+
+ /*
+ * If the caller is interested in knowing whether the expression has
+ * aggregates, let it know about it; such a caller is capable of handling
+ * those expressions. Otherwise treat an expression with aggregates as
+ * not shippable.
+ */
+ if (has_aggs)
+ *has_aggs = pgxc_test_shippability_reason(&sc_context, SS_HAS_AGG_EXPR);
+ else if (pgxc_test_shippability_reason(&sc_context, SS_HAS_AGG_EXPR))
+ return false;
+ /* Done with aggregate expression shippability. Delete the status */
+ pgxc_reset_shippability_reason(&sc_context, SS_HAS_AGG_EXPR);
+
+ /* If there are reasons why the expression is unshippable, return false */
+ if (!bms_is_empty(sc_context.sc_shippability))
+ return false;
+
+ /* If nothing wrong found, the expression is shippable */
+ return true;
+}
+
+
+/*
+ * pgxc_is_func_shippable
+ * Determine if a function is shippable
+ */
+bool
+pgxc_is_func_shippable(Oid funcid)
+{
+ /*
+ * For the time being a function is thought as shippable
+ * only if it is immutable.
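+ * For example, abs() (immutable) is shippable, while random() (volatile)
+ * and now() (stable) are not.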
+ */
+ return func_volatile(funcid) == PROVOLATILE_IMMUTABLE;
+}
+
+
+/*
+ * pgxc_find_dist_equijoin_qual
+ * Look for an equi-join condition between distribution columns of the given
+ * relations in the quals; return the qual expression if found, else NULL.
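+ * For example, for hypothetical tables r1 and r2, both created with
+ * DISTRIBUTE BY HASH(a), the qual "r1.a = r2.a" qualifies, while
+ * "r1.a = r2.b" does not, since b is not the distribution column of r2.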
+ */
+Expr *
+pgxc_find_dist_equijoin_qual(Relids varnos_1,
+ Relids varnos_2, Oid distcol_type, Node *quals, List *rtable)
+{
+ List *lquals;
+ ListCell *qcell;
+
+ /* If no quals, no equijoin */
+ if (!quals)
+ return NULL;
+ /*
+ * Make a copy of the argument bitmaps, it will be modified by
+ * bms_first_member().
+ */
+ varnos_1 = bms_copy(varnos_1);
+ varnos_2 = bms_copy(varnos_2);
+
+ if (!IsA(quals, List))
+ lquals = make_ands_implicit((Expr *)quals);
+ else
+ lquals = (List *)quals;
+
+ foreach(qcell, lquals)
+ {
+ Expr *qual_expr = (Expr *)lfirst(qcell);
+ OpExpr *op;
+ Var *lvar;
+ Var *rvar;
+
+ if (!IsA(qual_expr, OpExpr))
+ continue;
+ op = (OpExpr *)qual_expr;
+ /* If not a binary operator, it can not be '='. */
+ if (list_length(op->args) != 2)
+ continue;
+
+ /* Check if both operands are Vars; if not, check the next expression */
+ if (IsA(linitial(op->args), Var) && IsA(lsecond(op->args), Var))
+ {
+ lvar = (Var *)linitial(op->args);
+ rvar = (Var *)lsecond(op->args);
+ }
+ else
+ continue;
+
+ /*
+ * If the data types of both columns are not the same, continue. Hash
+ * and Modulo of the same bytes will be the same only if the data types
+ * are the same. So, only when the data types of the columns are the
+ * same can we ship a distributed JOIN to the Datanodes.
+ */
+ if (exprType((Node *)lvar) != exprType((Node *)rvar))
+ continue;
+
+ /* if the vars do not correspond to the required varnos, continue. */
+ if ((bms_is_member(lvar->varno, varnos_1) && bms_is_member(rvar->varno, varnos_2)) ||
+ (bms_is_member(lvar->varno, varnos_2) && bms_is_member(rvar->varno, varnos_1)))
+ {
+ if (!pgxc_is_var_distrib_column(lvar, rtable) ||
+ !pgxc_is_var_distrib_column(rvar, rtable))
+ continue;
+ }
+ else
+ continue;
+ /*
+ * If the operator is not an equality operator, check the next
+ * constraint. We treat an operator as an equality operator if it is
+ * mergejoinable or hashjoinable. Beware that not every equality
+ * operator is mergejoinable or hashjoinable, so we might miss some
+ * opportunity. But otherwise we would have to rely on the opname,
+ * which may not be something we know to be an equality operator
+ * either.
+ */
+ if (!op_mergejoinable(op->opno, exprType((Node *)lvar)) &&
+ !op_hashjoinable(op->opno, exprType((Node *)lvar)))
+ continue;
+ /* Found equi-join condition on distribution columns */
+ return qual_expr;
+ }
+ return NULL;
+}
+
+
+/*
+ * pgxc_merge_exec_nodes
+ * The routine combines the two exec_nodes passed such that the resultant
+ * exec_node corresponds to the JOIN of respective relations.
+ * If the two exec_nodes cannot be merged, it returns NULL.
+ */
+ExecNodes *
+pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2)
+{
+ ExecNodes *merged_en = makeNode(ExecNodes);
+ ExecNodes *tmp_en;
+
+ /* If either of the exec_nodes is NULL, return a copy of the other one */
+ if (!en1)
+ {
+ tmp_en = copyObject(en2);
+ return tmp_en;
+ }
+ if (!en2)
+ {
+ tmp_en = copyObject(en1);
+ return tmp_en;
+ }
+
+ /* Following cases are not handled in this routine */
+ /* PGXC_FQS_TODO how should we handle table usage type? */
+ if (en1->primarynodelist || en2->primarynodelist ||
+ en1->en_expr || en2->en_expr ||
+ OidIsValid(en1->en_relid) || OidIsValid(en2->en_relid) ||
+ en1->accesstype != RELATION_ACCESS_READ || en2->accesstype != RELATION_ACCESS_READ)
+ return NULL;
+
+ if (IsExecNodesReplicated(en1) &&
+ IsExecNodesReplicated(en2))
+ {
+ /*
+ * Replicated/replicated join case
+ * Check that replicated relation is not disjoint
+ * with initial relation which is also replicated.
+ * If there is a common portion of the node list between
+ * the two relations, other rtables have to be checked on
+ * this restricted list.
+ */
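+ /*
+ * For example, if one relation is replicated on nodes {1, 2, 3} and
+ * the other on nodes {2, 3}, the merged node list is {2, 3}.
+ */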
+ merged_en->nodeList = list_intersection_int(en1->nodeList,
+ en2->nodeList);
+ merged_en->baselocatortype = LOCATOR_TYPE_REPLICATED;
+ if (!merged_en->nodeList)
+ FreeExecNodes(&merged_en);
+ return merged_en;
+ }
+
+ if (IsExecNodesReplicated(en1) &&
+ IsExecNodesColumnDistributed(en2))
+ {
+ List *diff_nodelist = NULL;
+ /*
+ * Replicated/distributed join case.
+ * Node list of distributed table has to be included
+ * in node list of replicated table.
+ */
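+ /*
+ * For example, a table distributed on nodes {1, 2} joined with a table
+ * replicated on nodes {1, 2, 3} can be shipped to nodes {1, 2}, while
+ * a table replicated on node {1} alone cannot cover it.
+ */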
+ diff_nodelist = list_difference_int(en2->nodeList, en1->nodeList);
+ /*
+ * If the difference list is not empty, this means that node list of
+ * distributed table is not completely mapped by node list of replicated
+ * table, so go through standard planner.
+ */
+ if (diff_nodelist)
+ FreeExecNodes(&merged_en);
+ else
+ {
+ merged_en->nodeList = list_copy(en2->nodeList);
+ merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED;
+ }
+ return merged_en;
+ }
+
+ if (IsExecNodesColumnDistributed(en1) &&
+ IsExecNodesReplicated(en2))
+ {
+ List *diff_nodelist = NULL;
+ /*
+ * Distributed/replicated join case.
+ * Node list of distributed table has to be included
+ * in node list of replicated table.
+ */
+ diff_nodelist = list_difference_int(en1->nodeList, en2->nodeList);
+
+ /*
+ * If the difference list is not empty, this means that node list of
+ * distributed table is not completely mapped by node list of replicated
+ * table, so go through standard planner.
+ */
+ if (diff_nodelist)
+ FreeExecNodes(&merged_en);
+ else
+ {
+ merged_en->nodeList = list_copy(en1->nodeList);
+ merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED;
+ }
+ return merged_en;
+ }
+
+ if (IsExecNodesColumnDistributed(en1) &&
+ IsExecNodesColumnDistributed(en2))
+ {
+ /*
+ * Distributed/distributed case
+ * If the caller has suggested that this is an equi-join between two
+ * distributed results, check that they have the same nodes in the distribution
+ * node list. The caller is expected to fully decide whether to merge
+ * the nodes or not.
+ */
+ if (!list_difference_int(en1->nodeList, en2->nodeList) &&
+ !list_difference_int(en2->nodeList, en1->nodeList))
+ {
+ merged_en->nodeList = list_copy(en1->nodeList);
+ if (en1->baselocatortype == en2->baselocatortype)
+ merged_en->baselocatortype = en1->baselocatortype;
+ else
+ merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED;
+ }
+ else
+ FreeExecNodes(&merged_en);
+ return merged_en;
+ }
+
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XC does not support this distribution type yet"),
+ errdetail("The feature is not currently supported")));
+
+ /* Keep compiler happy */
+ return NULL;
+}
+
+
+/*
+ * pgxc_check_index_shippability
+ * Check shippability of index described by given conditions. This generic
+ * function can be called even if the index is not yet defined.
+ */
+bool
+pgxc_check_index_shippability(RelationLocInfo *relLocInfo,
+ bool is_primary,
+ bool is_unique,
+ bool is_exclusion,
+ List *indexAttrs,
+ List *indexExprs)
+{
+ bool result = true;
+ ListCell *lc;
+
+ /*
+ * Leave if no locator information, in this case shippability has no
+ * meaning.
+ */
+ if (!relLocInfo)
+ return result;
+
+ /*
+ * Scan the expressions used in the index and check the shippability of
+ * each of them. If even one of them is not shippable, the index is
+ * considered non-shippable. It is important to check the shippability of
+ * the expressions before refining the scan to the index columns and the
+ * distribution type of the parent relation.
+ */
+ foreach(lc, indexExprs)
+ {
+ if (!pgxc_is_expr_shippable((Expr *) lfirst(lc), NULL))
+ {
+ /* One of the expressions is not shippable, so leave */
+ result = false;
+ goto finish;
+ }
+ }
+
+ /*
+ * Check if relation is distributed on a single node, in this case
+ * the constraint can be shipped in all the cases.
+ */
+ if (list_length(relLocInfo->nodeList) == 1)
+ return result;
+
+ /*
+ * Check the case of EXCLUSION index.
+ * EXCLUSION constraints are shippable only for replicated relations, as
+ * such constraints require each tuple to be checked against all the
+ * others; if the tuple is correctly excluded from the others, the
+ * constraint is verified.
+ */
+ if (is_exclusion)
+ {
+ if (!IsRelationReplicated(relLocInfo))
+ {
+ result = false;
+ goto finish;
+ }
+ }
+
+ /*
+ * Check the case of PRIMARY KEY and UNIQUE indexes.
+ * Those constraints are shippable if the parent relation is replicated,
+ * or if the distribution column is included in the index columns.
+ */
+ if (is_unique ||
+ is_primary)
+ {
+ /*
+ * Perform different checks depending on distribution type of parent
+ * relation.
+ */
+ switch(relLocInfo->locatorType)
+ {
+ case LOCATOR_TYPE_REPLICATED:
+ /* In the replicated case this index is shippable */
+ result = true;
+ break;
+
+ case LOCATOR_TYPE_RROBIN:
+ /*
+ * Index on roundrobin parent table cannot be safely shipped
+ * because of the random behavior of data balancing.
+ */
+ result = false;
+ break;
+
+ case LOCATOR_TYPE_HASH:
+ case LOCATOR_TYPE_MODULO:
+ /*
+ * Unique indexes on Hash and Modulo tables are shippable if the
+ * index expression contains all the distribution expressions of
+ * its parent relation.
+ *
+ * Here is a short example with concatenation that cannot be
+ * shipped:
+ * CREATE TABLE aa (a text, b text) DISTRIBUTE BY HASH(a);
+ * CREATE UNIQUE INDEX aap ON aa((a || b));
+ * INSERT INTO aa VALUES ('a', 'abb');
+ * INSERT INTO aa VALUES ('aab', 'b'); -- no error ??!
+ * Both rows concatenate to 'aabb', but uniqueness is not enforced
+ * because the two INSERTs go to different nodes. For such simple
+ * reasons unique expression indexes on distributed tables are not
+ * shippable.
+ * Shippability is not even ensured if all the Vars used in the
+ * expressions are distribution columns, as the hash output of
+ * their value combination does not ensure that the query will
+ * be directed to the correct remote node. Uniqueness is not even
+ * protected if the index expression contains only the distribution
+ * column, as here with a cluster of 2 Datanodes:
+ * CREATE TABLE aa (a int) DISTRIBUTE BY HASH(a);
+ * CREATE UNIQUE INDEX aap ON aa (abs(a));
+ * INSERT INTO aa VALUES (2); -- to Datanode 1
+ * INSERT INTO aa VALUES (-2); -- to Datanode 2, breaks uniqueness
+ *
+ * PGXCTODO: for the time being the distribution key can only be
+ * defined on a single column, so this will need to be changed
+ * once a relation's distribution can be defined based on an
+ * expression of multiple columns.
+ */
+
+ /* Index contains expressions, it cannot be shipped safely */
+ if (indexExprs != NIL)
+ {
+ result = false;
+ break;
+ }
+
+ /* Nothing to do if no attributes */
+ if (indexAttrs == NIL)
+ break;
+
+ /*
+ * Check that distribution column is included in the list of
+ * index columns.
+ */
+ if (!list_member_int(indexAttrs, relLocInfo->partAttrNum))
+ {
+ /*
+ * The distribution column is not in the index column list,
+ * so the index cannot be enforced remotely.
+ */
+ result = false;
+ break;
+ }
+
+ /*
+ * By being here we are now sure that the index can be enforced
+ * remotely, as the distribution column is included in the index.
+ */
+ break;
+
+ /* Those types are not supported yet */
+ case LOCATOR_TYPE_RANGE:
+ case LOCATOR_TYPE_NONE:
+ case LOCATOR_TYPE_DISTRIBUTED:
+ case LOCATOR_TYPE_CUSTOM:
+ default:
+ /* Should not come here */
+ Assert(0);
+ }
+ }
+
+finish:
+ return result;
+}
+
+
+/*
+ * pgxc_check_fk_shippability
+ * Check the shippability of a parent and a child relation based on the
+ * distribution of each and the columns used to reference the parent and
+ * child relations. This can be used for inheritance or foreign key
+ * shippability evaluation.
+ */
+bool
+pgxc_check_fk_shippability(RelationLocInfo *parentLocInfo,
+ RelationLocInfo *childLocInfo,
+ List *parentRefs,
+ List *childRefs)
+{
+ bool result = true;
+
+ Assert(list_length(parentRefs) == list_length(childRefs));
+
+ /*
+ * If either child or parent have no relation data, shippability makes
+ * no sense.
+ */
+ if (!parentLocInfo || !childLocInfo)
+ return result;
+
+ /* In the case of a child referencing itself, the constraint is shippable */
+ if (IsLocatorInfoEqual(parentLocInfo, childLocInfo))
+ return result;
+
+ /* Now begin the evaluation */
+ switch (parentLocInfo->locatorType)
+ {
+ case LOCATOR_TYPE_REPLICATED:
+ /*
+ * If the parent relation is replicated, the child relation can
+ * always refer to it on all the nodes.
+ */
+ result = true;
+ break;
+
+ case LOCATOR_TYPE_RROBIN:
+ /*
+ * If the parent relation is based on round robin, the
+ * constraint cannot be enforced on remote nodes because of the
+ * random behavior of data balancing.
+ */
+ result = false;
+ break;
+
+ case LOCATOR_TYPE_HASH:
+ case LOCATOR_TYPE_MODULO:
+ /*
+ * If the parent table is distributed, the child table can
+ * reference its parent safely if the following conditions are
+ * satisfied:
+ * - parent and child are both hash-based, or both modulo-based
+ * - parent reference columns contain the distribution column
+ * of the parent relation
+ * - child reference columns contain the distribution column
+ * of the child relation
+ * - both child and parent map the same nodes for data location
+ */
+
+ /* A replicated child cannot refer to a distributed parent */
+ if (IsRelationReplicated(childLocInfo))
+ {
+ result = false;
+ break;
+ }
+
+ /*
+ * Parent and child need to have the same distribution type:
+ * hash or modulo.
+ */
+ if (parentLocInfo->locatorType != childLocInfo->locatorType)
+ {
+ result = false;
+ break;
+ }
+
+ /*
+ * Parent and child need to have their data located exactly
+ * on the same list of nodes.
+ */
+ if (list_difference_int(childLocInfo->nodeList, parentLocInfo->nodeList) ||
+ list_difference_int(parentLocInfo->nodeList, childLocInfo->nodeList))
+ {
+ result = false;
+ break;
+ }
+
+ /*
+ * Check that child and parents are referenced using their
+ * distribution column.
+ */
+ if (!list_member_int(childRefs, childLocInfo->partAttrNum) ||
+ !list_member_int(parentRefs, parentLocInfo->partAttrNum))
+ {
+ result = false;
+ break;
+ }
+
+ /* By being here, parent-child constraint can be shipped correctly */
+ break;
+
+ case LOCATOR_TYPE_RANGE:
+ case LOCATOR_TYPE_NONE:
+ case LOCATOR_TYPE_DISTRIBUTED:
+ case LOCATOR_TYPE_CUSTOM:
+ default:
+ /* Should not come here */
+ Assert(0);
+ }
+
+ return result;
+}
+
+/*
+ * pgxc_is_join_shippable
+ * The shippability of a JOIN is decided in the following steps
+ * 1. Are the JOIN conditions shippable?
+ * For an INNER JOIN it's possible to apply some of the conditions at the
+ * Datanodes and others at the coordinator. But for other JOINs, since the
+ * JOIN conditions decide which tuples on the OUTER side are appended with
+ * NULL columns from the INNER side, we need all the join conditions to be
+ * shippable for the join to be shippable.
+ * 2. Do the JOIN conditions have quals that will make it shippable?
+ * When both sides of the JOIN are replicated, the JOIN is shippable
+ * irrespective of the quals.
+ * INNER joins between a replicated and a distributed relation are shippable
+ * irrespective of the quals. An OUTER join between a replicated and a
+ * distributed relation is shippable if the distributed relation is the
+ * outer relation.
+ * All joins between hash/modulo distributed relations are shippable if they
+ * have an equi-join on the distribution column, such that the distribution
+ * columns have the same datatype and the same distribution strategy.
+ * 3. Are the datanodes where the joining relations exist compatible?
+ * Joins between replicated relations are shippable if both relations share
+ * a datanode. Joins between distributed relations are shippable if both
+ * relations are distributed on the same set of Datanodes. A join between a
+ * replicated and a distributed relation is shippable if the replicated
+ * relation is replicated on all nodes where the distributed relation is
+ * distributed.
+ *
+ * The first step is to be applied by the caller of this function.
+ */
+ExecNodes *
+pgxc_is_join_shippable(ExecNodes *inner_en, ExecNodes *outer_en, Relids in_relids,
+ Relids out_relids, JoinType jointype, List *join_quals,
+ List *rtables)
+{
+ bool merge_nodes = false;
+
+ /*
+ * If either of inner_en or outer_en is NULL, return NULL. We can't ship the
+ * join when either of the sides does not have datanodes to ship to.
+ */
+ if (!outer_en || !inner_en)
+ return NULL;
+ /*
+ * We only support reduction of INNER, LEFT [OUTER] and FULL [OUTER] joins.
+ * RIGHT [OUTER] join is converted to LEFT [OUTER] join during join tree
+ * deconstruction.
+ */
+ if (jointype != JOIN_INNER && jointype != JOIN_LEFT && jointype != JOIN_FULL)
+ return NULL;
+
+ /* If both sides are replicated or have a single node each, we ship any kind of JOIN */
+ if ((IsExecNodesReplicated(inner_en) && IsExecNodesReplicated(outer_en)) ||
+ (list_length(inner_en->nodeList) == 1 &&
+ list_length(outer_en->nodeList) == 1))
+ merge_nodes = true;
+
+ /* If both sides are distributed, ... */
+ else if (IsExecNodesColumnDistributed(inner_en) &&
+ IsExecNodesColumnDistributed(outer_en))
+ {
+ /*
+ * If two sides are distributed in the same manner by a value, with an
+ * equi-join on the distribution column and that condition
+ * is shippable, ship the join if node lists from both sides can be
+ * merged.
+ */
+ if (inner_en->baselocatortype == outer_en->baselocatortype &&
+ IsExecNodesDistributedByValue(inner_en))
+ {
+ Expr *equi_join_expr = pgxc_find_dist_equijoin_qual(in_relids,
+ out_relids, InvalidOid,
+ (Node *)join_quals, rtables);
+ if (equi_join_expr && pgxc_is_expr_shippable(equi_join_expr, NULL))
+ merge_nodes = true;
+ }
+ }
+ /*
+ * If outer side is distributed and inner side is replicated, we can ship
+ * LEFT OUTER and INNER join.
+ */
+ else if (IsExecNodesColumnDistributed(outer_en) &&
+ IsExecNodesReplicated(inner_en) &&
+ (jointype == JOIN_INNER || jointype == JOIN_LEFT))
+ merge_nodes = true;
+ /*
+ * If outer side is replicated and inner side is distributed, we can ship
+ * only for INNER join.
+ */
+ else if (IsExecNodesReplicated(outer_en) &&
+ IsExecNodesColumnDistributed(inner_en) &&
+ jointype == JOIN_INNER)
+ merge_nodes = true;
+ /*
+ * If the ExecNodes of inner and outer nodes can be merged, the JOIN is
+ * shippable
+ */
+ if (merge_nodes)
+ return pgxc_merge_exec_nodes(inner_en, outer_en);
+ else
+ return NULL;
+}
+
+static bool
+pgxc_targetlist_has_distcol(Query *query)
+{
+ RangeTblEntry *rte = rt_fetch(query->resultRelation, query->rtable);
+ RelationLocInfo *rel_loc_info;
+ ListCell *lc;
+ const char *distcol;
+
+ /* distribution columns apply only to plain relations */
+ if (rte->rtekind != RTE_RELATION ||
+ rte->relkind != RELKIND_RELATION)
+ return false;
+ rel_loc_info = GetRelationLocInfo(rte->relid);
+ if (!rel_loc_info)
+ return false;
+
+ distcol = GetRelationDistribColumn(rel_loc_info);
+ if (!distcol)
+ return false;
+
+ foreach(lc, query->targetList)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(lc);
+
+ if (tle->resjunk)
+ continue;
+ if (strcmp(tle->resname, distcol) == 0)
+ return true;
+ }
+ return false;
+}
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * execRemote.c
+ *
+ * Functions to execute commands on remote Datanodes
+ *
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/pgxc/pool/execRemote.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <time.h>
+#include "access/twophase.h"
+#include "access/gtm.h"
+#include "access/sysattr.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/relscan.h"
+#include "catalog/pg_type.h"
+#include "catalog/pgxc_node.h"
+#include "commands/prepare.h"
+#include "executor/executor.h"
+#include "gtm/gtm_c.h"
+#include "libpq/libpq.h"
+#include "miscadmin.h"
+#include "pgxc/execRemote.h"
+#include "tcop/tcopprot.h"
+#include "executor/nodeSubplan.h"
+#include "nodes/nodeFuncs.h"
+#include "pgstat.h"
+#include "nodes/nodes.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/var.h"
+#include "pgxc/copyops.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/poolmgr.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/pg_rusage.h"
+#include "utils/tuplesort.h"
+#include "utils/snapmgr.h"
+#include "utils/builtins.h"
+#include "pgxc/locator.h"
+#include "pgxc/pgxc.h"
+#include "parser/parse_type.h"
+#include "parser/parsetree.h"
+#include "pgxc/xc_maintenance_mode.h"
+
+/* Enforce the use of two-phase commit when temporary objects are used */
+bool EnforceTwoPhaseCommit = true;
+/*
+ * We do not want this timeout too long: when a query is terminating
+ * abnormally we just want to read in the already available data. If the
+ * datanode connection reaches a consistent state after that, we go through
+ * the normal clean up procedure: send down ABORT etc. If the datanode is
+ * not responding we signal the pooler to drop the connection.
+ * It is better to drop and recreate a datanode connection than to wait
+ * several seconds while it is being cleaned up when, for example,
+ * cancelling a query.
+ */
+#define END_QUERY_TIMEOUT 20
+
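+/* Transaction abort callback and its argument; see dbcleanup_info below */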
+typedef struct
+{
+ xact_callback function;
+ void *fparams;
+} abort_callback_type;
+
+/*
+ * Buffer size does not affect performance significantly, just do not allow
+ * the connection buffer to grow infinitely
+ */
+#define COPY_BUFFER_SIZE 8192
+#define PRIMARY_NODE_WRITEAHEAD 1024 * 1024
+
+/*
+ * Flag to track if a temporary object is accessed by the current transaction
+ */
+static bool temp_object_included = false;
+static abort_callback_type dbcleanup_info = { NULL, NULL };
+
+static int pgxc_node_begin(int conn_count, PGXCNodeHandle ** connections,
+ GlobalTransactionId gxid, bool need_tran_block,
+ bool readOnly, char node_type);
+
+static PGXCNodeAllHandles *get_exec_connections(RemoteQueryState *planstate,
+ ExecNodes *exec_nodes,
+ RemoteQueryExecType exec_type,
+ bool is_global_session);
+
+
+static bool pgxc_start_command_on_connection(PGXCNodeHandle *connection,
+ RemoteQueryState *remotestate, Snapshot snapshot);
+
+static char *pgxc_node_remote_prepare(char *prepareGID, bool localNode);
+static bool pgxc_node_remote_finish(char *prepareGID, bool commit,
+ char *nodestring, GlobalTransactionId gxid,
+ GlobalTransactionId prepare_gxid);
+static void pgxc_node_remote_commit(void);
+static void pgxc_node_remote_abort(void);
+static void pgxc_connections_cleanup(ResponseCombiner *combiner);
+
+static void pgxc_node_report_error(ResponseCombiner *combiner);
+
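+/*
+ * Remove the current connection from the combiner's array of active
+ * connections: move the last connection into its slot and shrink the count,
+ * or reset current_conn to 0 if the current one was the last.
+ */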
+#define REMOVE_CURR_CONN(combiner) \
+ if ((combiner)->current_conn < --((combiner)->conn_count)) \
+ { \
+ (combiner)->connections[(combiner)->current_conn] = \
+ (combiner)->connections[(combiner)->conn_count]; \
+ } \
+ else \
+ (combiner)->current_conn = 0
+
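+/* Histogram cap: transactions with more statements fall into the last bucket */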
+#define MAX_STATEMENTS_PER_TRAN 10
+
+/* Variables to collect statistics */
+static int total_transactions = 0;
+static int total_statements = 0;
+static int total_autocommit = 0;
+static int nonautocommit_2pc = 0;
+static int autocommit_2pc = 0;
+static int current_tran_statements = 0;
+static int *statements_per_transaction = NULL;
+static int *nodes_per_transaction = NULL;
+
+/*
+ * statistics collection: count a statement
+ */
+static void
+stat_statement(void)
+{
+ total_statements++;
+ current_tran_statements++;
+}
+
+/*
+ * To collect statistics: count a transaction
+ */
+static void
+stat_transaction(int node_count)
+{
+ total_transactions++;
+
+ if (!statements_per_transaction)
+ {
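+		/* Allocated lazily with plain malloc; per-backend stats live for the whole session */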
+ statements_per_transaction = (int *) malloc((MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int));
+ memset(statements_per_transaction, 0, (MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int));
+ }
+ if (current_tran_statements > MAX_STATEMENTS_PER_TRAN)
+ statements_per_transaction[MAX_STATEMENTS_PER_TRAN]++;
+ else
+ statements_per_transaction[current_tran_statements]++;
+ current_tran_statements = 0;
+ if (node_count > 0 && node_count <= NumDataNodes)
+ {
+ if (!nodes_per_transaction)
+ {
+ nodes_per_transaction = (int *) malloc(NumDataNodes * sizeof(int));
+ memset(nodes_per_transaction, 0, NumDataNodes * sizeof(int));
+ }
+ nodes_per_transaction[node_count - 1]++;
+ }
+}
+
+
+/*
+ * Output collected statistics to the log
+ */
+static void
+stat_log(void)
+{
+ elog(DEBUG1, "Total Transactions: %d Total Statements: %d", total_transactions, total_statements);
+ elog(DEBUG1, "Autocommit: %d 2PC for Autocommit: %d 2PC for non-Autocommit: %d",
+ total_autocommit, autocommit_2pc, nonautocommit_2pc);
+ if (total_transactions)
+ {
+ if (statements_per_transaction)
+ {
+ int i;
+
+ for (i = 0; i < MAX_STATEMENTS_PER_TRAN; i++)
+ elog(DEBUG1, "%d Statements per Transaction: %d (%d%%)",
+ i, statements_per_transaction[i], statements_per_transaction[i] * 100 / total_transactions);
+			/* The last bucket collects transactions that exceeded the cap */
+			elog(DEBUG1, "%d+ Statements per Transaction: %d (%d%%)",
+				MAX_STATEMENTS_PER_TRAN, statements_per_transaction[MAX_STATEMENTS_PER_TRAN], statements_per_transaction[MAX_STATEMENTS_PER_TRAN] * 100 / total_transactions);
+		}
+ if (nodes_per_transaction)
+ {
+ int i;
+
+ for (i = 0; i < NumDataNodes; i++)
+ elog(DEBUG1, "%d Nodes per Transaction: %d (%d%%)",
+ i + 1, nodes_per_transaction[i], nodes_per_transaction[i] * 100 / total_transactions);
+ }
+ }
+}
+
+
+/*
+ * Create a structure to store parameters needed to combine responses from
+ * multiple connections as well as state information
+ */
+void
+InitResponseCombiner(ResponseCombiner *combiner, int node_count,
+ CombineType combine_type)
+{
+ combiner->node_count = node_count;
+ combiner->connections = NULL;
+ combiner->conn_count = 0;
+ combiner->combine_type = combine_type;
+ combiner->command_complete_count = 0;
+ combiner->request_type = REQUEST_TYPE_NOT_DEFINED;
+ combiner->description_count = 0;
+ combiner->copy_in_count = 0;
+ combiner->copy_out_count = 0;
+ combiner->copy_file = NULL;
+ combiner->errorMessage = NULL;
+ combiner->errorDetail = NULL;
+ combiner->errorHint = NULL;
+ combiner->tuple_desc = NULL;
+ combiner->probing_primary = false;
+ combiner->returning_node = InvalidOid;
+ combiner->currentRow = NULL;
+ combiner->rowBuffer = NIL;
+ combiner->tapenodes = NULL;
+ combiner->merge_sort = false;
+ combiner->extended_query = false;
+ combiner->tapemarks = NULL;
+ combiner->tuplesortstate = NULL;
+ combiner->cursor = NULL;
+ combiner->update_cursor = NULL;
+ combiner->cursor_count = 0;
+ combiner->cursor_connections = NULL;
+ combiner->remoteCopyType = REMOTE_COPY_NONE;
+}
+
+
+/*
+ * Parse out row count from the command status response and convert it to integer
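+ * Only the last run of digits in the tag is kept: for "INSERT 0 25" the
+ * function sets *rowcount to 25 and returns 2, the number of digits parsed.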
+ */
+static int
+parse_row_count(const char *message, size_t len, uint64 *rowcount)
+{
+ int digits = 0;
+ int pos;
+
+ *rowcount = 0;
+ /* skip \0 string terminator */
+ for (pos = 0; pos < len - 1; pos++)
+ {
+ if (message[pos] >= '0' && message[pos] <= '9')
+ {
+ *rowcount = *rowcount * 10 + message[pos] - '0';
+ digits++;
+ }
+ else
+ {
+ *rowcount = 0;
+ digits = 0;
+ }
+ }
+ return digits;
+}
+
+/*
+ * Convert RowDescription message to a TupleDesc
+ */
+static TupleDesc
+create_tuple_desc(char *msg_body, size_t len)
+{
+ TupleDesc result;
+ int i, nattr;
+ uint16 n16;
+
+ /* get number of attributes */
+ memcpy(&n16, msg_body, 2);
+ nattr = ntohs(n16);
+ msg_body += 2;
+
+ result = CreateTemplateTupleDesc(nattr, false);
+
+ /* decode attributes */
+ for (i = 1; i <= nattr; i++)
+ {
+ AttrNumber attnum;
+ char *attname;
+ char *typname;
+ Oid oidtypeid;
+ int32 typemode, typmod;
+
+ attnum = (AttrNumber) i;
+
+ /* attribute name */
+ attname = msg_body;
+ msg_body += strlen(attname) + 1;
+
+ /* type name */
+ typname = msg_body;
+ msg_body += strlen(typname) + 1;
+
+ /* table OID, ignored */
+ msg_body += 4;
+
+ /* column no, ignored */
+ msg_body += 2;
+
+ /* data type OID, ignored */
+ msg_body += 4;
+
+ /* type len, ignored */
+ msg_body += 2;
+
+ /* type mod */
+ memcpy(&typemode, msg_body, 4);
+ typmod = ntohl(typemode);
+ msg_body += 4;
+
+ /* PGXCTODO text/binary flag? */
+ msg_body += 2;
+
+ /* Get the OID type and mode type from typename */
+ parseTypeString(typname, &oidtypeid, NULL, false);
+
+ TupleDescInitEntry(result, attnum, attname, oidtypeid, typmod, 0);
+ }
+ return result;
+}
+
+/*
+ * Handle CopyOutCommandComplete ('c') message from a Datanode connection
+ */
+static void
+HandleCopyOutComplete(ResponseCombiner *combiner)
+{
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return;
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ combiner->request_type = REQUEST_TYPE_COPY_OUT;
+ if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
+ /* Inconsistent responses */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the Datanodes for 'c' message, current request type %d", combiner->request_type)));
+ /* Just do nothing, close message is managed by the Coordinator */
+ combiner->copy_out_count++;
+}
+
+/*
+ * Handle CommandComplete ('C') message from a Datanode connection
+ */
+static void
+HandleCommandComplete(ResponseCombiner *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn)
+{
+ int digits = 0;
+ EState *estate = combiner->ss.ps.state;
+
+ /*
+	 * If we did not receive a description we expect a rowcount or OK response
+ */
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ combiner->request_type = REQUEST_TYPE_COMMAND;
+ /* Extract rowcount */
+ if (combiner->combine_type != COMBINE_TYPE_NONE && estate)
+ {
+ uint64 rowcount;
+ digits = parse_row_count(msg_body, len, &rowcount);
+ if (digits > 0)
+ {
+ /* Replicated write, make sure they are the same */
+ if (combiner->combine_type == COMBINE_TYPE_SAME)
+ {
+ if (combiner->command_complete_count)
+ {
+ /*
+					 * A replicated command may succeed on one node and fail
+					 * on another. An example is a replicated table referenced
+					 * by a foreign key constraint defined on a distributed
+					 * table. If a command deletes rows from the replicated
+					 * table they may be referenced on one Datanode but not on
+					 * another. So a replicated command on each Datanode
+					 * either affects the proper number of rows, or returns an
+					 * error. Here, if the combiner already got an error, we
+					 * allow that error to be reported instead of the scary
+					 * data corruption message.
+ */
+ if (combiner->errorMessage == NULL && rowcount != estate->es_processed)
+ /* There is a consistency issue in the database with the replicated table */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Write to replicated table returned different results from the Datanodes")));
+ }
+ else
+ /* first result */
+ estate->es_processed = rowcount;
+ }
+ else
+ estate->es_processed += rowcount;
+ }
+ else
+ combiner->combine_type = COMBINE_TYPE_NONE;
+ }
+
+	/* Do further processing only if response checking is enabled */
+ if (conn->ck_resp_rollback)
+ {
+ if (strcmp(msg_body, "ROLLBACK") == 0)
+ {
+ /*
+ * Subsequent clean up routine will be checking this flag
+ * to determine nodes where to send ROLLBACK PREPARED.
+ * On current node PREPARE has failed and the two-phase record
+ * does not exist, so clean this flag as if PREPARE was not sent
+ * to that node and avoid erroneous command.
+ */
+ conn->ck_resp_rollback = false;
+ /*
+ * Set the error, if none, to force throwing.
+ * If there is error already, it will be thrown anyway, do not add
+ * this potentially confusing message
+ */
+ if (combiner->errorMessage == NULL)
+ {
+ MemoryContext oldcontext = MemoryContextSwitchTo(ErrorContext);
+ combiner->errorMessage =
+ pstrdup("unexpected ROLLBACK from remote node");
+ MemoryContextSwitchTo(oldcontext);
+ /*
+ * ERRMSG_PRODUCER_ERROR
+				 * Messages with this code are replaced by later ones if any
+				 * are received, so if the node sends a more relevant error
+				 * message it will replace this one.
+ */
+ combiner->errorCode[0] = 'X';
+ combiner->errorCode[1] = 'X';
+ combiner->errorCode[2] = '0';
+ combiner->errorCode[3] = '1';
+ combiner->errorCode[4] = '0';
+ }
+ }
+ }
+ combiner->command_complete_count++;
+}
+
+/*
+ * Handle RowDescription ('T') message from a Datanode connection
+ */
+static bool
+HandleRowDescription(ResponseCombiner *combiner, char *msg_body, size_t len)
+{
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return false;
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ combiner->request_type = REQUEST_TYPE_QUERY;
+ if (combiner->request_type != REQUEST_TYPE_QUERY)
+ {
+ /* Inconsistent responses */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the Datanodes for 'T' message, current request type %d", combiner->request_type)));
+ }
+ /* Increment counter and check if it was first */
+ if (combiner->description_count++ == 0)
+ {
+ combiner->tuple_desc = create_tuple_desc(msg_body, len);
+ return true;
+ }
+ return false;
+}
+
+
+/*
+ * Handle CopyInResponse ('G') message from a Datanode connection
+ */
+static void
+HandleCopyIn(ResponseCombiner *combiner)
+{
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return;
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ combiner->request_type = REQUEST_TYPE_COPY_IN;
+ if (combiner->request_type != REQUEST_TYPE_COPY_IN)
+ {
+ /* Inconsistent responses */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the Datanodes for 'G' message, current request type %d", combiner->request_type)));
+ }
+ /*
+	 * The normal PG code will output a 'G' message when it runs in the
+	 * Coordinator, so do not proxy the message here, just count it.
+ */
+ combiner->copy_in_count++;
+}
+
+/*
+ * Handle CopyOutResponse ('H') message from a Datanode connection
+ */
+static void
+HandleCopyOut(ResponseCombiner *combiner)
+{
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return;
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ combiner->request_type = REQUEST_TYPE_COPY_OUT;
+ if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
+ {
+ /* Inconsistent responses */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the Datanodes for 'H' message, current request type %d", combiner->request_type)));
+ }
+ /*
+	 * The normal PG code will output an 'H' message when it runs in the
+	 * Coordinator, so do not proxy the message here, just count it.
+ */
+ combiner->copy_out_count++;
+}
+
+/*
+ * Handle CopyOutDataRow ('d') message from a Datanode connection
+ */
+static void
+HandleCopyDataRow(ResponseCombiner *combiner, char *msg_body, size_t len)
+{
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return;
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ combiner->request_type = REQUEST_TYPE_COPY_OUT;
+
+ /* Inconsistent responses */
+ if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the Datanodes for 'd' message, current request type %d", combiner->request_type)));
+
+ /* count the row */
+ combiner->processed++;
+
+ /* Output remote COPY operation to correct location */
+ switch (combiner->remoteCopyType)
+ {
+ case REMOTE_COPY_FILE:
+ /* Write data directly to file */
+ fwrite(msg_body, 1, len, combiner->copy_file);
+ break;
+ case REMOTE_COPY_STDOUT:
+ /* Send back data to client */
+ pq_putmessage('d', msg_body, len);
+ break;
+ case REMOTE_COPY_TUPLESTORE:
+ /*
+ * Do not store trailing \n character.
+ * When tuplestore data are loaded to a table it automatically
+ * inserts line ends.
+ */
+ tuplestore_putmessage(combiner->tuplestorestate, len-1, msg_body);
+ break;
+ case REMOTE_COPY_NONE:
+ default:
+ Assert(0); /* Should not happen */
+ }
+}
+
+/*
+ * Handle DataRow ('D') message from a Datanode connection
+ * The function returns true if data row is accepted and successfully stored
+ * within the combiner.
+ */
+static bool
+HandleDataRow(ResponseCombiner *combiner, char *msg_body, size_t len, Oid node)
+{
+ /* We expect previous message is consumed */
+ Assert(combiner->currentRow == NULL);
+
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return false;
+
+ if (combiner->request_type != REQUEST_TYPE_QUERY)
+ {
+ /* Inconsistent responses */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the data nodes for 'D' message, current request type %d", combiner->request_type)));
+ }
+
+ /*
+	 * If we already got an error, ignore incoming data rows from other nodes.
+	 * Still we want to continue reading until we get CommandComplete.
+ */
+ if (combiner->errorMessage)
+ return false;
+
+ /*
+ * Replicated INSERT/UPDATE/DELETE with RETURNING: receive only tuples
+ * from one node, skip others as duplicates
+ */
+ if (combiner->combine_type == COMBINE_TYPE_SAME)
+ {
+		/* Do not return rows when probing the primary, instead return them
+		 * when reading from the first normal node. This saves some CPU and
+		 * traffic in case the probing fails.
+ */
+ if (combiner->probing_primary)
+ return false;
+ if (OidIsValid(combiner->returning_node))
+ {
+ if (combiner->returning_node != node)
+ return false;
+ }
+ else
+ combiner->returning_node = node;
+ }
+
+ /*
+ * We are copying message because it points into connection buffer, and
+ * will be overwritten on next socket read
+ */
+ combiner->currentRow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + len);
+ memcpy(combiner->currentRow->msg, msg_body, len);
+ combiner->currentRow->msglen = len;
+ combiner->currentRow->msgnode = node;
+
+ return true;
+}
+
+/*
+ * Handle ErrorResponse ('E') message from a Datanode connection
+ */
+static void
+HandleError(ResponseCombiner *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn)
+{
+ /* parse error message */
+ char *code = NULL;
+ char *message = NULL;
+ char *detail = NULL;
+ char *hint = NULL;
+ int offset = 0;
+
+ /*
+	 * Scan the fields until we point to the terminating \0: each field is a
+	 * one-byte type code followed by a null-terminated string
+ */
+ while (offset + 1 < len)
+ {
+ /* pointer to the field message */
+ char *str = msg_body + offset + 1;
+
+ switch (msg_body[offset])
+ {
+ case 'C': /* code */
+ code = str;
+ break;
+ case 'M': /* message */
+ message = str;
+ break;
+ case 'D': /* details */
+ detail = str;
+ break;
+
+ case 'H': /* hint */
+ hint = str;
+ break;
+
+ /* Fields not yet in use */
+ case 'S': /* severity */
+ case 'R': /* routine */
+ case 'P': /* position string */
+			case 'p': /* internal position */
+			case 'q': /* internal query */
+ case 'W': /* where */
+ case 'F': /* file */
+ case 'L': /* line */
+ default:
+ break;
+ }
+
+ /* code, message and \0 */
+ offset += strlen(str) + 2;
+ }
+
+ /*
+	 * We may have special handling for some errors; the default handling is
+	 * to throw out an error with the same message. We can not ereport
+	 * immediately because we should read from this and other connections
+	 * until ReadyForQuery is received, so we just store the error message.
+	 * If multiple connections return errors only the first one is reported.
+	 *
+	 * A producer error may be hiding the primary error, so if the previously
+	 * received error is a producer error, allow it to be overwritten.
+ */
+ if (combiner->errorMessage == NULL ||
+ MAKE_SQLSTATE(combiner->errorCode[0], combiner->errorCode[1],
+ combiner->errorCode[2], combiner->errorCode[3],
+ combiner->errorCode[4]) == ERRCODE_PRODUCER_ERROR)
+ {
+ MemoryContext oldcontext = MemoryContextSwitchTo(ErrorContext);
+ combiner->errorMessage = pstrdup(message);
+ /* Error Code is exactly 5 significant bytes */
+ if (code)
+ memcpy(combiner->errorCode, code, 5);
+ if (detail)
+ combiner->errorDetail = pstrdup(detail);
+ if (hint)
+ combiner->errorHint = pstrdup(hint);
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ /*
+ * If the PREPARE TRANSACTION command fails for whatever reason, we don't
+ * want to send down ROLLBACK PREPARED to this node. Otherwise, it may end
+ * up rolling back an unrelated prepared transaction with the same GID as
+ * used by this transaction
+ */
+ if (conn->ck_resp_rollback)
+ conn->ck_resp_rollback = false;
+
+ /*
+	 * If the Datanode has sent ErrorResponse it will never send CommandComplete.
+ * Increment the counter to prevent endless waiting for it.
+ */
+ combiner->command_complete_count++;
+}
+
+/*
+ * HandleCmdComplete -
+ * combine deparsed sql statements execution results
+ *
+ * Input parameters:
+ * commandType is dml command type
+ * combineTag is used to combine the completion result
+ * msg_body is execution result needed to combine
+ * len is msg_body size
+ */
+void
+HandleCmdComplete(CmdType commandType, CombineTag *combine,
+ const char *msg_body, size_t len)
+{
+ int digits = 0;
+ uint64 originrowcount = 0;
+ uint64 rowcount = 0;
+ uint64 total = 0;
+
+ if (msg_body == NULL)
+ return;
+
+ /* if there's nothing in combine, just copy the msg_body */
+ if (strlen(combine->data) == 0)
+ {
+ strcpy(combine->data, msg_body);
+ combine->cmdType = commandType;
+ return;
+ }
+ else
+ {
+		/* the command types conflict, there is nothing to combine */
+ if (combine->cmdType != commandType)
+ return;
+
+ /* get the processed row number from msg_body */
+ digits = parse_row_count(msg_body, len + 1, &rowcount);
+ elog(DEBUG1, "digits is %d\n", digits);
+ Assert(digits >= 0);
+
+ /* no need to combine */
+ if (digits == 0)
+ return;
+
+ /* combine the processed row number */
+ parse_row_count(combine->data, strlen(combine->data) + 1, &originrowcount);
+ elog(DEBUG1, "originrowcount is %lu, rowcount is %lu\n", originrowcount, rowcount);
+ total = originrowcount + rowcount;
+
+ }
+
+ /* output command completion tag */
+ switch (commandType)
+ {
+ case CMD_SELECT:
+ strcpy(combine->data, "SELECT");
+ break;
+ case CMD_INSERT:
+ snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
+ "INSERT %u %lu", 0, total);
+ break;
+ case CMD_UPDATE:
+ snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
+ "UPDATE %lu", total);
+ break;
+ case CMD_DELETE:
+ snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
+ "DELETE %lu", total);
+ break;
+ default:
+ strcpy(combine->data, "");
+ break;
+ }
+
+}
+
+/*
+ * HandleDatanodeCommandId ('M') message from a Datanode connection
+ */
+static void
+HandleDatanodeCommandId(ResponseCombiner *combiner, char *msg_body, size_t len)
+{
+ uint32 n32;
+ CommandId cid;
+
+ Assert(msg_body != NULL);
+	Assert(len >= 4);	/* the message carries a 4-byte CommandId */
+
+ /* Get the command Id */
+ memcpy(&n32, &msg_body[0], 4);
+ cid = ntohl(n32);
+
+ /* If received command Id is higher than current one, set it to a new value */
+ if (cid > GetReceivedCommandId())
+ SetReceivedCommandId(cid);
+}
+
+/*
+ * Record waited-for XIDs received from the remote nodes into the transaction
+ * state
+ */
+static void
+HandleWaitXids(char *msg_body, size_t len)
+{
+ int xid_count;
+ uint32 n32;
+ int cur;
+ int i;
+
+ /* Get the xid count */
+ xid_count = len / sizeof (TransactionId);
+
+ cur = 0;
+ for (i = 0; i < xid_count; i++)
+ {
+ Assert(cur < len);
+ memcpy(&n32, &msg_body[cur], sizeof (TransactionId));
+ cur = cur + sizeof (TransactionId);
+ TransactionRecordXidWait(ntohl(n32));
+ }
+}
+
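+/*
+ * Handle GlobalTransactionId ('x') message: a global transaction id was
+ * assigned remotely, record it as the top transaction id
+ */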
+static void
+HandleGlobalTransactionId(char *msg_body, size_t len)
+{
+ GlobalTransactionId xid;
+
+ Assert(len == sizeof (GlobalTransactionId));
+ memcpy(&xid, &msg_body[0], sizeof (GlobalTransactionId));
+
+ SetTopTransactionId(xid);
+}
+
+/*
+ * Examine the specified combiner state and determine if command was completed
+ * successfully
+ */
+static bool
+validate_combiner(ResponseCombiner *combiner)
+{
+ /* There was error message while combining */
+ if (combiner->errorMessage)
+ return false;
+ /* Check if state is defined */
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ return false;
+
+ /* Check all nodes completed */
+ if ((combiner->request_type == REQUEST_TYPE_COMMAND
+ || combiner->request_type == REQUEST_TYPE_QUERY)
+ && combiner->command_complete_count != combiner->node_count)
+ return false;
+
+ /* Check count of description responses */
+ if (combiner->request_type == REQUEST_TYPE_QUERY
+ && combiner->description_count != combiner->node_count)
+ return false;
+
+ /* Check count of copy-in responses */
+ if (combiner->request_type == REQUEST_TYPE_COPY_IN
+ && combiner->copy_in_count != combiner->node_count)
+ return false;
+
+ /* Check count of copy-out responses */
+ if (combiner->request_type == REQUEST_TYPE_COPY_OUT
+ && combiner->copy_out_count != combiner->node_count)
+ return false;
+
+ /* Add other checks here as needed */
+
+ /* All is good if we are here */
+ return true;
+}
+
+/*
+ * Close the combiner and free the allocated memory once it is no longer needed
+ */
+void
+CloseCombiner(ResponseCombiner *combiner)
+{
+ if (combiner->connections)
+ pfree(combiner->connections);
+ if (combiner->tuple_desc)
+ FreeTupleDesc(combiner->tuple_desc);
+ if (combiner->errorMessage)
+ pfree(combiner->errorMessage);
+ if (combiner->errorDetail)
+ pfree(combiner->errorDetail);
+ if (combiner->errorHint)
+ pfree(combiner->errorHint);
+ if (combiner->cursor_connections)
+ pfree(combiner->cursor_connections);
+ if (combiner->tapenodes)
+ pfree(combiner->tapenodes);
+ if (combiner->tapemarks)
+ pfree(combiner->tapemarks);
+}
+
+/*
+ * Validate combiner and release storage freeing allocated memory
+ */
+static bool
+ValidateAndCloseCombiner(ResponseCombiner *combiner)
+{
+ bool valid = validate_combiner(combiner);
+
+ CloseCombiner(combiner);
+
+ return valid;
+}
+
+/*
+ * Multiple steps can share the same Datanode connection, when the executor
+ * is running a multi-step query or the client is running multiple queries
+ * using the Extended Query Protocol. After returning the next tuple the
+ * ExecRemoteQuery function passes execution control to the executor, which
+ * may then give it to the same RemoteQuery or to a different one. It is
+ * possible that before returning a tuple the function does not read all
+ * Datanode responses. In this case pending responses should be read in the
+ * context of the original RemoteQueryState till the ReadyForQuery message,
+ * and data rows should be stored (buffered) to be available when a fetch
+ * from that RemoteQueryState is requested again.
+ * The BufferConnection function does the job.
+ * If a RemoteQuery is going to use a connection it should check the
+ * connection state. DN_CONNECTION_STATE_QUERY indicates the query has data
+ * to read and the combiner points to the original RemoteQueryState. If the
+ * combiner differs from "this" one, the connection should be buffered.
+ */
+void
+BufferConnection(PGXCNodeHandle *conn)
+{
+ ResponseCombiner *combiner = conn->combiner;
+ MemoryContext oldcontext;
+
+ if (combiner == NULL || conn->state != DN_CONNECTION_STATE_QUERY)
+ return;
+
+ elog(DEBUG2, "Buffer connection %u to step %s", conn->nodeoid, combiner->cursor);
+
+ /*
+ * When BufferConnection is invoked CurrentContext is related to other
+ * portal, which is trying to control the connection.
+ * TODO See if we can find better context to switch to
+ */
+ oldcontext = MemoryContextSwitchTo(combiner->ss.ps.ps_ResultTupleSlot->tts_mcxt);
+
+ /* Verify the connection is in use by the combiner */
+ combiner->current_conn = 0;
+ while (combiner->current_conn < combiner->conn_count)
+ {
+ if (combiner->connections[combiner->current_conn] == conn)
+ break;
+ combiner->current_conn++;
+ }
+ Assert(combiner->current_conn < combiner->conn_count);
+
+ if (combiner->tapemarks == NULL)
+ combiner->tapemarks = (ListCell**) palloc0(combiner->conn_count * sizeof(ListCell*));
+
+ /*
+	 * If the bookmark for the current tape is not set, either the first row
+	 * in the buffer is from the current tape or there are no rows from this
+	 * tape in the buffer at all. So if the first row is not from the current
+	 * connection, bookmark the last cell in the list.
+ */
+ if (combiner->tapemarks[combiner->current_conn] == NULL &&
+ list_length(combiner->rowBuffer) > 0)
+ {
+ RemoteDataRow dataRow = (RemoteDataRow) linitial(combiner->rowBuffer);
+ if (dataRow->msgnode != conn->nodeoid)
+ combiner->tapemarks[combiner->current_conn] = list_tail(combiner->rowBuffer);
+ }
+
+ /*
+	 * Buffer data rows until the data node returns the number of rows
+	 * specified by the fetch_size parameter of the last Execute message
+	 * (PortalSuspended message) or the end of the result set is reached
+	 * (CommandComplete message)
+ */
+ while (true)
+ {
+ int res;
+
+ /* Move to buffer currentRow (received from the data node) */
+ if (combiner->currentRow)
+ {
+ combiner->rowBuffer = lappend(combiner->rowBuffer,
+ combiner->currentRow);
+ combiner->currentRow = NULL;
+ }
+
+ res = handle_response(conn, combiner);
+ /*
+		 * If the response message is a DataRow it will be handled on the
+		 * next iteration.
+		 * PortalSuspended will cause a connection state change and break the
+		 * loop. The same is true for CommandComplete, but we need additional
+		 * handling - remove the connection from the list of active
+		 * connections. We may also need to add error response handling.
+ */
+
+ /* Most often result check first */
+ if (res == RESPONSE_DATAROW)
+ {
+ /*
+ * The row is in the combiner->currentRow, on next iteration it will
+ * be moved to the buffer
+ */
+ continue;
+ }
+
+ /* incomplete message, read more */
+ if (res == RESPONSE_EOF)
+ {
+ if (pgxc_node_receive(1, &conn, NULL))
+ {
+ conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
+ add_error_message(conn, "Failed to fetch from data node");
+ }
+ }
+
+ /*
+ * End of result set is reached, so either set the pointer to the
+ * connection to NULL (combiner with sort) or remove it from the list
+ * (combiner without sort)
+ */
+ else if (res == RESPONSE_COMPLETE)
+ {
+ /*
+			 * If the combiner is doing a merge sort we should set the
+			 * reference to the current connection to NULL in the array,
+			 * indicating the end of the tape is reached. FetchTuple will try
+			 * to access the buffer first anyway.
+			 * Since we remove that reference we can no longer determine which
+			 * node number this connection had, but we need this info to find
+			 * the proper tuple in the buffer when doing a merge sort. So
+			 * store the node number in a special array.
+			 * NB: We can not test combiner->tuplesortstate here: a connection
+			 * may require buffering inside tuplesort_begin_merge - while
+			 * pre-reading rows from the tapes, one of the tapes may be the
+			 * local connection with a RemoteSubplan in the tree. The
+			 * combiner->tuplesortstate is set only after
+			 * tuplesort_begin_merge returns.
+ */
+ if (combiner->merge_sort)
+ {
+ combiner->connections[combiner->current_conn] = NULL;
+ if (combiner->tapenodes == NULL)
+ combiner->tapenodes = (Oid *)
+ palloc0(combiner->conn_count * sizeof(Oid));
+ combiner->tapenodes[combiner->current_conn] = conn->nodeoid;
+ }
+ else
+ {
+ /* Remove current connection, move last in-place, adjust current_conn */
+ if (combiner->current_conn < --combiner->conn_count)
+ combiner->connections[combiner->current_conn] = combiner->connections[combiner->conn_count];
+ else
+ combiner->current_conn = 0;
+ }
+ /*
+			 * If the combiner runs the Simple Query Protocol we need to read
+			 * in ReadyForQuery. In case of the Extended Query Protocol it is
+			 * not sent and we should quit.
+ */
+ if (combiner->extended_query)
+ break;
+ }
+ else if (res == RESPONSE_ERROR)
+ {
+ if (combiner->extended_query)
+ {
+ /*
+ * Need to sync connection to enable receiving commands
+ * by the datanode
+ */
+ if (pgxc_node_send_sync(conn) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to sync msg to node %u", conn->nodeoid)));
+ }
+ }
+ }
+ else if (res == RESPONSE_SUSPENDED || res == RESPONSE_READY)
+ {
+ /* Now it is OK to quit */
+ break;
+ }
+ }
+ Assert(conn->state != DN_CONNECTION_STATE_QUERY);
+ MemoryContextSwitchTo(oldcontext);
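+	/* Everything is buffered now, detach the connection from the combiner */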
+ conn->combiner = NULL;
+}
+
+/*
+ * copy the datarow from combiner to the given slot, in the slot's memory
+ * context
+ */
+static void
+CopyDataRowTupleToSlot(ResponseCombiner *combiner, TupleTableSlot *slot)
+{
+ RemoteDataRow datarow;
+ MemoryContext oldcontext;
+ oldcontext = MemoryContextSwitchTo(slot->tts_mcxt);
+ datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + combiner->currentRow->msglen);
+ datarow->msgnode = combiner->currentRow->msgnode;
+ datarow->msglen = combiner->currentRow->msglen;
+ memcpy(datarow->msg, combiner->currentRow->msg, datarow->msglen);
+ ExecStoreDataRowTuple(datarow, slot, true);
+ pfree(combiner->currentRow);
+ combiner->currentRow = NULL;
+ MemoryContextSwitchTo(oldcontext);
+}
+
+
+/*
+ * FetchTuple
+ *
+ * Get the next tuple from one of the datanode connections.
+ * The connections should be in combiner->connections; if a "local" dummy
+ * connection is present it should be the last active connection in the
+ * array.
+ * If the combiner is set up to perform a merge sort, the function returns a
+ * tuple from the connection defined by combiner->current_conn, or a NULL
+ * slot if no more tuples are available from that connection. Otherwise it
+ * returns a tuple from any connection, or a NULL slot if no connections
+ * remain.
+ * The function looks into combiner->rowBuffer before accessing a connection
+ * and returns a tuple from there if found.
+ * The function may wait while more data arrives from the data nodes. If
+ * there is a locally executed subplan, it advances it and buffers the
+ * resulting rows instead of waiting.
+ */
+TupleTableSlot *
+FetchTuple(ResponseCombiner *combiner)
+{
+ PGXCNodeHandle *conn;
+ TupleTableSlot *slot;
+	Oid nodeOid = InvalidOid;
+
+ /*
+	 * Case when we run a local subplan.
+	 * We have no remote connections, so just get the local tuple and return it
+ */
+ if (outerPlanState(combiner))
+ {
+ RemoteSubplanState *planstate = (RemoteSubplanState *) combiner;
+ RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
+ /* Advance subplan in a loop until we have something to return */
+ for (;;)
+ {
+ Datum value = (Datum) 0;
+ bool isnull;
+ int numnodes;
+ int i;
+
+ slot = ExecProcNode(outerPlanState(combiner));
+ /* If locator is not defined deliver all the results */
+ if (planstate->locator == NULL)
+ return slot;
+
+ /*
+			 * If a NULL tuple is returned we are done with the subplan;
+			 * finish it up and return NULL
+ */
+ if (TupIsNull(slot))
+ return NULL;
+
+ /* Get partitioning value if defined */
+ if (plan->distributionKey != InvalidAttrNumber)
+ value = slot_getattr(slot, plan->distributionKey, &isnull);
+
+ /* Determine target nodes */
+ numnodes = GET_NODES(planstate->locator, value, isnull, NULL);
+ for (i = 0; i < numnodes; i++)
+ {
+				/* If the current node is among the targets, deliver the tuple locally */
+ if (planstate->dest_nodes[i] == PGXCNodeId-1)
+ return slot;
+ }
+ }
+ }
+
+ /*
+ * Get current connection
+ */
+ if (combiner->conn_count > combiner->current_conn)
+ conn = combiner->connections[combiner->current_conn];
+ else
+ conn = NULL;
+
+ /*
+ * If doing merge sort determine the node number.
+ * It may be needed to get buffered row.
+ */
+ if (combiner->merge_sort)
+ {
+ Assert(conn || combiner->tapenodes);
+ nodeOid = conn ? conn->nodeoid :
+ combiner->tapenodes[combiner->current_conn];
+ Assert(OidIsValid(nodeOid));
+ }
+
+ /*
+ * First look into the row buffer.
+	 * When we are performing a merge sort we need to take the buffered record
+	 * that came from the connection marked as "current". Otherwise take the
+	 * first one.
+ */
+ if (list_length(combiner->rowBuffer) > 0)
+ {
+ RemoteDataRow dataRow;
+
+ Assert(combiner->currentRow == NULL);
+
+ if (combiner->merge_sort)
+ {
+ ListCell *lc;
+ ListCell *prev;
+
+ elog(DEBUG1, "Getting buffered tuple from node %x", nodeOid);
+
+ prev = combiner->tapemarks[combiner->current_conn];
+ if (prev)
+ {
+ /*
+ * Start looking through the list from the bookmark.
+				 * Probably the first cell we check contains a row from the
+				 * needed node. Otherwise continue scanning until we encounter
+				 * one, advancing the prev pointer as well.
+ */
+ while((lc = lnext(prev)) != NULL)
+ {
+ dataRow = (RemoteDataRow) lfirst(lc);
+ if (dataRow->msgnode == nodeOid)
+ {
+ combiner->currentRow = dataRow;
+ break;
+ }
+ prev = lc;
+ }
+ }
+ else
+ {
+ /*
+				 * Either the needed row is first in the buffer, or there is no such row
+ */
+ lc = list_head(combiner->rowBuffer);
+ dataRow = (RemoteDataRow) lfirst(lc);
+ if (dataRow->msgnode == nodeOid)
+ combiner->currentRow = dataRow;
+ else
+ lc = NULL;
+ }
+ if (lc)
+ {
+ /*
+				 * Delete the cell from the buffer. Before we delete we must
+				 * check the bookmarks, in case the cell is a bookmark for any
+				 * tape. If that is the case we are deleting the last row of
+				 * the current block from the current tape. That tape should
+				 * get a bookmark like the current one, and the current
+				 * bookmark will be advanced when we read the tape once again.
+ */
+ int i;
+ for (i = 0; i < combiner->conn_count; i++)
+ {
+ if (combiner->tapemarks[i] == lc)
+ combiner->tapemarks[i] = prev;
+ }
+ elog(DEBUG1, "Found buffered tuple from node %x", nodeOid);
+ combiner->rowBuffer = list_delete_cell(combiner->rowBuffer,
+ lc, prev);
+ }
+ elog(DEBUG1, "Update tapemark");
+ combiner->tapemarks[combiner->current_conn] = prev;
+ }
+ else
+ {
+ dataRow = (RemoteDataRow) linitial(combiner->rowBuffer);
+ combiner->currentRow = dataRow;
+ combiner->rowBuffer = list_delete_first(combiner->rowBuffer);
+ }
+ }
+
+	/* If we have a data row message in the currentRow slot, and it is from a proper
+ * node, consume it. */
+ if (combiner->currentRow)
+ {
+ Assert(!combiner->merge_sort ||
+ combiner->currentRow->msgnode == nodeOid);
+ slot = combiner->ss.ps.ps_ResultTupleSlot;
+ CopyDataRowTupleToSlot(combiner, slot);
+ return slot;
+ }
+
+ while (conn)
+ {
+ int res;
+
+ /* Going to use a connection, buffer it if needed */
+ CHECK_OWNERSHIP(conn, combiner);
+
+ /*
+ * If current connection is idle it means portal on the data node is
+ * suspended. Request more and try to get it
+ */
+ if (combiner->extended_query &&
+ conn->state == DN_CONNECTION_STATE_IDLE)
+ {
+ /*
+			 * We do not allow suspending while querying the primary node, so
+			 * this can only mean the current node is a secondary and the
+			 * subplan was not executed there yet. Return and go on with the
+			 * second phase.
+ */
+ if (combiner->probing_primary)
+ {
+ return NULL;
+ }
+
+ if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
+ }
+
+ if (pgxc_node_send_flush(conn) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Failed to flush cursor '%s' on node %u", combiner->cursor, conn->nodeoid)));
+ }
+
+ if (pgxc_node_receive(1, &conn, NULL))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Failed to receive data from node %u for cursor '%s'", conn->nodeoid, combiner->cursor)));
+ }
+ }
+
+ /* read messages */
+ res = handle_response(conn, combiner);
+ if (res == RESPONSE_DATAROW)
+ {
+ slot = combiner->ss.ps.ps_ResultTupleSlot;
+ CopyDataRowTupleToSlot(combiner, slot);
+ return slot;
+ }
+ else if (res == RESPONSE_EOF)
+ {
+ /* incomplete message, read more */
+ if (pgxc_node_receive(1, &conn, NULL))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to receive more data from data node %u", conn->nodeoid)));
+ continue;
+ }
+ else if (res == RESPONSE_SUSPENDED)
+ {
+ /*
+ * If we are doing merge sort or probing primary node we should
+ * remain on the same node, so query next portion immediately.
+ * Otherwise leave node suspended and fetch lazily.
+ */
+ if (combiner->merge_sort || combiner->probing_primary)
+ {
+ if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
+ if (pgxc_node_send_flush(conn) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+							(errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Failed to flush cursor '%s' on node %u", combiner->cursor, conn->nodeoid)));
+ if (pgxc_node_receive(1, &conn, NULL))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Failed to receive data from node %u for cursor '%s'", conn->nodeoid, combiner->cursor)));
+ continue;
+ }
+
+ /*
+			 * Tell the node to fetch data in the background; on the next loop,
+			 * when we call pgxc_node_receive, the data is already there and we
+			 * can run faster
+			 */
+ if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
+ }
+
+ if (pgxc_node_send_flush(conn) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to flush cursor '%s' on node %u", combiner->cursor, conn->nodeoid)));
+ }
+
+ if (++combiner->current_conn >= combiner->conn_count)
+ combiner->current_conn = 0;
+ conn = combiner->connections[combiner->current_conn];
+ }
+ else if (res == RESPONSE_COMPLETE)
+ {
+ /*
+ * In case of Simple Query Protocol we should receive ReadyForQuery
+ * before removing connection from the list. In case of Extended
+ * Query Protocol we may remove connection right away.
+ */
+ if (combiner->extended_query)
+ {
+ /* If we are doing merge sort clean current connection and return
+ * NULL, otherwise remove current connection, move last in-place,
+ * adjust current_conn and continue if it is not last connection */
+ if (combiner->merge_sort)
+ {
+ combiner->connections[combiner->current_conn] = NULL;
+ return NULL;
+ }
+ REMOVE_CURR_CONN(combiner);
+ if (combiner->conn_count > 0)
+ conn = combiner->connections[combiner->current_conn];
+ else
+ return NULL;
+ }
+ }
+ else if (res == RESPONSE_ERROR)
+ {
+ /*
+ * If doing Extended Query Protocol we need to sync connection,
+ * otherwise subsequent commands will be ignored.
+ */
+ if (combiner->extended_query)
+ {
+ if (pgxc_node_send_sync(conn) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to sync msg to node %u", conn->nodeoid)));
+ }
+ /*
+ * Do not wait for response from primary, it needs to wait
+ * for other nodes to respond. Instead go ahead and send query to
+ * other nodes. It will fail there, but we can continue with
+ * normal cleanup.
+ */
+ if (combiner->probing_primary)
+ {
+ REMOVE_CURR_CONN(combiner);
+ return NULL;
+ }
+ }
+ else if (res == RESPONSE_READY)
+ {
+ /* If we are doing merge sort clean current connection and return
+ * NULL, otherwise remove current connection, move last in-place,
+ * adjust current_conn and continue if it is not last connection */
+ if (combiner->merge_sort)
+ {
+ combiner->connections[combiner->current_conn] = NULL;
+ return NULL;
+ }
+ REMOVE_CURR_CONN(combiner);
+ if (combiner->conn_count > 0)
+ conn = combiner->connections[combiner->current_conn];
+ else
+ return NULL;
+ }
+ else if (res == RESPONSE_TUPDESC)
+ {
+ ExecSetSlotDescriptor(combiner->ss.ps.ps_ResultTupleSlot,
+ combiner->tuple_desc);
+			/* Now the slot is responsible for freeing the descriptor */
+ combiner->tuple_desc = NULL;
+ }
+ else if (res == RESPONSE_ASSIGN_GXID)
+ {
+ /* Do nothing. It must have been handled in handle_response() */
+ }
+ else
+ {
+			/* Can not get here? */
+ Assert(false);
+ }
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Handle responses from the Datanode connections
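+ * Returns 0 when all tracked connections are done, or EOF on receive failure.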
+ */
+static int
+pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections,
+ struct timeval * timeout, ResponseCombiner *combiner)
+{
+ int count = conn_count;
+ PGXCNodeHandle *to_receive[conn_count];
+
+ /* make a copy of the pointers to the connections */
+ memcpy(to_receive, connections, conn_count * sizeof(PGXCNodeHandle *));
+
+ /*
+ * Read results.
+ * Note we try and read from Datanode connections even if there is an error on one,
+ * so as to avoid reading incorrect results on the next statement.
+	 * Other safeguards exist to avoid this, however.
+ */
+ while (count > 0)
+ {
+ int i = 0;
+
+ if (pgxc_node_receive(count, to_receive, timeout))
+ return EOF;
+ while (i < count)
+ {
+ int result = handle_response(to_receive[i], combiner);
+ switch (result)
+ {
+ case RESPONSE_EOF: /* have something to read, keep receiving */
+ i++;
+ break;
+ case RESPONSE_COMPLETE:
+ if (to_receive[i]->state != DN_CONNECTION_STATE_ERROR_FATAL)
+ /* Continue read until ReadyForQuery */
+ break;
+ /* fallthru */
+ case RESPONSE_READY:
+ /* fallthru */
+ case RESPONSE_COPY:
+ /* Handling is done, do not track this connection */
+ count--;
+ /* Move last connection in place */
+ if (i < count)
+ to_receive[i] = to_receive[count];
+ break;
+ case RESPONSE_ERROR:
+ /* no handling needed, just wait for ReadyForQuery */
+ break;
+
+ case RESPONSE_WAITXIDS:
+ break;
+
+ case RESPONSE_ASSIGN_GXID:
+ break;
+
+ default:
+ /* Inconsistent responses */
+ add_error_message(to_receive[i], "Unexpected response from the Datanodes");
+ elog(ERROR, "Unexpected response from the Datanodes, result = %d, request type %d", result, combiner->request_type);
+ /* Stop tracking and move last connection in place */
+ count--;
+ if (i < count)
+ to_receive[i] = to_receive[count];
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Read the next message from the connection and update the combiner
+ * and connection state accordingly.
+ * If we are in an error state we just consume the messages and do not proxy.
+ * Long term, we should look into cancelling executing statements
+ * and closing the connections.
+ * The return value tells the caller which state needs to be handled.
+ * Return values:
+ * RESPONSE_EOF - need to receive more data for the connection
+ * RESPONSE_READY - got ReadyForQuery
+ * RESPONSE_COMPLETE - done with the connection, but not yet ready for query.
+ * Also this result is output in case of error
+ * RESPONSE_SUSPENDED - got PortalSuspended
+ * RESPONSE_TUPDESC - got tuple description
+ * RESPONSE_DATAROW - got data row
+ * RESPONSE_COPY - got copy response
+ * RESPONSE_BARRIER_OK - barrier command completed successfully
+ * RESPONSE_ERROR - got ErrorResponse, stored in the combiner
+ * RESPONSE_WAITXIDS - got list of XIDs to wait for
+ * RESPONSE_ASSIGN_GXID - got global transaction id assigned by a remote node
+ */
+int
+handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner)
+{
+ char *msg;
+ int msg_len;
+ char msg_type;
+
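+	/* Consume buffered messages until one yields a definite response code */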
+ for (;;)
+ {
+ /*
+ * If we are in the process of shutting down, we
+ * may be rolling back, and the buffer may contain other messages.
+ * We want to avoid a procarray exception
+ * as well as an error stack overflow.
+ */
+ if (proc_exit_inprogress)
+ conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
+
+ /*
+		 * Don't read from the connection if there is a fatal error.
+		 * We still return RESPONSE_COMPLETE, not RESPONSE_ERROR, since
+		 * handling of RESPONSE_ERROR assumes sending a SYNC message, but
+		 * the DN_CONNECTION_STATE_ERROR_FATAL state indicates the connection
+		 * is not usable.
+ */
+ if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
+ return RESPONSE_COMPLETE;
+
+ /* No data available, exit */
+ if (!HAS_MESSAGE_BUFFERED(conn))
+ return RESPONSE_EOF;
+
+ Assert(conn->combiner == combiner || conn->combiner == NULL);
+
+ /* TODO handle other possible responses */
+ msg_type = get_message(conn, &msg_len, &msg);
+ switch (msg_type)
+ {
+ case '\0': /* Not enough data in the buffer */
+ return RESPONSE_EOF;
+ case 'c': /* CopyToCommandComplete */
+ HandleCopyOutComplete(combiner);
+ break;
+ case 'C': /* CommandComplete */
+ HandleCommandComplete(combiner, msg, msg_len, conn);
+ conn->combiner = NULL;
+ if (conn->state == DN_CONNECTION_STATE_QUERY)
+ conn->state = DN_CONNECTION_STATE_IDLE;
+ return RESPONSE_COMPLETE;
+ case 'T': /* RowDescription */
+#ifdef DN_CONNECTION_DEBUG
+ Assert(!conn->have_row_desc);
+ conn->have_row_desc = true;
+#endif
+ if (HandleRowDescription(combiner, msg, msg_len))
+ return RESPONSE_TUPDESC;
+ break;
+ case 'D': /* DataRow */
+#ifdef DN_CONNECTION_DEBUG
+ Assert(conn->have_row_desc);
+#endif
+ /* Do not return if data row has not been actually handled */
+ if (HandleDataRow(combiner, msg, msg_len, conn->nodeoid))
+ return RESPONSE_DATAROW;
+ break;
+ case 's': /* PortalSuspended */
+ /* No activity is expected on the connection until next query */
+ conn->state = DN_CONNECTION_STATE_IDLE;
+ conn->combiner = NULL;
+ return RESPONSE_SUSPENDED;
+ case '1': /* ParseComplete */
+ case '2': /* BindComplete */
+ case '3': /* CloseComplete */
+ case 'n': /* NoData */
+ /* simple notifications, continue reading */
+ break;
+ case 'G': /* CopyInResponse */
+ conn->state = DN_CONNECTION_STATE_COPY_IN;
+ HandleCopyIn(combiner);
+ /* Done, return to caller to let it know the data can be passed in */
+ return RESPONSE_COPY;
+ case 'H': /* CopyOutResponse */
+ conn->state = DN_CONNECTION_STATE_COPY_OUT;
+ HandleCopyOut(combiner);
+ return RESPONSE_COPY;
+ case 'd': /* CopyOutDataRow */
+ conn->state = DN_CONNECTION_STATE_COPY_OUT;
+ HandleCopyDataRow(combiner, msg, msg_len);
+ break;
+ case 'E': /* ErrorResponse */
+ HandleError(combiner, msg, msg_len, conn);
+ add_error_message(conn, combiner->errorMessage);
+ return RESPONSE_ERROR;
+ case 'A': /* NotificationResponse */
+ case 'N': /* NoticeResponse */
+ case 'S': /* SetCommandComplete */
+ /*
+ * Ignore these to prevent multiple messages, one from each
+ * node. Coordinator will send one for DDL anyway
+ */
+ break;
+ case 'Z': /* ReadyForQuery */
+ {
+ /*
+				 * The return result depends on the previous connection state.
+				 * If it was PORTAL_SUSPENDED the Coordinator wants to send
+				 * down another EXECUTE to fetch more rows, otherwise it is
+				 * done with the connection
+ */
+ conn->transaction_status = msg[0];
+ conn->state = DN_CONNECTION_STATE_IDLE;
+ conn->combiner = NULL;
+#ifdef DN_CONNECTION_DEBUG
+ conn->have_row_desc = false;
+#endif
+ return RESPONSE_READY;
+ }
+ case 'M': /* Command Id */
+ HandleDatanodeCommandId(combiner, msg, msg_len);
+ break;
+			case 'b': /* barrier response */
+ conn->state = DN_CONNECTION_STATE_IDLE;
+ return RESPONSE_BARRIER_OK;
+ case 'I': /* EmptyQuery */
+ return RESPONSE_COMPLETE;
+			case 'W': /* WaitXids */
+ HandleWaitXids(msg, msg_len);
+ return RESPONSE_WAITXIDS;
+			case 'x': /* GlobalTransactionId */
+ HandleGlobalTransactionId(msg, msg_len);
+ return RESPONSE_ASSIGN_GXID;
+ default:
+ /* sync lost? */
+ elog(WARNING, "Received unsupported message type: %c", msg_type);
+ conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
+ /* stop reading */
+ return RESPONSE_COMPLETE;
+ }
+ }
+	/* never happens, but keep compiler quiet */
+ return RESPONSE_EOF;
+}
+
+/*
+ * Has the data node sent Ready For Query
+ */
+
+bool
+is_data_node_ready(PGXCNodeHandle * conn)
+{
+ char *msg;
+ int msg_len;
+ char msg_type;
+
+ for (;;)
+ {
+ /*
+ * If we are in the process of shutting down, we
+ * may be rolling back, and the buffer may contain other messages.
+ * We want to avoid a procarray exception
+ * as well as an error stack overflow.
+ */
+ if (proc_exit_inprogress)
+ conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
+
+		/* don't read from the connection if there is a fatal error */
+ if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
+ return true;
+
+ /* No data available, exit */
+ if (!HAS_MESSAGE_BUFFERED(conn))
+ return false;
+
+ msg_type = get_message(conn, &msg_len, &msg);
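+		/* Silently discard everything until ReadyForQuery arrives */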
+ if (msg_type == 'Z')
+ {
+ /*
+			 * The return result depends on the previous connection state.
+			 * If it was PORTAL_SUSPENDED the Coordinator wants to send down
+			 * another EXECUTE to fetch more rows, otherwise it is done with
+			 * the connection
+ */
+ conn->transaction_status = msg[0];
+ conn->state = DN_CONNECTION_STATE_IDLE;
+ conn->combiner = NULL;
+ return true;
+ }
+ }
+	/* never happens, but keep compiler quiet */
+ return false;
+}
+
+
+/*
+ * Send BEGIN command to the Datanodes or Coordinators and receive responses.
+ * Also send the GXID for the transaction.
+ */
+static int
+pgxc_node_begin(int conn_count, PGXCNodeHandle **connections,
+ GlobalTransactionId gxid, bool need_tran_block,
+ bool readOnly, char node_type)
+{
+ int i;
+ struct timeval *timeout = NULL;
+ ResponseCombiner combiner;
+ TimestampTz timestamp = GetCurrentGTMStartTimestamp();
+ PGXCNodeHandle *new_connections[conn_count];
+ int new_count = 0;
+ char *init_str;
+ char lxid[13];
+
+ /*
+ * If no remote connections, we don't have anything to do
+ */
+ if (conn_count == 0)
+ return 0;
+
+ for (i = 0; i < conn_count; i++)
+ {
+ if (!readOnly && !IsConnFromDatanode())
+ connections[i]->read_only = false;
+ /*
+ * PGXC TODO - A connection should not be in DN_CONNECTION_STATE_QUERY
+ * state when we are about to send a BEGIN TRANSACTION command to the
+ * node. We should consider changing the following to an assert and fix
+ * any bugs reported
+ */
+ if (connections[i]->state == DN_CONNECTION_STATE_QUERY)
+ BufferConnection(connections[i]);
+
+ /* Send GXID and check for errors */
+ if (GlobalTransactionIdIsValid(gxid) && pgxc_node_send_gxid(connections[i], gxid))
+ return EOF;
+
+ /* Send timestamp and check for errors */
+ if (GlobalTimestampIsValid(timestamp) && pgxc_node_send_timestamp(connections[i], timestamp))
+ return EOF;
+
+ if (IS_PGXC_DATANODE && GlobalTransactionIdIsValid(gxid))
+ need_tran_block = true;
+ else if (IS_PGXC_REMOTE_COORDINATOR)
+ need_tran_block = false;
+ /* Send BEGIN if not already in transaction */
+ if (need_tran_block && connections[i]->transaction_status == 'I')
+ {
+ /* Send the BEGIN TRANSACTION command and check for errors */
+ if (pgxc_node_send_query(connections[i], "BEGIN"))
+ return EOF;
+
+ new_connections[new_count++] = connections[i];
+ }
+ }
+
+ /*
+ * If we did not send a BEGIN command to any node, we are done. Otherwise,
+ * we need to check for any errors and report them
+ */
+ if (new_count == 0)
+ return 0;
+
+ InitResponseCombiner(&combiner, new_count, COMBINE_TYPE_NONE);
+ /*
+	 * Make sure there are zeroes in unused fields; this clears only the
+	 * embedded ScanState, which InitResponseCombiner leaves untouched
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+
+ /* Receive responses */
+ if (pgxc_node_receive_responses(new_count, new_connections, timeout, &combiner))
+ return EOF;
+
+ /* Verify status */
+ if (!ValidateAndCloseCombiner(&combiner))
+ return EOF;
+
+ /* Send virtualXID to the remote nodes using SET command */
+ sprintf(lxid, "%d", MyProc->lxid);
+ PGXCNodeSetParam(true, "coordinator_lxid", lxid);
+
+ /* after transactions are started send down local set commands */
+ init_str = PGXCNodeGetTransactionParamStr();
+ if (init_str)
+ {
+ for (i = 0; i < new_count; i++)
+ {
+ pgxc_node_set_query(new_connections[i], init_str);
+ }
+ }
+
+ /* No problem, let's get going */
+ return 0;
+}
+
+
+/*
+ * Execute RESET commands on all allocated nodes to remove all session
+ * specific state before releasing them to the pool for reuse by other
+ * sessions.
+ */
+static void
+pgxc_node_remote_cleanup_all(void)
+{
+ PGXCNodeAllHandles *handles = get_current_handles();
+ PGXCNodeHandle *new_connections[handles->co_conn_count + handles->dn_conn_count];
+ int new_conn_count = 0;
+ int i;
+ char *resetcmd = "RESET ALL;RESET SESSION AUTHORIZATION;"
+ "RESET transaction_isolation;";
+
+ /*
+	 * We must handle both reader and writer connections, since even a
+	 * read-only connection needs to be cleaned up.
+ */
+ if (handles->co_conn_count + handles->dn_conn_count == 0)
+ return;
+
+ /*
+	 * Send down the RESET commands.
+ */
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *handle = handles->coord_handles[i];
+
+ /* At this point connection should be in IDLE state */
+ if (handle->state != DN_CONNECTION_STATE_IDLE)
+ {
+ handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
+ continue;
+ }
+
+ /*
+ * We must go ahead and release connections anyway, so do not throw
+ * an error if we have a problem here.
+ */
+ if (pgxc_node_send_query(handle, resetcmd))
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to clean up data nodes")));
+ handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
+ continue;
+ }
+ new_connections[new_conn_count++] = handle;
+ handle->combiner = NULL;
+ }
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *handle = handles->datanode_handles[i];
+
+ /* At this point connection should be in IDLE state */
+ if (handle->state != DN_CONNECTION_STATE_IDLE)
+ {
+ handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
+ continue;
+ }
+
+ /*
+ * We must go ahead and release connections anyway, so do not throw
+ * an error if we have a problem here.
+ */
+ if (pgxc_node_send_query(handle, resetcmd))
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to clean up data nodes")));
+ handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
+ continue;
+ }
+ new_connections[new_conn_count++] = handle;
+ handle->combiner = NULL;
+ }
+
+ if (new_conn_count)
+ {
+ ResponseCombiner combiner;
+ InitResponseCombiner(&combiner, new_conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ pgxc_node_receive_responses(new_conn_count, new_connections, NULL, &combiner);
+ CloseCombiner(&combiner);
+ }
+ pfree_pgxc_all_handles(handles);
+}
+
+
+/*
+ * Prepare nodes which ran write operations during the transaction.
+ * Read only remote transactions are committed and connections are released
+ * back to the pool.
+ * The function returns the list of nodes where the transaction is prepared,
+ * including the local node if requested, in the format expected by the GTM
+ * server.
+ * If something went wrong the function tries to abort the prepared
+ * transactions on the nodes where it succeeded and throws an error. A
+ * warning is emitted if aborting a prepared transaction fails.
+ * After completion remote connection handles are released.
+ */
+static char *
+pgxc_node_remote_prepare(char *prepareGID, bool localNode)
+{
+ bool isOK = true;
+ StringInfoData nodestr;
+ char prepare_cmd[256];
+ char abort_cmd[256];
+ GlobalTransactionId auxXid;
+ char *commit_cmd = "COMMIT TRANSACTION";
+ int i;
+ ResponseCombiner combiner;
+ PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
+ int conn_count = 0;
+ PGXCNodeAllHandles *handles = get_current_handles();
+
+ initStringInfo(&nodestr);
+ if (localNode)
+ appendStringInfoString(&nodestr, PGXCNodeName);
+
+ sprintf(prepare_cmd, "PREPARE TRANSACTION '%s'", prepareGID);
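+ /* e.g. for prepareGID "T12345", this yields: PREPARE TRANSACTION 'T12345' */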
+
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /*
+ * If something has already gone wrong we have nothing to do here. The
+ * error will be reported at the end of the function, and we will roll
+ * back the remote nodes as part of error handling.
+ * Just skip to the cleanup section; if we have already prepared the
+ * transaction somewhere, we must abort that prepared transaction.
+ */
+ if (!isOK)
+ goto prepare_err;
+
+ /*
+ * Skip empty slots
+ */
+ if (conn->sock == NO_SOCKET)
+ continue;
+ else if (conn->transaction_status == 'T')
+ {
+ /* Read in any pending input */
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ BufferConnection(conn);
+
+ if (conn->read_only)
+ {
+ /* Read-only transaction: just send down a COMMIT */
+ if (pgxc_node_send_query(conn, commit_cmd))
+ {
+ /*
+ * Not a big deal: the transaction was read-only, and the
+ * connection will be abandoned later.
+ */
+ ereport(LOG,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ else
+ {
+ /* Send down prepare command */
+ if (pgxc_node_send_query(conn, prepare_cmd))
+ {
+ /*
+ * This is real trouble: we need this node prepared. Emit a
+ * warning for now and go to the cleanup section.
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send PREPARE TRANSACTION command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ char *nodename = get_pgxc_nodename(conn->nodeoid);
+ if (nodestr.len > 0)
+ appendStringInfoChar(&nodestr, ',');
+ appendStringInfoString(&nodestr, nodename);
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ /*
+ * If PREPARE fails on the remote node, the node responds
+ * with ROLLBACK instead. Set the flag so the message
+ * handler verifies the response.
+ */
+ conn->ck_resp_rollback = true;
+ }
+ }
+ }
+ else if (conn->transaction_status == 'E')
+ {
+ /*
+ * This probably cannot happen: if there was an error the engine
+ * would have aborted anyway, even for an explicit PREPARE.
+ * Handle it just in case.
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("remote node %u is in error state", conn->nodeoid)));
+ }
+ }
+
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ /*
+ * If something has already gone wrong we have nothing to do here. The
+ * error will be reported at the end of the function, and we will roll
+ * back the remote nodes as part of error handling.
+ * Just skip to the cleanup section; if we have already prepared the
+ * transaction somewhere, we must abort that prepared transaction.
+ */
+ if (!isOK)
+ goto prepare_err;
+
+ /*
+ * Skip empty slots
+ */
+ if (conn->sock == NO_SOCKET)
+ continue;
+ else if (conn->transaction_status == 'T')
+ {
+ if (conn->read_only)
+ {
+ /* Read-only transaction: just send down a COMMIT */
+ if (pgxc_node_send_query(conn, commit_cmd))
+ {
+ /*
+ * Not a big deal: the transaction was read-only, and the
+ * connection will be abandoned later.
+ */
+ ereport(LOG,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ else
+ {
+ /* Send down prepare command */
+ if (pgxc_node_send_query(conn, prepare_cmd))
+ {
+ /*
+ * This is real trouble: we need this node prepared. Emit a
+ * warning for now and go to the cleanup section.
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send PREPARE TRANSACTION command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ char *nodename = get_pgxc_nodename(conn->nodeoid);
+ if (nodestr.len > 0)
+ appendStringInfoChar(&nodestr, ',');
+ appendStringInfoString(&nodestr, nodename);
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ /*
+ * If PREPARE fails on the remote node, the node responds
+ * with ROLLBACK instead. Set the flag so the message
+ * handler verifies the response.
+ */
+ conn->ck_resp_rollback = true;
+ }
+ }
+ }
+ else if (conn->transaction_status == 'E')
+ {
+ /*
+ * This probably cannot happen: if there was an error the engine
+ * would have aborted anyway, even for an explicit PREPARE.
+ * Handle it just in case.
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("remote node %u is in error state", conn->nodeoid)));
+ }
+ }
+
+ SetSendCommandId(false);
+
+ if (!isOK)
+ goto prepare_err;
+
+ /* If no commands were sent there is nothing to receive */
+ if (conn_count > 0)
+ {
+ int result;
+ /*
+ * Receive and check for any errors. In case of errors, we don't bail out
+ * just yet. We first go through the list of connections and look for
+ * errors on each connection. This is important to ensure that we run
+ * an appropriate ROLLBACK command later on (prepared transactions must be
+ * rolled back with ROLLBACK PREPARED commands).
+ *
+ * PGXCTODO - There doesn't seem to be a solid mechanism to track errors on
+ * individual connections. The transaction_status field doesn't get set
+ * every time there is an error on the connection. The combiner mechanism is
+ * good for parallel processing, but we should have a leak-proof
+ * mechanism to track connection status
+ */
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
+ if (result || !validate_combiner(&combiner))
+ goto prepare_err;
+ else
+ CloseCombiner(&combiner);
+
+ /* Before exit clean the flag, to avoid unnecessary checks */
+ for (i = 0; i < conn_count; i++)
+ connections[i]->ck_resp_rollback = false;
+
+ pfree_pgxc_all_handles(handles);
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+ }
+
+ return nodestr.data;
+prepare_err:
+ sprintf(abort_cmd, "ROLLBACK PREPARED '%s'", prepareGID);
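+ /* e.g. for prepareGID "T12345", this yields: ROLLBACK PREPARED 'T12345' */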
+
+ auxXid = GetAuxilliaryTransactionId();
+ conn_count = 0;
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /*
+ * PREPARE succeeded on that node, roll it back there
+ */
+ if (conn->ck_resp_rollback)
+ {
+ conn->ck_resp_rollback = false;
+ /* sanity checks */
+ Assert(conn->sock != NO_SOCKET);
+ Assert(conn->state == DN_CONNECTION_STATE_IDLE);
+ /* Send down abort prepared command */
+ if (pgxc_node_send_gxid(conn, auxXid))
+ {
+ /*
+ * The prepared transaction is left on the node; we cannot do
+ * anything about it except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send xid to "
+ "the node %u", conn->nodeoid)));
+ }
+ if (pgxc_node_send_query(conn, abort_cmd))
+ {
+ /*
+ * The prepared transaction is left on the node; we cannot do
+ * anything about it except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send ABORT PREPARED command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ if (conn->ck_resp_rollback)
+ {
+ conn->ck_resp_rollback = false;
+ /* sanity checks */
+ Assert(conn->sock != NO_SOCKET);
+ Assert(conn->state == DN_CONNECTION_STATE_IDLE);
+ /* Send down abort prepared command */
+ if (pgxc_node_send_gxid(conn, auxXid))
+ {
+ /*
+ * The prepared transaction is left on the node; we cannot do
+ * anything about it except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send xid to "
+ "the node %u", conn->nodeoid)));
+ }
+ if (pgxc_node_send_query(conn, abort_cmd))
+ {
+ /*
+ * The prepared transaction is left on the node; we cannot do
+ * anything about it except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send ABORT PREPARED command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+ if (conn_count > 0)
+ {
+ /* Just read out responses, throw error from the first combiner */
+ ResponseCombiner combiner2;
+ InitResponseCombiner(&combiner2, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ pgxc_node_receive_responses(conn_count, connections, NULL, &combiner2);
+ CloseCombiner(&combiner2);
+ }
+
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+
+ pfree_pgxc_all_handles(handles);
+
+ /*
+ * If isOK is still set we are here because the combiner carries an
+ * error message; report it.
+ */
+ if (isOK)
+ pgxc_node_report_error(&combiner);
+ else
+ elog(ERROR, "failed to PREPARE transaction on one or more nodes");
+ return NULL;
+}
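+
+/*
+ * A sketch of the node string pgxc_node_remote_prepare() builds, assuming
+ * nodes named "coord1", "dn1" and "dn2" with localNode = true on "coord1":
+ *
+ *     coord1,dn1,dn2
+ *
+ * i.e. a comma-separated list of node names, local node first, in the
+ * format the GTM expects.
+ */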
+
+
+/*
+ * Commit transactions on remote nodes.
+ * If barrier lock is set wait while it is released.
+ * Release remote connection after completion.
+ */
+static void
+pgxc_node_remote_commit(void)
+{
+ int result = 0;
+ char *commitCmd = "COMMIT TRANSACTION";
+ int i;
+ ResponseCombiner combiner;
+ PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
+ int conn_count = 0;
+ PGXCNodeAllHandles *handles = get_current_handles();
+
+ SetSendCommandId(false);
+
+ /*
+ * Barrier:
+ *
+ * We should acquire the BarrierLock in SHARE mode here to ensure that
+ * there is no in-progress barrier at this point. This works as long as
+ * the LWLock mechanism does not starve an EXCLUSIVE lock requester
+ * (the barrier creation code presumably takes the lock in EXCLUSIVE
+ * mode).
+ */
+ LWLockAcquire(BarrierLock, LW_SHARED);
+
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ /*
+ * We do not need to commit the remote node if it is not in a
+ * transaction. If the transaction is in an error state the COMMIT
+ * command will cause a rollback; that is OK.
+ */
+ if (conn->transaction_status != 'I')
+ {
+ /* Read in any pending input */
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ BufferConnection(conn);
+
+ if (pgxc_node_send_query(conn, commitCmd))
+ {
+ /*
+ * Do not bother with cleanup, just bomb out. The error handler
+ * will invoke RollbackTransaction, which will do the work.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to the node %u",
+ conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ /*
+ * We do not need to commit the remote node if it is not in a
+ * transaction. If the transaction is in an error state the COMMIT
+ * command will cause a rollback; that is OK.
+ */
+ if (conn->transaction_status != 'I')
+ {
+ if (pgxc_node_send_query(conn, commitCmd))
+ {
+ /*
+ * Do not bother with cleanup, just bomb out. The error handler
+ * will invoke RollbackTransaction, which will do the work.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to the node %u",
+ conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ /*
+ * Release the BarrierLock.
+ */
+ LWLockRelease(BarrierLock);
+
+ if (conn_count)
+ {
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
+ if (result || !validate_combiner(&combiner))
+ result = EOF;
+ else
+ CloseCombiner(&combiner);
+ }
+
+ stat_transaction(conn_count);
+
+ if (result)
+ {
+ if (combiner.errorMessage)
+ pgxc_node_report_error(&combiner);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to COMMIT the transaction on one or more nodes")));
+ }
+
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+
+ pfree_pgxc_all_handles(handles);
+}
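+
+/*
+ * Note the tail of pgxc_node_remote_commit() above: unless the transaction
+ * used temporary objects or PersistentConnections is set, the remote
+ * sessions are reset (pgxc_node_remote_cleanup_all) and the handles are
+ * returned to the pool (release_handles) as soon as the commit responses
+ * have been validated.
+ */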
+
+
+/*
+ * Rollback transactions on remote nodes.
+ * Release remote connection after completion.
+ */
+static void
+pgxc_node_remote_abort(void)
+{
+ int result = 0;
+ char *rollbackCmd = "ROLLBACK TRANSACTION";
+ int i;
+ ResponseCombiner combiner;
+ PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
+ int conn_count = 0;
+ PGXCNodeAllHandles *handles = get_current_handles();
+
+ SetSendCommandId(false);
+
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ if (conn->transaction_status != 'I')
+ {
+ /* Read in any pending input */
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ BufferConnection(conn);
+
+ /*
+ * It does not matter whether the transaction committed or failed;
+ * just send down ROLLBACK to finish it.
+ */
+ if (pgxc_node_send_query(conn, rollbackCmd))
+ {
+ add_error_message(conn,
+ "failed to send ROLLBACK TRANSACTION command");
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ if (conn->transaction_status != 'I')
+ {
+ /*
+ * It does not matter whether the transaction committed or failed;
+ * just send down ROLLBACK to finish it.
+ */
+ if (pgxc_node_send_query(conn, rollbackCmd))
+ {
+ add_error_message(conn,
+ "failed to send ROLLBACK TRANSACTION command");
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ if (conn_count)
+ {
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
+ if (result || !validate_combiner(&combiner))
+ result = EOF;
+ else
+ CloseCombiner(&combiner);
+ }
+
+ stat_transaction(conn_count);
+
+ if (result)
+ {
+ if (combiner.errorMessage)
+ pgxc_node_report_error(&combiner);
+ else
+ ereport(LOG,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to ROLLBACK the transaction on one or more nodes")));
+ }
+
+ pfree_pgxc_all_handles(handles);
+}
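+
+/*
+ * Unlike the commit path, pgxc_node_remote_abort() records send failures
+ * with add_error_message() and logs an overall failure at LOG level rather
+ * than raising an ERROR, since rollback commonly runs as part of error
+ * handling that is already in progress.
+ */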
+
+/*
+ * Begin COPY command
+ * Acquire the datanode connections needed for the COPY, start a
+ * transaction on them if necessary, and send down the COPY query.
+ */
+void
+DataNodeCopyBegin(RemoteCopyData *rcstate)
+{
+ int i;
+ List *nodelist = rcstate->rel_loc->nodeList;
+ PGXCNodeHandle **connections;
+ bool need_tran_block;
+ GlobalTransactionId gxid;
+ ResponseCombiner combiner;
+ Snapshot snapshot = GetActiveSnapshot();
+ int conn_count = list_length(nodelist);
+
+ /* Get needed datanode connections */
+ if (!rcstate->is_from && IsLocatorReplicated(rcstate->rel_loc->locatorType))
+ {
+ /* COPY TO from a replicated table can read from any single node */
+ connections = (PGXCNodeHandle **) palloc(sizeof(PGXCNodeHandle *));
+ connections[0] = get_any_handle(nodelist);
+ conn_count = 1;
+ }
+ else
+ {
+ PGXCNodeAllHandles *pgxc_handles;
+ pgxc_handles = get_handles(nodelist, NULL, false, true);
+ connections = pgxc_handles->datanode_handles;
+ Assert(pgxc_handles->dn_conn_count == conn_count);
+ pfree(pgxc_handles);
+ }
+
+ /*
+ * If more than one node is involved, or if we are already in a
+ * transaction block, we must run the remote statements in a
+ * transaction block.
+ */
+ need_tran_block = (conn_count > 1) || (TransactionBlockStatusCode() == 'T');
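+
+ /*
+ * For example, a COPY FROM that targets three datanodes gets
+ * conn_count = 3 and therefore runs the remote statements inside a
+ * transaction block even if the client did not open one.
+ */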
+
+ elog(DEBUG1, "conn_count = %d, need_tran_block = %s", conn_count,
+ need_tran_block ? "true" : "false");
+
+ /* Gather statistics */
+ stat_statement();
+ stat_transaction(conn_count);
+
+ gxid = GetCurrentTransactionId();
+
+ /* Start transaction on connections where it is not started */
+ if (pgxc_node_begin(conn_count, connections, gxid, need_tran_block, false, PGXC_NODE_DATANODE))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Could not begin transaction on data nodes.")));
+ }
+
+ /*
+ * COPY TO does not use the locator; it just takes connections from it,
+ * and we do not look up the distribution data type in this case.
+ * So always use LOCATOR_TYPE_RROBIN, to avoid errors from an undefined
+ * partType when the real locator type is HASH or MODULO.
+ * Create the locator before sending down the query, because
+ * createLocator may fail and we would leave with dirty connections.
+ * If we get an error now the datanode connections are still clean and
+ * the error handler will issue a transaction abort.
+ */
+ rcstate->locator = createLocator(
+ rcstate->is_from ? rcstate->rel_loc->locatorType
+ : LOCATOR_TYPE_RROBIN,
+ rcstate->is_from ? RELATION_ACCESS_INSERT : RELATION_ACCESS_READ,
+ rcstate->dist_type,
+ LOCATOR_LIST_POINTER,
+ conn_count,
+ (void *) connections,
+ NULL,
+ false);
+
+ /* Send query to nodes */
+ for (i = 0; i < conn_count; i++)
+ {
+ CHECK_OWNERSHIP(connections[i], NULL);
+
+ if (snapshot && pgxc_node_send_snapshot(connections[i], snapshot))
+ {
+ add_error_message(connections[i], "Can not send request");
+ pfree(connections);
+ freeLocator(rcstate->locator);
+ rcstate->locator = NULL;
+ return;
+ }
+ if (pgxc_node_send_query(connections[i], rcstate->query_buf.data) != 0)
+ {
+ add_error_message(connections[i], "Can not send request");
+ pfree(connections);
+ freeLocator(rcstate->locator);
+ rcstate->locator = NULL;
+ return;
+ }
+ }
+
+ /*
+ * We expect a CopyIn response, but we do not want to forward it to the
+ * client; the caller must take care of that, because here we do not
+ * know whether the client is running a console or a file copy.
+ */
+ /*
+ * Make sure there are zeroes in unused fields: clear the embedded
+ * ScanState before the combiner is initialized, so the memset cannot
+ * wipe anything InitResponseCombiner sets up.
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+
+ /* Receive responses */
+ if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner)
+ || !ValidateAndCloseCombiner(&combiner))
+ {
+ DataNodeCopyFinish(conn_count, connections);
+ freeLocator(rcstate->locator);
+ rcstate->locator = NULL;
+ return;
+ }
+ pfree(connections);
+}
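+
+/*
+ * A sketch of the expected calling sequence for the remote COPY helpers
+ * (error handling omitted):
+ *
+ *     DataNodeCopyBegin(rcstate);                    -- start COPY on nodes
+ *     for each row:
+ *         DataNodeCopyIn(row, len, nconn, conns);    -- stream the data
+ *     DataNodeCopyFinish(nconn, conns);              -- finish remote COPY
+ */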
+
+
+/*
+ * Send a data row to the specified nodes
+ */
+int
+DataNodeCopyIn(char *data_row, int len, int conn_count, PGXCNodeHandle** copy_connections)
+{
+ /* size + data row + \n */
+ int msgLen = 4 + len + 1;
+ int nLen = htonl(msgLen);
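+ /*
+ * This matches the frontend/backend CopyData ('d') message format: the
+ * 4-byte length word counts itself plus the payload, and a text-mode
+ * row must be newline-terminated.
+ */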
+ int i;
+
+ for(i = 0; i < conn_count; i++)
+ {
+ PGXCNodeHandle *handle = copy_connections[i];
+ if (handle->state == DN_CONNECTION_STATE_COPY_IN)
+ {
+ /* precalculate to speed up access */
+ int bytes_needed = handle->outEnd + 1 + msgLen;
+
+ /* flush buffer if it is almost full */
+ if (bytes_needed > COPY_BUFFER_SIZE)
+ {
+ int to_send = handle->outEnd;
+
+ /* First check whether the data node has sent an error message */
+ int read_status = pgxc_node_read_data(handle, true);
+ if (read_status == EOF || read_status < 0)
+ {
+ add_error_message(handle, "failed to read data from data node");
+ return EOF;
+ }
+
+ if (handle->inStart < handle->inEnd)
+ {
+ ResponseCombiner combiner;
+ /*
+ * Make sure there are zeroes in unused fields: clear the embedded
+ * ScanState before initializing the combiner.
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+ InitResponseCombiner(&combiner, 1, COMBINE_TYPE_NONE);
+ handle_response(handle, &combiner);
+ if (!ValidateAndCloseCombiner(&combiner))
+ return EOF;
+ }
+
+ if (DN_CONNECTION_STATE_ERROR(handle))
+ return EOF;
+
+ /*
+ * Try to send down buffered data, if we have any
+ */
+ if (to_send && send_some(handle, to_send) < 0)
+ {
+ add_error_message(handle, "failed to send data to data node");
+ return EOF;
+ }
+ }
+
+ if (ensure_out_buffer_capacity(bytes_needed, handle) != 0)
+ {
+ ereport(ERROR,