From: Pavan Deolasee Date: Thu, 10 Dec 2015 10:08:17 +0000 (+0530) Subject: Merge remote-tracking branch 'remotes/PGSQL/REL9_5_STABLE' into XL9_5_STABLE X-Git-Tag: XL9_5_R1BETA1~125 X-Git-Url: http://git.postgresql.org/gitweb/static/gitweb.js?a=commitdiff_plain;h=2397a09aafbf8397fc427dfaa460136a309bf792;p=postgres-xl.git Merge remote-tracking branch 'remotes/PGSQL/REL9_5_STABLE' into XL9_5_STABLE --- 2397a09aafbf8397fc427dfaa460136a309bf792 diff --cc configure index 0467e3d393,2a22da9d3e..031b4fa22c --- a/configure +++ b/configure @@@ -1,8 -1,8 +1,8 @@@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. - # Generated by GNU Autoconf 2.69 for PostgreSQL 9.5alpha1 (Postgres-XL 9.5alpha1). -# Generated by GNU Autoconf 2.69 for PostgreSQL 9.5beta1. ++# Generated by GNU Autoconf 2.69 for PostgreSQL 9.5beta1 (Postgres-XL 9.5beta1). # -# Report bugs to . +# Report bugs to . # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@@ -582,10 -582,9 +582,10 @@@ MAKEFLAGS # Identity of this package. PACKAGE_NAME='PostgreSQL' PACKAGE_TARNAME='postgresql' - PACKAGE_VERSION='9.5alpha1 (Postgres-XL 9.5alpha1)' - PACKAGE_XC_VERSION='9.5alpha1' - PACKAGE_STRING='PostgreSQL 9.5alpha1 (Postgres-XL 9.5alpha1)' -PACKAGE_VERSION='9.5beta1' -PACKAGE_STRING='PostgreSQL 9.5beta1' -PACKAGE_BUGREPORT='pgsql-bugs@postgresql.org' ++PACKAGE_VERSION='9.5beta1 (Postgres-XL 9.5beta1)' ++PACKAGE_XC_VERSION='9.5beta1' ++PACKAGE_STRING='PostgreSQL 9.5beta1 (Postgres-XL 9.5beta1)' +PACKAGE_BUGREPORT='bugs@postgres-xl.org' PACKAGE_URL='' ac_unique_file="src/backend/access/common/heaptuple.c" @@@ -1396,7 -1395,7 +1397,7 @@@ if test "$ac_init_help" = "long"; the # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF - \`configure' configures PostgreSQL 9.5alpha1 (Postgres-XL 9.5alpha1) to adapt to many kinds of systems. 
-\`configure' configures PostgreSQL 9.5beta1 to adapt to many kinds of systems. ++\`configure' configures PostgreSQL 9.5beta1 (Postgres-XL 9.5beta1) to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@@ -1461,7 -1460,7 +1462,7 @@@ f if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of PostgreSQL 9.5alpha1 (Postgres-XL 9.5alpha1):";; - short | recursive ) echo "Configuration of PostgreSQL 9.5beta1:";; ++ short | recursive ) echo "Configuration of PostgreSQL 9.5beta1 (Postgres-XL 9.5beta1):";; esac cat <<\_ACEOF @@@ -1611,7 -1610,7 +1612,7 @@@ f test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF - PostgreSQL configure 9.5alpha1 (Postgres-XL 9.5alpha1) -PostgreSQL configure 9.5beta1 ++PostgreSQL configure 9.5beta1 (Postgres-XL 9.5beta1) generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@@ -2322,7 -2321,7 +2323,7 @@@ cat >config.log <<_ACEO This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. - It was created by PostgreSQL $as_me 9.5alpha1 (Postgres-XL 9.5alpha1), which was -It was created by PostgreSQL $as_me 9.5beta1, which was ++It was created by PostgreSQL $as_me 9.5beta1 (Postgres-XL 9.5beta1), which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@@ -15507,10 -15582,8 +15598,11 @@@ cat >>confdefs.h <<_ACEO _ACEOF +# For PGXC, set -DPGXC by default. This can be overriden with -UPGXC if the user sets it. +# For Postgres-XL, set both -DPGXC and -DXCP +CFLAGS="-DPGXC -DXCP $CFLAGS" + # Begin output steps { $as_echo "$as_me:${as_lineno-$LINENO}: using compiler=$cc_string" >&5 @@@ -16069,7 -16142,7 +16161,7 @@@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_wr # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. 
ac_log=" - This file was extended by PostgreSQL $as_me 9.5alpha1 (Postgres-XL 9.5alpha1), which was -This file was extended by PostgreSQL $as_me 9.5beta1, which was ++This file was extended by PostgreSQL $as_me 9.5beta1 (Postgres-XL 9.5beta1), which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@@ -16139,7 -16212,7 +16231,7 @@@ _ACEO cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ - PostgreSQL config.status 9.5alpha1 (Postgres-XL 9.5alpha1) -PostgreSQL config.status 9.5beta1 ++PostgreSQL config.status 9.5beta1 (Postgres-XL 9.5beta1) configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --cc configure.in index 1455d25a5c,55ea5ed9ba..5d6bf5da2c --- a/configure.in +++ b/configure.in @@@ -17,7 -17,7 +17,7 @@@ dnl Read the Autoconf manual for detail dnl m4_pattern_forbid(^PGAC_)dnl to catch undefined macros - AC_INIT([PostgreSQL], [9.5alpha1 (Postgres-XL 9.5alpha1)], [bugs@postgres-xl.org]) -AC_INIT([PostgreSQL], [9.5beta1], [pgsql-bugs@postgresql.org]) ++AC_INIT([PostgreSQL], [9.5beta1 (Postgres-XL 9.5beta1)], [bugs@postgres-xl.org]) m4_if(m4_defn([m4_PACKAGE_VERSION]), [2.69], [], [m4_fatal([Autoconf version 2.69 is required. 
Untested combinations of 'autoconf' and PostgreSQL versions are not diff --cc doc/bug.template index 561f4216a9,8a0f08cfc0..b42afe465c --- a/doc/bug.template +++ b/doc/bug.template @@@ -27,7 -27,7 +27,7 @@@ System Configuration Operating System (example: Linux 2.4.18) : - PostgreSQL version (example: PostgreSQL 9.5alpha1): Postgres-XL 9.5alpha1 - PostgreSQL version (example: PostgreSQL 9.5beta1): PostgreSQL 9.5beta1 ++ PostgreSQL version (example: PostgreSQL 9.5beta1): Postgres-XL 9.5beta1 Compiler used (example: gcc 3.3.5) : diff --cc doc/src/sgml/ddl.sgml index 31ad4b83f5,fe5a076fe1..c69baac498 mode 100755,100644..100755 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml diff --cc doc/src/sgml/ref/alter_table.sgml index e5a677925d,9fea2702ba..53f40d9db8 mode 100755,100644..100755 --- a/doc/src/sgml/ref/alter_table.sgml +++ b/doc/src/sgml/ref/alter_table.sgml diff --cc doc/src/sgml/ref/create_table.sgml index f75977bed2,a2d0b0cbe1..c001195d56 mode 100755,100644..100755 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml diff --cc src/backend/catalog/Makefile index d272675860,25130ecf12..2680d6ceff --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@@ -39,10 -39,10 +39,11 @@@ POSTGRES_BKI_SRCS = $(addprefix $(top_s pg_ts_config.h pg_ts_config_map.h pg_ts_dict.h \ pg_ts_parser.h pg_ts_template.h pg_extension.h \ pg_foreign_data_wrapper.h pg_foreign_server.h pg_user_mapping.h \ + pgxc_class.h pgxc_node.h pgxc_group.h \ pg_foreign_table.h pg_policy.h pg_replication_origin.h \ - pg_tablesample_method.h pg_default_acl.h pg_seclabel.h pg_shseclabel.h \ - pg_collation.h pg_range.h pg_transform.h toasting.h indexing.h \ + pg_default_acl.h pg_seclabel.h pg_shseclabel.h \ + pg_collation.h pg_range.h pg_transform.h \ + toasting.h indexing.h \ ) # location of Catalog.pm diff --cc src/backend/catalog/dependency.c index 1f1a28da2d,90b1cd835f..4397575d8f --- a/src/backend/catalog/dependency.c +++ 
b/src/backend/catalog/dependency.c @@@ -172,11 -157,9 +172,12 @@@ static const Oid object_classes[] = UserMappingRelationId, /* OCLASS_USER_MAPPING */ DefaultAclRelationId, /* OCLASS_DEFACL */ ExtensionRelationId, /* OCLASS_EXTENSION */ +#ifdef PGXC + PgxcClassRelationId, /* OCLASS_PGXCCLASS */ +#endif EventTriggerRelationId, /* OCLASS_EVENT_TRIGGER */ - PolicyRelationId /* OCLASS_POLICY */ + PolicyRelationId, /* OCLASS_POLICY */ + TransformRelationId /* OCLASS_TRANSFORM */ }; diff --cc src/backend/commands/event_trigger.c index 44d03f8fab,3d1cb0b8e3..44972b6432 --- a/src/backend/commands/event_trigger.c +++ b/src/backend/commands/event_trigger.c @@@ -1166,22 -1166,8 +1166,11 @@@ EventTriggerSupportsObjectClass(ObjectC case OCLASS_USER_MAPPING: case OCLASS_DEFACL: case OCLASS_EXTENSION: +#ifdef PGXC + case OCLASS_PGXC_CLASS: - case OCLASS_PGXC_NODE: - case OCLASS_PGXC_GROUP: +#endif case OCLASS_POLICY: return true; - - case MAX_OCLASS: - - /* - * This shouldn't ever happen, but we keep the case to avoid a - * compiler warning without a "default" clause in the switch. 
- */ - Assert(false); - break; } return true; diff --cc src/backend/commands/explain.c index 2b1aa00f33,5d06fa4ea6..d173786929 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@@ -1163,76 -1104,6 +1152,73 @@@ ExplainNode(PlanState *planstate, List if (((Scan *) plan)->scanrelid > 0) ExplainScanTarget((Scan *) plan, es); break; - case T_SampleScan: - ExplainScanTarget((Scan *) plan, es); - break; +#ifdef PGXC + case T_RemoteQuery: + /* Emit node execution list */ + ExplainExecNodes(((RemoteQuery *)plan)->exec_nodes, es); + ExplainScanTarget((Scan *) plan, es); + break; +#endif +#ifdef XCP + case T_RemoteSubplan: + { + RemoteSubplan *rsubplan = (RemoteSubplan *) plan; + List *nodeNameList = NIL; + ListCell *lc; + + foreach(lc, rsubplan->nodeList) + { + char *nodename = get_pgxc_nodename( + PGXCNodeGetNodeOid(lfirst_int(lc), + PGXC_NODE_DATANODE)); + nodeNameList = lappend(nodeNameList, nodename); + } + + /* print out destination nodes */ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (nodeNameList) + { + if (es->nodes) + { + bool first = true; + ListCell *lc; + foreach(lc, nodeNameList) + { + char *nodename = (char *) lfirst(lc); + if (first) + { + appendStringInfo(es->str, " on %s (%s", + rsubplan->execOnAll ? "all" : "any", + nodename); + first = false; + } + else + appendStringInfo(es->str, ",%s", nodename); + } + appendStringInfoChar(es->str, ')'); + } + else + { + appendStringInfo(es->str, " on %s", + rsubplan->execOnAll ? "all" : "any"); + } + } + else + { + appendStringInfo(es->str, " on local node"); + } + } + else + { + ExplainPropertyText("Replicated", + rsubplan->execOnAll ? 
"no" : "yes", + es); + ExplainPropertyList("Node List", nodeNameList, es); + } + } + break; +#endif /* XCP */ case T_IndexScan: { IndexScan *indexscan = (IndexScan *) plan; diff --cc src/backend/commands/tablecmds.c index 4291106a28,5dfdb8dd9a..8e7fc4d17b --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@@ -434,18 -416,9 +436,19 @@@ static ObjectAddress ATExecAddOf(Relati static void ATExecDropOf(Relation rel, LOCKMODE lockmode); static void ATExecReplicaIdentity(Relation rel, ReplicaIdentityStmt *stmt, LOCKMODE lockmode); static void ATExecGenericOptions(Relation rel, List *options); +#ifdef PGXC +static void AtExecDistributeBy(Relation rel, DistributeBy *options); +static void AtExecSubCluster(Relation rel, PGXCSubCluster *options); +static void AtExecAddNode(Relation rel, List *options); +static void AtExecDeleteNode(Relation rel, List *options); +static void ATCheckCmd(Relation rel, AlterTableCmd *cmd); +static RedistribState *BuildRedistribCommands(Oid relid, List *subCmds); +static Oid *delete_node_list(Oid *old_oids, int old_num, Oid *del_oids, int del_num, int *new_num); +static Oid *add_node_list(Oid *old_oids, int old_num, Oid *add_oids, int add_num, int *new_num); +#endif static void ATExecEnableRowSecurity(Relation rel); static void ATExecDisableRowSecurity(Relation rel); + static void ATExecForceNoForceRowSecurity(Relation rel, bool force_rls); static void copy_relation_data(SMgrRelation rel, SMgrRelation dst, ForkNumber forkNum, char relpersistence); diff --cc src/backend/nodes/nodeFuncs.c index 2541494b87,c517dfd9d6..6ed46aedc9 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@@ -2873,24 -2888,13 +2893,18 @@@ range_table_mutator(List *rtable switch (rte->rtekind) { case RTE_RELATION: - if (rte->tablesample) - { - CHECKFLATCOPY(newrte->tablesample, rte->tablesample, - TableSampleClause); - MUTATE(newrte->tablesample->args, - newrte->tablesample->args, - List *); - 
MUTATE(newrte->tablesample->repeatable, - newrte->tablesample->repeatable, - Node *); - } + MUTATE(newrte->tablesample, rte->tablesample, + TableSampleClause *); + /* we don't bother to copy eref, aliases, etc; OK? */ break; case RTE_CTE: + /* nothing to do */ + break; +#ifdef PGXC + case RTE_REMOTE_DUMMY: - #endif /* PGXC */ - /* we don't bother to copy eref, aliases, etc; OK? */ ++ /* nothing to do */ + break; ++#endif /* PGXC */ case RTE_SUBQUERY: if (!(flags & QTW_IGNORE_RT_SUBQUERIES)) { diff --cc src/backend/nodes/outfuncs.c index 84e25e16fd,991b4c2175..ec4e7eb138 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@@ -540,19 -338,10 +540,19 @@@ _outModifyTable(StringInfo str, const M WRITE_NODE_FIELD(rowMarks); WRITE_INT_FIELD(epqParam); WRITE_ENUM_FIELD(onConflictAction, OnConflictAction); +#ifdef XCP + if (portable_output) + WRITE_RELID_LIST_FIELD(arbiterIndexes); + else + { +#endif WRITE_NODE_FIELD(arbiterIndexes); +#ifdef XCP + } +#endif WRITE_NODE_FIELD(onConflictSet); WRITE_NODE_FIELD(onConflictWhere); - WRITE_INT_FIELD(exclRelRTI); + WRITE_UINT_FIELD(exclRelRTI); WRITE_NODE_FIELD(exclRelTlist); } @@@ -3727,6 -2568,16 +3669,27 @@@ _outRangeTblFunction(StringInfo str, co WRITE_BITMAPSET_FIELD(funcparams); } + static void + _outTableSampleClause(StringInfo str, const TableSampleClause *node) + { + WRITE_NODE_TYPE("TABLESAMPLECLAUSE"); + ++#ifdef XCP ++ if (portable_output) ++ { ++ WRITE_FUNCID_FIELD(tsmhandler); ++ } ++ else ++ { ++#endif + WRITE_OID_FIELD(tsmhandler); ++#ifdef XCP ++ } ++#endif + WRITE_NODE_FIELD(args); + WRITE_NODE_FIELD(repeatable); + } + static void _outAExpr(StringInfo str, const A_Expr *node) { @@@ -4135,11 -2998,9 +4110,14 @@@ _outNode(StringInfo str, const void *ob case T_SeqScan: _outSeqScan(str, obj); break; +#ifdef PGXC + case T_RemoteQuery: + _outRemoteQuery(str, obj); + break; +#endif + case T_SampleScan: + _outSampleScan(str, obj); + break; case T_IndexScan: _outIndexScan(str, obj); break; diff --cc 
src/backend/nodes/readfuncs.c index 36fe01dc39,32b23fff09..980ccf2ce8 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@@ -2105,1464 -1351,140 +2026,1486 @@@ _readRangeTblFunction(void READ_DONE(); } + /* + * _readTableSampleClause + */ + static TableSampleClause * + _readTableSampleClause(void) + { + READ_LOCALS(TableSampleClause); + ++#ifdef XCP ++ if (portable_input) ++ { ++ READ_FUNCID_FIELD(tsmhandler); ++ } ++ else ++ { ++#endif + READ_OID_FIELD(tsmhandler); ++#ifdef XCP ++ } ++#endif + READ_NODE_FIELD(args); + READ_NODE_FIELD(repeatable); + + READ_DONE(); + } + +#ifdef XCP /* - * parseNodeString - * - * Given a character string representing a node tree, parseNodeString creates - * the internal node structure. - * - * The string to be read must already have been loaded into pg_strtok(). + * _readPlan */ -Node * -parseNodeString(void) +static Plan * +_readPlan(void) { - void *return_value; + READ_PLAN_FIELDS(Plan); - READ_TEMP_LOCALS(); + READ_DONE(); +} - token = pg_strtok(&length); - -#define MATCH(tokname, namelen) \ - (length == namelen && memcmp(token, tokname, namelen) == 0) +/* + * _readResult + */ +static Result * +_readResult(void) +{ + READ_PLAN_FIELDS(Result); - if (MATCH("QUERY", 5)) - return_value = _readQuery(); - else if (MATCH("WITHCHECKOPTION", 15)) - return_value = _readWithCheckOption(); - else if (MATCH("SORTGROUPCLAUSE", 15)) - return_value = _readSortGroupClause(); - else if (MATCH("GROUPINGSET", 11)) - return_value = _readGroupingSet(); - else if (MATCH("WINDOWCLAUSE", 12)) - return_value = _readWindowClause(); - else if (MATCH("ROWMARKCLAUSE", 13)) - return_value = _readRowMarkClause(); - else if (MATCH("COMMONTABLEEXPR", 15)) - return_value = _readCommonTableExpr(); - else if (MATCH("SETOPERATIONSTMT", 16)) - return_value = _readSetOperationStmt(); - else if (MATCH("ALIAS", 5)) - return_value = _readAlias(); - else if (MATCH("RANGEVAR", 8)) - return_value = _readRangeVar(); - else if (MATCH("INTOCLAUSE", 10)) - 
return_value = _readIntoClause(); - else if (MATCH("VAR", 3)) - return_value = _readVar(); - else if (MATCH("CONST", 5)) - return_value = _readConst(); - else if (MATCH("PARAM", 5)) - return_value = _readParam(); - else if (MATCH("AGGREF", 6)) - return_value = _readAggref(); - else if (MATCH("GROUPINGFUNC", 12)) - return_value = _readGroupingFunc(); - else if (MATCH("WINDOWFUNC", 10)) - return_value = _readWindowFunc(); - else if (MATCH("ARRAYREF", 8)) - return_value = _readArrayRef(); - else if (MATCH("FUNCEXPR", 8)) - return_value = _readFuncExpr(); - else if (MATCH("NAMEDARGEXPR", 12)) - return_value = _readNamedArgExpr(); - else if (MATCH("OPEXPR", 6)) - return_value = _readOpExpr(); - else if (MATCH("DISTINCTEXPR", 12)) - return_value = _readDistinctExpr(); - else if (MATCH("NULLIFEXPR", 10)) - return_value = _readNullIfExpr(); - else if (MATCH("SCALARARRAYOPEXPR", 17)) - return_value = _readScalarArrayOpExpr(); - else if (MATCH("BOOLEXPR", 8)) - return_value = _readBoolExpr(); - else if (MATCH("SUBLINK", 7)) - return_value = _readSubLink(); - else if (MATCH("FIELDSELECT", 11)) - return_value = _readFieldSelect(); - else if (MATCH("FIELDSTORE", 10)) - return_value = _readFieldStore(); - else if (MATCH("RELABELTYPE", 11)) - return_value = _readRelabelType(); - else if (MATCH("COERCEVIAIO", 11)) - return_value = _readCoerceViaIO(); - else if (MATCH("ARRAYCOERCEEXPR", 15)) - return_value = _readArrayCoerceExpr(); - else if (MATCH("CONVERTROWTYPEEXPR", 18)) - return_value = _readConvertRowtypeExpr(); - else if (MATCH("COLLATE", 7)) - return_value = _readCollateExpr(); - else if (MATCH("CASE", 4)) - return_value = _readCaseExpr(); - else if (MATCH("WHEN", 4)) - return_value = _readCaseWhen(); - else if (MATCH("CASETESTEXPR", 12)) - return_value = _readCaseTestExpr(); - else if (MATCH("ARRAY", 5)) - return_value = _readArrayExpr(); - else if (MATCH("ROW", 3)) - return_value = _readRowExpr(); - else if (MATCH("ROWCOMPARE", 10)) - return_value = _readRowCompareExpr(); 
- else if (MATCH("COALESCE", 8)) - return_value = _readCoalesceExpr(); - else if (MATCH("MINMAX", 6)) - return_value = _readMinMaxExpr(); - else if (MATCH("XMLEXPR", 7)) - return_value = _readXmlExpr(); - else if (MATCH("NULLTEST", 8)) - return_value = _readNullTest(); - else if (MATCH("BOOLEANTEST", 11)) - return_value = _readBooleanTest(); - else if (MATCH("COERCETODOMAIN", 14)) - return_value = _readCoerceToDomain(); - else if (MATCH("COERCETODOMAINVALUE", 19)) - return_value = _readCoerceToDomainValue(); - else if (MATCH("SETTODEFAULT", 12)) - return_value = _readSetToDefault(); - else if (MATCH("CURRENTOFEXPR", 13)) - return_value = _readCurrentOfExpr(); - else if (MATCH("INFERENCEELEM", 13)) - return_value = _readInferenceElem(); + READ_NODE_FIELD(resconstantqual); + + READ_DONE(); +} + + +/* + * _readModifyTable + */ +static ModifyTable * +_readModifyTable(void) +{ + READ_PLAN_FIELDS(ModifyTable); + + READ_ENUM_FIELD(operation, CmdType); + READ_BOOL_FIELD(canSetTag); + READ_UINT_FIELD(nominalRelation); + READ_NODE_FIELD(resultRelations); + READ_INT_FIELD(resultRelIndex); + READ_NODE_FIELD(plans); + READ_NODE_FIELD(withCheckOptionLists); + READ_NODE_FIELD(returningLists); + READ_NODE_FIELD(fdwPrivLists); + READ_NODE_FIELD(rowMarks); + READ_INT_FIELD(epqParam); + READ_ENUM_FIELD(onConflictAction, OnConflictAction); +#ifdef XCP + if (portable_input) + READ_RELID_LIST_FIELD(arbiterIndexes); + else +#endif + READ_NODE_FIELD(arbiterIndexes); + READ_NODE_FIELD(onConflictSet); + READ_NODE_FIELD(onConflictWhere); + READ_INT_FIELD(exclRelRTI); + READ_NODE_FIELD(exclRelTlist); + + READ_DONE(); +} + + +/* + * _readAppend + */ +static Append * +_readAppend(void) +{ + READ_PLAN_FIELDS(Append); + + READ_NODE_FIELD(appendplans); + + READ_DONE(); +} + + +/* + * _readMergeAppend + */ +static MergeAppend * +_readMergeAppend(void) +{ + int i; + READ_PLAN_FIELDS(MergeAppend); + + READ_NODE_FIELD(mergeplans); + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip 
:sortColIdx */ + local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->sortColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :sortOperators */ + local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator name */ + oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->sortOperators[i] = get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } + else + local_node->sortOperators[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :collations */ + local_node->collations = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + 
{ + char *nspname; /* namespace name */ + char *collname; /* collation name */ + int collencoding; /* collation encoding */ + /* the token is already read */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get collname */ + collname = nullable_string(token, length); + token = pg_strtok(&length); /* get nargs */ + collencoding = atoi(token); + if (collname) + local_node->collations[i] = get_collid(collname, + collencoding, + NSP_OID(nspname)); + else + local_node->collations[i] = InvalidOid; + } + else + local_node->collations[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :nullsFirst */ + local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->nullsFirst[i] = strtobool(token); + } + + READ_DONE(); +} + + +/* + * _readRecursiveUnion + */ +static RecursiveUnion * +_readRecursiveUnion(void) +{ + int i; + READ_PLAN_FIELDS(RecursiveUnion); + + READ_INT_FIELD(wtParam); + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip :dupColIdx */ + local_node->dupColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->dupColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :dupOperators */ + local_node->dupOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->dupOperators[i] = atooid(token); + } + + READ_LONG_FIELD(numGroups); + + READ_DONE(); +} + + +/* + * _readBitmapAnd + */ +static BitmapAnd * +_readBitmapAnd(void) +{ + READ_PLAN_FIELDS(BitmapAnd); + + READ_NODE_FIELD(bitmapplans); + + READ_DONE(); +} + + +/* + * _readBitmapOr + */ +static BitmapOr * +_readBitmapOr(void) +{ + READ_PLAN_FIELDS(BitmapOr); + + READ_NODE_FIELD(bitmapplans); + + READ_DONE(); +} + + +/* + * _readScan + */ 
+static Scan * +_readScan(void) +{ + READ_SCAN_FIELDS(Scan); + + READ_DONE(); +} + + +/* + * _readSeqScan + */ +static SeqScan * +_readSeqScan(void) +{ + READ_SCAN_FIELDS(SeqScan); + + READ_DONE(); +} + +/* + * _readSampleScan + */ +static SampleScan * +_readSampleScan(void) +{ + READ_SCAN_FIELDS(SampleScan); ++ READ_NODE_FIELD(tablesample); + + READ_DONE(); +} + +/* + * _readIndexScan + */ +static IndexScan * +_readIndexScan(void) +{ + READ_SCAN_FIELDS(IndexScan); + + if (portable_input) + READ_RELID_FIELD(indexid); + else + READ_OID_FIELD(indexid); + READ_NODE_FIELD(indexqual); + READ_NODE_FIELD(indexqualorig); + READ_NODE_FIELD(indexorderby); + READ_NODE_FIELD(indexorderbyorig); + READ_NODE_FIELD(indexorderbyops); + READ_ENUM_FIELD(indexorderdir, ScanDirection); + + READ_DONE(); +} + + +/* + * _readIndexOnlyScan + */ +static IndexOnlyScan * +_readIndexOnlyScan(void) +{ + READ_SCAN_FIELDS(IndexOnlyScan); + + if (portable_input) + READ_RELID_FIELD(indexid); + else + READ_OID_FIELD(indexid); + READ_NODE_FIELD(indexqual); + READ_NODE_FIELD(indexorderby); + READ_NODE_FIELD(indextlist); + READ_ENUM_FIELD(indexorderdir, ScanDirection); + + READ_DONE(); +} + + +/* + * _readBitmapIndexScan + */ +static BitmapIndexScan * +_readBitmapIndexScan(void) +{ + READ_SCAN_FIELDS(BitmapIndexScan); + + if (portable_input) + READ_RELID_FIELD(indexid); + else + READ_OID_FIELD(indexid); + READ_NODE_FIELD(indexqual); + READ_NODE_FIELD(indexqualorig); + + READ_DONE(); +} + + +/* + * _readBitmapHeapScan + */ +static BitmapHeapScan * +_readBitmapHeapScan(void) +{ + READ_SCAN_FIELDS(BitmapHeapScan); + + READ_NODE_FIELD(bitmapqualorig); + + READ_DONE(); +} + + +/* + * _readTidScan + */ +static TidScan * +_readTidScan(void) +{ + READ_SCAN_FIELDS(TidScan); + + READ_NODE_FIELD(tidquals); + + READ_DONE(); +} + + +/* + * _readSubqueryScan + */ +static SubqueryScan * +_readSubqueryScan(void) +{ + READ_SCAN_FIELDS(SubqueryScan); + + READ_NODE_FIELD(subplan); + + READ_DONE(); +} + + +/* + * 
_readFunctionScan + */ +static FunctionScan * +_readFunctionScan(void) +{ + READ_SCAN_FIELDS(FunctionScan); + + READ_NODE_FIELD(functions); + READ_BOOL_FIELD(funcordinality); + + READ_DONE(); +} + + +/* + * _readValuesScan + */ +static ValuesScan * +_readValuesScan(void) +{ + READ_SCAN_FIELDS(ValuesScan); + + READ_NODE_FIELD(values_lists); + + READ_DONE(); +} + + +/* + * _readCteScan + */ +static CteScan * +_readCteScan(void) +{ + READ_SCAN_FIELDS(CteScan); + + READ_INT_FIELD(ctePlanId); + READ_INT_FIELD(cteParam); + + READ_DONE(); +} + + +/* + * _readWorkTableScan + */ +static WorkTableScan * +_readWorkTableScan(void) +{ + READ_SCAN_FIELDS(WorkTableScan); + + READ_INT_FIELD(wtParam); + + READ_DONE(); +} + + +/* + * _readJoin + */ +static Join * +_readJoin(void) +{ + READ_JOIN_FIELDS(Join); + + READ_DONE(); +} + + +/* + * _readNestLoop + */ +static NestLoop * +_readNestLoop(void) +{ + READ_JOIN_FIELDS(NestLoop); + + READ_NODE_FIELD(nestParams); + + READ_DONE(); +} + + +/* + * _readMergeJoin + */ +static MergeJoin * +_readMergeJoin(void) +{ + int numCols; + int i; + READ_JOIN_FIELDS(MergeJoin); + + READ_NODE_FIELD(mergeclauses); + numCols = list_length(local_node->mergeclauses); + + + token = pg_strtok(&length); /* skip :mergeFamilies */ + local_node->mergeFamilies = (Oid *) palloc(numCols * sizeof(Oid)); + for (i = 0; i < numCols; i++) + { + token = pg_strtok(&length); + local_node->mergeFamilies[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :mergeCollations */ + local_node->mergeCollations = (Oid *) palloc(numCols * sizeof(Oid)); + for (i = 0; i < numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *collname; /* collation name */ + int collencoding; /* collation encoding */ + /* the token is already read */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get collname */ + collname = nullable_string(token, length); + token = pg_strtok(&length); /* get 
nargs */ + collencoding = atoi(token); + if (collname) + local_node->mergeCollations[i] = get_collid(collname, + collencoding, + NSP_OID(nspname)); + else + local_node->mergeCollations[i] = InvalidOid; + } + else + local_node->mergeCollations[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :mergeStrategies */ + local_node->mergeStrategies = (int *) palloc(numCols * sizeof(int)); + for (i = 0; i < numCols; i++) + { + token = pg_strtok(&length); + local_node->mergeStrategies[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :mergeNullsFirst */ + local_node->mergeNullsFirst = (bool *) palloc(numCols * sizeof(bool)); + for (i = 0; i < numCols; i++) + { + token = pg_strtok(&length); + local_node->mergeNullsFirst[i] = strtobool(token); + } + + READ_DONE(); +} + + +/* + * _readHashJoin + */ +static HashJoin * +_readHashJoin(void) +{ + READ_JOIN_FIELDS(HashJoin); + + READ_NODE_FIELD(hashclauses); + + READ_DONE(); +} + + +/* + * _readMaterial + */ +static Material * +_readMaterial(void) +{ + READ_PLAN_FIELDS(Material); + + READ_DONE(); +} + + +/* + * _readSort + */ +static Sort * +_readSort(void) +{ + int i; + READ_PLAN_FIELDS(Sort); + + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip :sortColIdx */ + local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->sortColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :sortOperators */ + local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + 
Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator name */ + oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->sortOperators[i] = get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } + else + local_node->sortOperators[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :collations */ + local_node->collations = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *collname; /* collation name */ + int collencoding; /* collation encoding */ + /* the token is already read */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get collname */ + collname = nullable_string(token, length); + token = pg_strtok(&length); /* get nargs */ + collencoding = atoi(token); + if (collname) + local_node->collations[i] = get_collid(collname, + collencoding, + NSP_OID(nspname)); + else + local_node->collations[i] = InvalidOid; + } + else + local_node->collations[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :nullsFirst */ + local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool)); + for (i = 0; i < local_node->numCols; 
i++) + { + token = pg_strtok(&length); + local_node->nullsFirst[i] = strtobool(token); + } + + READ_DONE(); +} + + +/* + * _readGroup + */ +static Group * +_readGroup(void) +{ + int i; + READ_PLAN_FIELDS(Group); + + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip :grpColIdx */ + local_node->grpColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->grpColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :grpOperators */ + local_node->grpOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator name */ + oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->grpOperators[i] = get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } + else + 
local_node->grpOperators[i] = atooid(token); + } + + READ_DONE(); +} + + +/* + * _readAgg + * Read an Agg plan node: aggstrategy, aggdistribution, numCols, the + * grpColIdx and grpOperators arrays (numCols entries each), then numGroups, + * groupingSets and chain. grpOperators entries are six name tokens + * resolved with get_operid() when portable_input is set, and single OID + * tokens otherwise. + */ +static Agg * +_readAgg(void) +{ + int i; + READ_PLAN_FIELDS(Agg); + + READ_ENUM_FIELD(aggstrategy, AggStrategy); + READ_ENUM_FIELD(aggdistribution, AggDistribution); + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip :grpColIdx */ + local_node->grpColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->grpColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :grpOperators */ + local_node->grpOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator name */ + oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->grpOperators[i] = get_operid(oprname, + oprleft, + 
oprright, + NSP_OID(nspname)); + } + else + local_node->grpOperators[i] = atooid(token); + } + + READ_LONG_FIELD(numGroups); + + READ_NODE_FIELD(groupingSets); + READ_NODE_FIELD(chain); + + READ_DONE(); +} + + +/* + * _readWindowAgg + * Read a WindowAgg plan node: winref, partNumCols with the partColIdx and + * partOperators arrays, ordNumCols with the ordColIdx and ordOperators + * arrays, then frameOptions, startOffset and endOffset. Operator entries + * are six name tokens resolved with get_operid() when portable_input is + * set, and single OID tokens otherwise. + */ +static WindowAgg * +_readWindowAgg(void) +{ + int i; + READ_PLAN_FIELDS(WindowAgg); + + READ_INT_FIELD(winref); + READ_INT_FIELD(partNumCols); + + token = pg_strtok(&length); /* skip :partColIdx */ + local_node->partColIdx = (AttrNumber *) palloc(local_node->partNumCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->partNumCols; i++) + { + token = pg_strtok(&length); + local_node->partColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :partOperators */ + local_node->partOperators = (Oid *) palloc(local_node->partNumCols * sizeof(Oid)); + for (i = 0; i < local_node->partNumCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator name */ + oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + 
NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->partOperators[i] = get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } + else + local_node->partOperators[i] = atooid(token); + } + + READ_INT_FIELD(ordNumCols); /* sizes the ordColIdx and ordOperators arrays read below */ + + token = pg_strtok(&length); /* skip :ordColIdx */ + local_node->ordColIdx = (AttrNumber *) palloc(local_node->ordNumCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->ordNumCols; i++) + { + token = pg_strtok(&length); + local_node->ordColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :ordOperators */ + local_node->ordOperators = (Oid *) palloc(local_node->ordNumCols * sizeof(Oid)); + for (i = 0; i < local_node->ordNumCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator name */ + oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->ordOperators[i] = get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } 
+ else + local_node->ordOperators[i] = atooid(token); + } + + READ_INT_FIELD(frameOptions); + READ_NODE_FIELD(startOffset); + READ_NODE_FIELD(endOffset); + + READ_DONE(); +} + + +/* + * _readUnique + * Read a Unique plan node: numCols, then the uniqColIdx and uniqOperators + * arrays, numCols entries each. uniqOperators entries are six name tokens + * resolved with get_operid() when portable_input is set, and single OID + * tokens otherwise. + */ +static Unique * +_readUnique(void) +{ + int i; + READ_PLAN_FIELDS(Unique); + + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip :uniqColIdx */ + local_node->uniqColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->uniqColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :uniqOperators */ + local_node->uniqOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator name */ + oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->uniqOperators[i] = 
get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } + else + local_node->uniqOperators[i] = atooid(token); + } + + READ_DONE(); +} + + +/* + * _readHash + * Read a Hash plan node: skewTable, skewColumn, skewInherit, skewColType + * and skewColTypmod. skewTable and skewColType are read with + * READ_RELID_FIELD / READ_TYPID_FIELD when portable_input is set, and with + * READ_OID_FIELD otherwise. + */ +static Hash * +_readHash(void) +{ + READ_PLAN_FIELDS(Hash); + + if (portable_input) + READ_RELID_FIELD(skewTable); + else + READ_OID_FIELD(skewTable); + READ_INT_FIELD(skewColumn); + READ_BOOL_FIELD(skewInherit); + if (portable_input) + READ_TYPID_FIELD(skewColType); + else + READ_OID_FIELD(skewColType); + READ_INT_FIELD(skewColTypmod); + + READ_DONE(); +} + + +/* + * _readSetOp + * Read a SetOp plan node: cmd, strategy, numCols, the dupColIdx and + * dupOperators arrays (numCols entries each), then flagColIdx, firstFlag + * and numGroups. + * + * NOTE(review): dupOperators entries are always read as single OID tokens, + * with no portable_input branch, unlike uniqOperators in _readUnique. + * Confirm that the matching output routine also writes plain OIDs here. + */ +static SetOp * +_readSetOp(void) +{ + int i; + READ_PLAN_FIELDS(SetOp); + + READ_ENUM_FIELD(cmd, SetOpCmd); + READ_ENUM_FIELD(strategy, SetOpStrategy); + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip :dupColIdx */ + local_node->dupColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->dupColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :dupOperators */ + local_node->dupOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->dupOperators[i] = atooid(token); + } + + READ_INT_FIELD(flagColIdx); + READ_INT_FIELD(firstFlag); + READ_LONG_FIELD(numGroups); + + READ_DONE(); +} + + +/* + * _readLimit + * Read a Limit plan node: limitOffset and limitCount. + */ +static Limit * +_readLimit(void) +{ + READ_PLAN_FIELDS(Limit); + + READ_NODE_FIELD(limitOffset); + READ_NODE_FIELD(limitCount); + + READ_DONE(); +} + + +/* + * _readRemoteSubplan + * Read a RemoteSubplan node: the fields handled by READ_SCAN_FIELDS, then + * distributionType, distributionKey, distributionNodes, + * distributionRestrict, nodeList, execOnAll, sort, cursor and unique. + */ +static RemoteSubplan * +_readRemoteSubplan(void) +{ + READ_SCAN_FIELDS(RemoteSubplan); + + READ_CHAR_FIELD(distributionType); + READ_INT_FIELD(distributionKey); + READ_NODE_FIELD(distributionNodes); + READ_NODE_FIELD(distributionRestrict); + READ_NODE_FIELD(nodeList); + READ_BOOL_FIELD(execOnAll); + READ_NODE_FIELD(sort); + READ_STRING_FIELD(cursor); + READ_INT_FIELD(unique); + + READ_DONE(); 
+} + + +/* + * _readRemoteStmt + * Read a RemoteStmt node: commandType, hasReturning, planTree, rtable, + * resultRelations, subplans, nParamExec, nParamRemote and the remoteparams + * array, then rowMarks and the distribution fields. + * + * remoteparams is allocated with nParamRemote entries when nParamRemote is + * greater than zero and set to NULL otherwise. Each entry carries + * paramkind, paramid and paramtype; paramtype is read as a namespace name + * and a type name when portable_input is set (InvalidOid if the type name + * is NULL), and as a single OID token otherwise. + */ +static RemoteStmt * +_readRemoteStmt(void) +{ + int i; + READ_LOCALS(RemoteStmt); + + READ_ENUM_FIELD(commandType, CmdType); + READ_BOOL_FIELD(hasReturning); + READ_NODE_FIELD(planTree); + READ_NODE_FIELD(rtable); + READ_NODE_FIELD(resultRelations); + READ_NODE_FIELD(subplans); + READ_INT_FIELD(nParamExec); + READ_INT_FIELD(nParamRemote); + if (local_node->nParamRemote > 0) + { + local_node->remoteparams = (RemoteParam *) palloc( + local_node->nParamRemote * sizeof(RemoteParam)); + for (i = 0; i < local_node->nParamRemote; i++) + { + RemoteParam *rparam = &(local_node->remoteparams[i]); + token = pg_strtok(&length); /* skip :paramkind */ + token = pg_strtok(&length); + rparam->paramkind = (ParamKind) atoi(token); + + token = pg_strtok(&length); /* skip :paramid */ + token = pg_strtok(&length); + rparam->paramid = atoi(token); + + token = pg_strtok(&length); /* skip :paramtype */ + if (portable_input) + { + char *nspname; /* namespace name */ + char *typname; /* data type name */ + token = pg_strtok(&length); /* get nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get typname */ + typname = nullable_string(token, length); + if (typname) + rparam->paramtype = get_typname_typid(typname, + NSP_OID(nspname)); + else + rparam->paramtype = InvalidOid; + } + else + { + token = pg_strtok(&length); + rparam->paramtype = atooid(token); + } + } + } + else + local_node->remoteparams = NULL; + + READ_NODE_FIELD(rowMarks); + READ_CHAR_FIELD(distributionType); + READ_INT_FIELD(distributionKey); + READ_NODE_FIELD(distributionNodes); + READ_NODE_FIELD(distributionRestrict); + + READ_DONE(); +} + + +/* + * _readSimpleSort + * Read a SimpleSort node: numCols, then the sortColIdx, sortOperators, + * sortCollations and nullsFirst arrays, numCols entries each. + */ +static SimpleSort * +_readSimpleSort(void) +{ + int i; + READ_LOCALS(SimpleSort); + + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip :sortColIdx */ + local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < 
local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->sortColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :sortOperators */ + local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator name */ + oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->sortOperators[i] = get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } + else + local_node->sortOperators[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :sortCollations; each entry that follows is a single OID token, or namespace, collation name and encoding tokens when portable_input is set */ + local_node->sortCollations = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *collname; /* collation name */ + int collencoding; /* collation 
encoding */ + /* the token is already read */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get collname */ + collname = nullable_string(token, length); + token = pg_strtok(&length); /* get collation encoding */ + collencoding = atoi(token); + if (collname) + local_node->sortCollations[i] = get_collid(collname, + collencoding, + NSP_OID(nspname)); + else + local_node->sortCollations[i] = InvalidOid; + } + else + local_node->sortCollations[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :nullsFirst */ + local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->nullsFirst[i] = strtobool(token); + } + + READ_DONE(); +} + + +/* + * _readNestLoopParam + * Read a NestLoopParam node: paramno and paramval. + */ +static NestLoopParam * +_readNestLoopParam(void) +{ + READ_LOCALS(NestLoopParam); + + READ_INT_FIELD(paramno); + READ_NODE_FIELD(paramval); + + READ_DONE(); +} + + +/* + * _readPlanRowMark + * Read a PlanRowMark node: rti, prti, rowmarkId, markType, allMarkTypes, + * strength, waitPolicy and isParent. + */ +static PlanRowMark * +_readPlanRowMark(void) +{ + READ_LOCALS(PlanRowMark); + + READ_UINT_FIELD(rti); + READ_UINT_FIELD(prti); + READ_UINT_FIELD(rowmarkId); + READ_ENUM_FIELD(markType, RowMarkType); + READ_INT_FIELD(allMarkTypes); + READ_ENUM_FIELD(strength, LockClauseStrength); + READ_ENUM_FIELD(waitPolicy, LockWaitPolicy); + READ_BOOL_FIELD(isParent); + + READ_DONE(); +} + +/* + * _readLockRows + * Read a LockRows plan node: rowMarks and epqParam. + */ +static LockRows * +_readLockRows(void) +{ + READ_PLAN_FIELDS(LockRows); + + READ_NODE_FIELD(rowMarks); + READ_INT_FIELD(epqParam); + + READ_DONE(); +} + +#endif /* XCP */ + + +/* + * parseNodeString + * + * Given a character string representing a node tree, parseNodeString creates + * the internal node structure. + * + * The string to be read must already have been loaded into pg_strtok(). 
+ */ +Node * +parseNodeString(void) +{ + void *return_value; + + READ_TEMP_LOCALS(); + + token = pg_strtok(&length); + +#define MATCH(tokname, namelen) \ + (length == namelen && memcmp(token, tokname, namelen) == 0) + + if (MATCH("QUERY", 5)) + return_value = _readQuery(); + else if (MATCH("WITHCHECKOPTION", 15)) + return_value = _readWithCheckOption(); + else if (MATCH("SORTGROUPCLAUSE", 15)) + return_value = _readSortGroupClause(); + else if (MATCH("GROUPINGSET", 11)) + return_value = _readGroupingSet(); + else if (MATCH("WINDOWCLAUSE", 12)) + return_value = _readWindowClause(); + else if (MATCH("ROWMARKCLAUSE", 13)) + return_value = _readRowMarkClause(); + else if (MATCH("COMMONTABLEEXPR", 15)) + return_value = _readCommonTableExpr(); - else if (MATCH("RANGETABLESAMPLE", 16)) - return_value = _readRangeTableSample(); - else if (MATCH("TABLESAMPLECLAUSE", 17)) - return_value = _readTableSampleClause(); + else if (MATCH("SETOPERATIONSTMT", 16)) + return_value = _readSetOperationStmt(); + else if (MATCH("ALIAS", 5)) + return_value = _readAlias(); + else if (MATCH("RANGEVAR", 8)) + return_value = _readRangeVar(); + else if (MATCH("INTOCLAUSE", 10)) + return_value = _readIntoClause(); + else if (MATCH("VAR", 3)) + return_value = _readVar(); + else if (MATCH("CONST", 5)) + return_value = _readConst(); + else if (MATCH("PARAM", 5)) + return_value = _readParam(); + else if (MATCH("AGGREF", 6)) + return_value = _readAggref(); + else if (MATCH("GROUPINGFUNC", 12)) + return_value = _readGroupingFunc(); + else if (MATCH("WINDOWFUNC", 10)) + return_value = _readWindowFunc(); + else if (MATCH("ARRAYREF", 8)) + return_value = _readArrayRef(); + else if (MATCH("FUNCEXPR", 8)) + return_value = _readFuncExpr(); + else if (MATCH("NAMEDARGEXPR", 12)) + return_value = _readNamedArgExpr(); + else if (MATCH("OPEXPR", 6)) + return_value = _readOpExpr(); + else if (MATCH("DISTINCTEXPR", 12)) + return_value = _readDistinctExpr(); + else if (MATCH("NULLIFEXPR", 10)) + return_value = 
_readNullIfExpr(); + else if (MATCH("SCALARARRAYOPEXPR", 17)) + return_value = _readScalarArrayOpExpr(); + else if (MATCH("BOOLEXPR", 8)) + return_value = _readBoolExpr(); + else if (MATCH("SUBLINK", 7)) + return_value = _readSubLink(); +#ifdef XCP + else if (MATCH("SUBPLAN", 7)) + return_value = _readSubPlan(); +#endif + else if (MATCH("FIELDSELECT", 11)) + return_value = _readFieldSelect(); + else if (MATCH("FIELDSTORE", 10)) + return_value = _readFieldStore(); + else if (MATCH("RELABELTYPE", 11)) + return_value = _readRelabelType(); + else if (MATCH("COERCEVIAIO", 11)) + return_value = _readCoerceViaIO(); + else if (MATCH("ARRAYCOERCEEXPR", 15)) + return_value = _readArrayCoerceExpr(); + else if (MATCH("CONVERTROWTYPEEXPR", 18)) + return_value = _readConvertRowtypeExpr(); + else if (MATCH("COLLATE", 7)) + return_value = _readCollateExpr(); + else if (MATCH("CASE", 4)) + return_value = _readCaseExpr(); + else if (MATCH("WHEN", 4)) + return_value = _readCaseWhen(); + else if (MATCH("CASETESTEXPR", 12)) + return_value = _readCaseTestExpr(); + else if (MATCH("ARRAY", 5)) + return_value = _readArrayExpr(); + else if (MATCH("ROW", 3)) + return_value = _readRowExpr(); + else if (MATCH("ROWCOMPARE", 10)) + return_value = _readRowCompareExpr(); + else if (MATCH("COALESCE", 8)) + return_value = _readCoalesceExpr(); + else if (MATCH("MINMAX", 6)) + return_value = _readMinMaxExpr(); + else if (MATCH("XMLEXPR", 7)) + return_value = _readXmlExpr(); + else if (MATCH("NULLTEST", 8)) + return_value = _readNullTest(); + else if (MATCH("BOOLEANTEST", 11)) + return_value = _readBooleanTest(); + else if (MATCH("COERCETODOMAIN", 14)) + return_value = _readCoerceToDomain(); + else if (MATCH("COERCETODOMAINVALUE", 19)) + return_value = _readCoerceToDomainValue(); + else if (MATCH("SETTODEFAULT", 12)) + return_value = _readSetToDefault(); + else if (MATCH("CURRENTOFEXPR", 13)) + return_value = _readCurrentOfExpr(); + else if (MATCH("INFERENCEELEM", 13)) + return_value = 
_readInferenceElem(); else if (MATCH("TARGETENTRY", 11)) return_value = _readTargetEntry(); else if (MATCH("RANGETBLREF", 11)) diff --cc src/backend/optimizer/path/allpaths.c index 91797b8f68,8fc1cfd15f..4f3996ea32 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@@ -22,8 -17,8 +22,9 @@@ #include +#include "catalog/pg_namespace.h" #include "access/sysattr.h" + #include "access/tsmapi.h" #include "catalog/pg_class.h" #include "catalog/pg_operator.h" #include "foreign/fdwapi.h" diff --cc src/backend/optimizer/prep/prepjointree.c index 121393702a,9bf1c662b5..d9c314ef22 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@@ -1100,12 -1100,8 +1104,11 @@@ pull_up_simple_subquery(PlannerInfo *ro case RTE_VALUES: child_rte->lateral = true; break; - case RTE_RELATION: case RTE_JOIN: case RTE_CTE: +#ifdef XCP + case RTE_REMOTE_DUMMY: +#endif /* these can't contain any lateral references */ break; } @@@ -1931,12 -1950,8 +1957,11 @@@ replace_vars_in_jointree(Node *jtnode pullup_replace_vars((Node *) rte->values_lists, context); break; - case RTE_RELATION: case RTE_JOIN: case RTE_CTE: +#ifdef XCP + case RTE_REMOTE_DUMMY: +#endif /* these shouldn't be marked LATERAL */ Assert(false); break; diff --cc src/backend/optimizer/util/pathnode.c index 027d28e261,4336ca1b78..b4a722e027 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@@ -1714,20 -726,7 +1714,20 @@@ create_samplescan_path(PlannerInfo *roo required_outer); pathnode->pathkeys = NIL; /* samplescan has unordered result */ +#ifdef XCP + set_scanpath_distribution(root, rel, pathnode); + if (rel->baserestrictinfo) + { + ListCell *lc; + foreach (lc, rel->baserestrictinfo) + { + RestrictInfo *ri = (RestrictInfo *) lfirst(lc); + restrict_distribution(root, ri, pathnode); + } + } +#endif + - cost_samplescan(pathnode, root, rel); + cost_samplescan(pathnode, root, rel, pathnode->param_info); return pathnode; } 
@@@ -3051,15 -1805,8 +3053,13 @@@ reparameterize_path(PlannerInfo *root, loop_count); } case T_SubqueryScan: +#ifdef XCP + return create_subqueryscan_path(root, rel, path->pathkeys, + required_outer, path->distribution); +#else return create_subqueryscan_path(root, rel, path->pathkeys, required_outer); +#endif - case T_SampleScan: - return (Path *) create_samplescan_path(root, rel, required_outer); default: break; } diff --cc src/backend/optimizer/util/pgxcship.c index db0a38a957,0000000000..a19ce71866 mode 100644,000000..100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@@ -1,2023 -1,0 +1,2027 @@@ +/*------------------------------------------------------------------------- + * + * pgxcship.c + * Routines to evaluate expression shippability to remote nodes + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 2010-2012, Postgres-XC Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/optimizer/util/pgxcship.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "catalog/pg_class.h" +#include "catalog/pg_inherits_fn.h" +#include "catalog/pg_namespace.h" +#include "catalog/pg_proc.h" +#ifdef PGXC +#include "catalog/pg_trigger.h" +#endif +#include "catalog/pg_type.h" +#include "catalog/pgxc_node.h" +#include "commands/trigger.h" +#include "nodes/nodeFuncs.h" +#include "nodes/relation.h" +#include "optimizer/clauses.h" +#include "optimizer/pgxcplan.h" +#include "optimizer/pgxcship.h" +#include "optimizer/tlist.h" +#include "parser/parsetree.h" +#include "parser/parse_coerce.h" +#include "parser/parse_type.h" +#include "pgxc/locator.h" +#include "pgxc/pgxcnode.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" + + +/* + * Shippability_context + * This context structure is used by the Fast Query Shipping walker, to gather + 
* information during analysing query for Fast Query Shipping. + */ +typedef struct +{ + bool sc_for_expr; /* if false, then we are checking shippability + * of the Query, otherwise, we are checking + * shippability of a stand-alone expression. + */ + Bitmapset *sc_shippability; /* The conditions for (un)shippability of the + * query, stored as a set of ShippabilityStat + * values. + */ + Query *sc_query; /* the query being analysed for FQS */ + int sc_query_level; /* level of the query */ + int sc_max_varlevelsup; /* maximum upper level referred to by any + * variable reference in the query. If this + * value is greater than 0, the query is not + * shippable, if shipped alone. + */ + ExecNodes *sc_exec_nodes; /* nodes where the query should be executed */ + ExecNodes *sc_subquery_en; /* ExecNodes produced by merging the ExecNodes + * for individual subqueries. This gets + * ultimately merged with sc_exec_nodes. + */ + bool sc_groupby_has_distcol; /* GROUP BY clause has distribution column */ +} Shippability_context; + +/* + * ShippabilityStat + * List of reasons why a query/expression is not shippable to remote nodes. + * The values are used as member numbers of the sc_shippability Bitmapset. + */ +typedef enum +{ + SS_UNSHIPPABLE_EXPR = 0, /* it has unshippable expression */ + SS_NEED_SINGLENODE, /* Has expressions which can be evaluated when + * there is only a single node involved. + * Although aggregates also fit in this class, we + * have a separate status to report aggregates, + * see below. 
+ */ + SS_NEEDS_COORD, /* the query needs Coordinator */ + SS_VARLEVEL, /* one of its subqueries has a VAR + * referencing an upper level query + * relation + */ + SS_NO_NODES, /* no suitable nodes can be found to ship + * the query + */ + SS_UNSUPPORTED_EXPR, /* it has expressions currently unsupported + * by FQS, but such expressions might be + * supported by FQS in future + */ + SS_HAS_AGG_EXPR, /* it has aggregate expressions */ + SS_UNSHIPPABLE_TYPE, /* the type of expression is unshippable */ + SS_UNSHIPPABLE_TRIGGER, /* the type of trigger is unshippable */ + SS_UPDATES_DISTRIBUTION_COLUMN /* query updates the distribution column */ +} ShippabilityStat; + +/* Manipulation of shippability reason */ +static bool pgxc_test_shippability_reason(Shippability_context *context, + ShippabilityStat reason); +static void pgxc_set_shippability_reason(Shippability_context *context, + ShippabilityStat reason); +static void pgxc_reset_shippability_reason(Shippability_context *context, + ShippabilityStat reason); + +/* Evaluation of shippability */ +static bool pgxc_shippability_walker(Node *node, Shippability_context *sc_context); +static void pgxc_set_exprtype_shippability(Oid exprtype, Shippability_context *sc_context); + +/* Fast-query shipping (FQS) functions */ +static ExecNodes *pgxc_FQS_get_relation_nodes(RangeTblEntry *rte, + Index varno, + Query *query); +static ExecNodes *pgxc_FQS_find_datanodes(Query *query); +static bool pgxc_query_needs_coord(Query *query); +static bool pgxc_query_contains_only_pg_catalog(List *rtable); +static bool pgxc_is_var_distrib_column(Var *var, List *rtable); +static bool pgxc_distinct_has_distcol(Query *query); +static bool pgxc_targetlist_has_distcol(Query *query); +static ExecNodes *pgxc_FQS_find_datanodes_recurse(Node *node, Query *query, + Bitmapset **relids); +static ExecNodes *pgxc_FQS_datanodes_for_rtr(Index varno, Query *query); + +/* + * pgxc_set_shippability_reason + * Set the given reason in Shippability_context indicating why the query can not be + * 
shipped directly to remote nodes. + */ +static void +pgxc_set_shippability_reason(Shippability_context *context, ShippabilityStat reason) +{ + context->sc_shippability = bms_add_member(context->sc_shippability, reason); +} + +/* + * pgxc_reset_shippability_reason + * Reset reason why the query cannot be shipped to remote nodes + */ +static void +pgxc_reset_shippability_reason(Shippability_context *context, ShippabilityStat reason) +{ + context->sc_shippability = bms_del_member(context->sc_shippability, reason); + return; +} + + +/* + * pgxc_test_shippability_reason + * See if a given reason is why the query can not be shipped directly + * to the remote nodes. + */ +static bool +pgxc_test_shippability_reason(Shippability_context *context, ShippabilityStat reason) +{ + return bms_is_member(reason, context->sc_shippability); +} + + +/* + * pgxc_set_exprtype_shippability + * Set the expression type shippability. A type whose underlying relation + * (as found by typeidTypeRelid) is a sequence, a view or a foreign table + * is flagged with SS_UNSHIPPABLE_TYPE; any other type is left untouched. + */ +static void +pgxc_set_exprtype_shippability(Oid exprtype, Shippability_context *sc_context) +{ + char typerelkind; + + typerelkind = get_rel_relkind(typeidTypeRelid(exprtype)); + + if (typerelkind == RELKIND_SEQUENCE || + typerelkind == RELKIND_VIEW || + typerelkind == RELKIND_FOREIGN_TABLE) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_TYPE); +} + +/* + * pgxc_FQS_datanodes_for_rtr + * For a given RangeTblRef find the datanodes where corresponding data is + * located. Returns NULL at once unless the RTE is a plain table + * (RELKIND_RELATION) that is not an inheritance parent with subclasses; + * otherwise returns the result of pgxc_FQS_get_relation_nodes(). + */ +static ExecNodes * +pgxc_FQS_datanodes_for_rtr(Index varno, Query *query) +{ + RangeTblEntry *rte = rt_fetch(varno, query->rtable); + switch (rte->rtekind) + { + case RTE_RELATION: + { + /* For anything, other than a table, we can't find the datanodes */ + if (rte->relkind != RELKIND_RELATION) + return NULL; + /* + * In case of inheritance, child tables can have completely different + * Datanode distribution than parent. To handle inheritance we need + * to merge the Datanodes of the child tables as well. 
The inheritance + * is resolved during planning, so we may not have the RTEs of the + * children here. Also, the exact method of merging Datanodes of the + * children is not known yet. So, when inheritance is requested, query + * can not be shipped. + * See prologue of has_subclass, we might miss on the optimization + * because has_subclass can return true even if there aren't any + * subclasses, but it's ok. + */ + if (rte->inh && has_subclass(rte->relid)) + return NULL; + + return pgxc_FQS_get_relation_nodes(rte, varno, query); + } + break; + + /* For any other type of RTE, we return NULL for now */ + case RTE_JOIN: + case RTE_CTE: + case RTE_SUBQUERY: + case RTE_FUNCTION: + case RTE_VALUES: + default: + return NULL; + } +} + +/* + * pgxc_FQS_find_datanodes_recurse + * Recursively find whether the sub-tree of From Expr rooted under given node is + * pushable and if yes where. + * + * Handles FromExpr, RangeTblRef and JoinExpr nodes; a NULL node or any other + * node type yields NULL. When a non-NULL result is returned, *relids holds + * the range table indexes of the relations found under node. + */ +static ExecNodes * +pgxc_FQS_find_datanodes_recurse(Node *node, Query *query, Bitmapset **relids) +{ + List *query_rtable = query->rtable; + + if (!node) + return NULL; + + switch(nodeTag(node)) + { + case T_FromExpr: + { + FromExpr *from_expr = (FromExpr *)node; + ListCell *lcell; + bool first; + Bitmapset *from_relids; + ExecNodes *result_en; + + /* + * For non-SELECT commands with an empty from list (e.g. INSERT), + * get the datanodes using the resultRelation index. + */ + if (query->commandType != CMD_SELECT && !from_expr->fromlist) + { + *relids = bms_make_singleton(query->resultRelation); + return pgxc_FQS_datanodes_for_rtr(query->resultRelation, + query); + } + + /* + * All the entries in the From list are considered to be INNER + * joined with the quals as the JOIN condition. Get the datanodes + * for the first entry in the From list. 
For every subsequent entry + * determine whether the join between the relation in that entry and + * the cumulative JOIN of previous entries can be pushed down to the + * datanodes and the corresponding set of datanodes where the join + * can be pushed down. + */ + first = true; + result_en = NULL; + from_relids = NULL; + foreach (lcell, from_expr->fromlist) + { + Node *fromlist_entry = lfirst(lcell); + Bitmapset *fle_relids = NULL; + ExecNodes *tmp_en; + ExecNodes *en = pgxc_FQS_find_datanodes_recurse(fromlist_entry, + query, &fle_relids); + /* + * If any entry in fromlist is not shippable, jointree is not + * shippable + */ + if (!en) + { + FreeExecNodes(&result_en); + return NULL; + } + + /* + * FQS doesn't ship a DML with more than one relation involved. + * NOTE(review): only result_en is freed on this path; en, the + * ExecNodes of the current entry, is not - confirm whether that + * matters. + */ + if (!first && query->commandType != CMD_SELECT) + { + FreeExecNodes(&result_en); + return NULL; + } + + if (first) + { + first = false; + result_en = en; + from_relids = fle_relids; + continue; + } + + tmp_en = result_en; + /* + * Check whether the JOIN is pushable to the datanodes and + * find the datanodes where the JOIN can be pushed to + */ + result_en = pgxc_is_join_shippable(result_en, en, from_relids, + fle_relids, JOIN_INNER, + make_ands_implicit((Expr *)from_expr->quals), + query_rtable); + from_relids = bms_join(from_relids, fle_relids); + FreeExecNodes(&tmp_en); + } + + *relids = from_relids; + return result_en; + } + break; + + case T_RangeTblRef: + { + RangeTblRef *rtr = (RangeTblRef *)node; + *relids = bms_make_singleton(rtr->rtindex); + return pgxc_FQS_datanodes_for_rtr(rtr->rtindex, query); + } + break; + + case T_JoinExpr: + { + JoinExpr *join_expr = (JoinExpr *)node; + Bitmapset *l_relids = NULL; + Bitmapset *r_relids = NULL; + ExecNodes *len; + ExecNodes *ren; + ExecNodes *result_en; + + /* FQS doesn't ship a DML with more than one relation involved */ + if (query->commandType != CMD_SELECT) + return NULL; + + len = pgxc_FQS_find_datanodes_recurse(join_expr->larg, query, + &l_relids); + ren = 
pgxc_FQS_find_datanodes_recurse(join_expr->rarg, query, + &r_relids); + /* If either side of JOIN is unshippable, JOIN is unshippable */ + if (!len || !ren) + { + FreeExecNodes(&len); + FreeExecNodes(&ren); + return NULL; + } + /* + * Check whether the JOIN is pushable or not, and find the datanodes + * where the JOIN can be pushed to. + */ + result_en = pgxc_is_join_shippable(ren, len, r_relids, l_relids, + join_expr->jointype, + make_ands_implicit((Expr *)join_expr->quals), + query_rtable); + FreeExecNodes(&len); + FreeExecNodes(&ren); + *relids = bms_join(l_relids, r_relids); + return result_en; + } + break; + + default: + *relids = NULL; + return NULL; + break; + } + /* Keep compiler happy */ + return NULL; +} + +/* + * pgxc_FQS_find_datanodes + * Find the list of nodes where to ship query. + */ +static ExecNodes * +pgxc_FQS_find_datanodes(Query *query) +{ + Bitmapset *relids = NULL; + ExecNodes *exec_nodes; + + /* + * For SELECT, the datanodes required to execute the query is obtained from + * the join tree of the query + */ + exec_nodes = pgxc_FQS_find_datanodes_recurse((Node *)query->jointree, + query, &relids); + bms_free(relids); + relids = NULL; + + /* If we found the datanodes to ship, use them */ + if (exec_nodes && exec_nodes->nodeList) + { + /* + * If relations involved in the query are such that ultimate JOIN is + * replicated JOIN, choose only one of them. If one of them is a + * preferred node choose that one, otherwise choose the first one. 
+ */ + if (IsLocatorReplicated(exec_nodes->baselocatortype) && + exec_nodes->accesstype == RELATION_ACCESS_READ) + { + List *tmp_list = exec_nodes->nodeList; + exec_nodes->nodeList = GetPreferredReplicationNode(exec_nodes->nodeList); + list_free(tmp_list); + } + return exec_nodes; + } + /* + * If we found the expression which can decide which can be used to decide + * where to ship the query, use that + */ + else if (exec_nodes && exec_nodes->en_expr) + return exec_nodes; + /* No way to figure out datanodes to ship the query to */ + return NULL; +} + + +/* + * pgxc_FQS_get_relation_nodes + * Return ExecNodes structure so as to decide which node the query should + * execute on. If it is possible to set the node list directly, set it. + * Otherwise set the appropriate distribution column expression or relid in + * ExecNodes structure. + */ +static ExecNodes * +pgxc_FQS_get_relation_nodes(RangeTblEntry *rte, Index varno, Query *query) +{ + CmdType command_type = query->commandType; + bool for_update = query->rowMarks ? true : false; + ExecNodes *rel_exec_nodes; + RelationAccessType rel_access = RELATION_ACCESS_READ; + RelationLocInfo *rel_loc_info; + + Assert(rte == rt_fetch(varno, (query->rtable))); + + switch (command_type) + { + case CMD_SELECT: + if (for_update) + rel_access = RELATION_ACCESS_READ_FOR_UPDATE; + else + rel_access = RELATION_ACCESS_READ; + break; + + case CMD_UPDATE: + case CMD_DELETE: + rel_access = RELATION_ACCESS_UPDATE; + break; + + case CMD_INSERT: + rel_access = RELATION_ACCESS_INSERT; + break; + + default: + /* should not happen, but */ + elog(ERROR, "Unrecognised command type %d", command_type); + break; + } + + rel_loc_info = GetRelationLocInfo(rte->relid); + /* If we don't know about the distribution of relation, bail out */ + if (!rel_loc_info) + return NULL; + + /* + * Find out the datanodes to execute this query on. + * PGXC_FQS_TODO: for now, we apply node reduction only when there is only + * one relation involved in the query. 
If there are multiple distributed + * tables in the query and we apply node reduction here, we may fail to ship + * the entire join. We should apply node reduction transitively. + */ + if (list_length(query->rtable) == 1) + rel_exec_nodes = GetRelationNodesByQuals(rte->relid, varno, + query->jointree->quals, rel_access); + else + rel_exec_nodes = GetRelationNodes(rel_loc_info, (Datum) 0, + true, rel_access); + + if (!rel_exec_nodes) + return NULL; + + if (rel_access == RELATION_ACCESS_INSERT && + IsRelationDistributedByValue(rel_loc_info)) + { + ListCell *lc; + TargetEntry *tle; + /* + * If the INSERT is happening on a table distributed by value of a + * column, find out the + * expression for distribution column in the targetlist, and stick in + * in ExecNodes, and clear the nodelist. Execution will find + * out where to insert the row. + */ + /* It is a partitioned table, get value by looking in targetList */ + foreach(lc, query->targetList) + { + tle = (TargetEntry *) lfirst(lc); + + if (tle->resjunk) + continue; + if (strcmp(tle->resname, GetRelationDistribColumn(rel_loc_info)) == 0) + break; + } + /* Not found, bail out */ + if (!lc) + return NULL; + + Assert(tle); + /* We found the TargetEntry for the partition column */ + list_free(rel_exec_nodes->primarynodelist); + rel_exec_nodes->primarynodelist = NULL; + list_free(rel_exec_nodes->nodeList); + rel_exec_nodes->nodeList = NULL; + rel_exec_nodes->en_expr = tle->expr; + rel_exec_nodes->en_relid = rel_loc_info->relid; + } + return rel_exec_nodes; +} + +bool +pgxc_query_has_distcolgrouping(Query *query) +{ + ListCell *lcell; + foreach (lcell, query->groupClause) + { + SortGroupClause *sgc = lfirst(lcell); + Node *sgc_expr; + if (!IsA(sgc, SortGroupClause)) + continue; + sgc_expr = get_sortgroupclause_expr(sgc, query->targetList); + if (IsA(sgc_expr, Var) && + pgxc_is_var_distrib_column((Var *)sgc_expr, query->rtable)) + return true; + } + return false; +} + +static bool +pgxc_distinct_has_distcol(Query *query) 
+{ + ListCell *lcell; + foreach (lcell, query->distinctClause) + { + SortGroupClause *sgc = lfirst(lcell); + Node *sgc_expr; + if (!IsA(sgc, SortGroupClause)) + continue; + sgc_expr = get_sortgroupclause_expr(sgc, query->targetList); + if (IsA(sgc_expr, Var) && + pgxc_is_var_distrib_column((Var *)sgc_expr, query->rtable)) + return true; + } + return false; +} + +/* + * pgxc_shippability_walker + * walks the query/expression tree routed at the node passed in, gathering + * information which will help decide whether the query to which this node + * belongs is shippable to the Datanodes. + * + * The function should try to walk the entire tree analysing each subquery for + * shippability. If a subquery is shippable but not the whole query, we would be + * able to create a RemoteQuery node for that subquery, shipping it to the + * Datanode. + * + * Return value of this function is governed by the same rules as + * expression_tree_walker(), see prologue of that function for details. + */ +static bool +pgxc_shippability_walker(Node *node, Shippability_context *sc_context) +{ + if (node == NULL) + return false; + + /* Below is the list of nodes that can appear in a query, examine each + * kind of node and find out under what conditions query with this node can + * be shippable. For each node, update the context (add fields if + * necessary) so that decision whether to FQS the query or not can be made. + * Every node which has a result is checked to see if the result type of that + * expression is shippable. + */ + switch(nodeTag(node)) + { + /* Constants are always shippable */ + case T_Const: + pgxc_set_exprtype_shippability(exprType(node), sc_context); + break; + + /* + * For placeholder nodes the shippability of the node, depends upon the + * expression which they refer to. It will be checked separately, when + * that expression is encountered. 
+ */ + case T_CaseTestExpr: + pgxc_set_exprtype_shippability(exprType(node), sc_context); + break; + + /* + * record_in() function throws error, thus requesting a result in the + * form of anonymous record from datanode gets into error. Hence, if the + * top expression of a target entry is ROW(), it's not shippable. + */ + case T_TargetEntry: + { + TargetEntry *tle = (TargetEntry *)node; + if (tle->expr) + { + char typtype = get_typtype(exprType((Node *)tle->expr)); + if (!typtype || typtype == TYPTYPE_PSEUDO) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + } + } + break; + + case T_SortGroupClause: + if (sc_context->sc_for_expr) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + break; + + case T_CoerceViaIO: + { + CoerceViaIO *cvio = (CoerceViaIO *)node; + Oid input_type = exprType((Node *)cvio->arg); + Oid output_type = cvio->resulttype; + CoercionContext cc; + + cc = cvio->coerceformat == COERCE_IMPLICIT_CAST ? COERCION_IMPLICIT : + COERCION_EXPLICIT; + /* + * Internally we use IO coercion for types which do not have casting + * defined for them e.g. cstring::date. If such casts are sent to + * the datanode, those won't be accepted. Hence such casts are + * unshippable. Since it will be shown as an explicit cast. + */ + if (!can_coerce_type(1, &input_type, &output_type, cc)) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + pgxc_set_exprtype_shippability(exprType(node), sc_context); + } + break; + /* + * Nodes, which are shippable if the tree rooted under these nodes is + * shippable + */ + case T_CoerceToDomainValue: + /* + * PGXCTODO: mostly, CoerceToDomainValue node appears in DDLs, + * do we handle DDLs here? + */ + case T_FieldSelect: + case T_NamedArgExpr: + case T_RelabelType: + case T_BoolExpr: + /* + * PGXCTODO: we might need to take into account the kind of boolean + * operator we have in the quals and see if the corresponding + * function is immutable. 
+ */ + case T_ArrayCoerceExpr: + case T_ConvertRowtypeExpr: + case T_CaseExpr: + case T_ArrayExpr: + case T_RowExpr: + case T_CollateExpr: + case T_CoalesceExpr: + case T_XmlExpr: + case T_NullTest: + case T_BooleanTest: + case T_CoerceToDomain: + pgxc_set_exprtype_shippability(exprType(node), sc_context); + break; + + case T_List: + case T_RangeTblRef: + break; + + case T_ArrayRef: + /* + * When multiple values of of an array are updated at once + * FQS planner cannot yet handle SQL representation correctly. + * So disable FQS in this case and let standard planner manage it. + */ + case T_FieldStore: + /* + * PostgreSQL deparsing logic does not handle the FieldStore + * for more than one fields (see processIndirection()). So, let's + * handle it through standard planner, where whole row will be + * constructed. + */ + case T_SetToDefault: + /* + * PGXCTODO: we should actually check whether the default value to + * be substituted is shippable to the Datanode. Some cases like + * nextval() of a sequence can not be shipped to the Datanode, hence + * for now default values can not be shipped to the Datanodes + */ + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + pgxc_set_exprtype_shippability(exprType(node), sc_context); + break; + + case T_Var: + { + Var *var = (Var *)node; + /* + * if a subquery references an upper level variable, that query is + * not shippable, if shipped alone. + */ + if (var->varlevelsup > sc_context->sc_max_varlevelsup) + sc_context->sc_max_varlevelsup = var->varlevelsup; + pgxc_set_exprtype_shippability(exprType(node), sc_context); + } + break; + + case T_Param: + { + Param *param = (Param *)node; + /* PGXCTODO: Can we handle internally generated parameters? 
*/ + if (param->paramkind != PARAM_EXTERN) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + pgxc_set_exprtype_shippability(exprType(node), sc_context); + } + break; + + case T_CurrentOfExpr: + { + /* + * Ideally we should not see CurrentOf expression here, it + * should have been replaced by the CTID = ? expression. But + * still, no harm in shipping it as is. + */ + pgxc_set_exprtype_shippability(exprType(node), sc_context); + } + break; + + case T_Aggref: + { + Aggref *aggref = (Aggref *)node; + /* + * An aggregate is completely shippable to the Datanode, if the + * whole group resides on that Datanode. This will be clear when + * we see the GROUP BY clause. + * agglevelsup is minimum of variable's varlevelsup, so we will + * set the sc_max_varlevelsup when we reach the appropriate + * VARs in the tree. + */ + pgxc_set_shippability_reason(sc_context, SS_HAS_AGG_EXPR); + /* + * If a stand-alone expression to be shipped, is an + * 1. aggregate with ORDER BY, DISTINCT directives, it needs all + * the qualifying rows + * 2. aggregate without collection function + * 3. (PGXCTODO:)aggregate with polymorphic transition type, the + * the transition type needs to be resolved to correctly interpret + * the transition results from Datanodes. + * Hence, such an expression can not be shipped to the datanodes. + */ + if (aggref->aggorder || + aggref->aggdistinct || + aggref->agglevelsup || + !aggref->agghas_collectfn || + IsPolymorphicType(aggref->aggtrantype)) + pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE); + + pgxc_set_exprtype_shippability(exprType(node), sc_context); + } + break; + + case T_FuncExpr: + { + FuncExpr *funcexpr = (FuncExpr *)node; + /* + * PGXC_FQS_TODO: it's too restrictive not to ship non-immutable + * functions to the Datanode. We need a better way to see what + * can be shipped to the Datanode and what can not be. 
+ */ + if (!pgxc_is_func_shippable(funcexpr->funcid)) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + + /* + * If this is a stand alone expression and the function returns a + * set of rows, we need to handle it along with the final result of + * other expressions. So, it can not be shippable. + */ + if (funcexpr->funcretset && sc_context->sc_for_expr) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + + pgxc_set_exprtype_shippability(exprType(node), sc_context); + } + break; + + case T_OpExpr: + case T_DistinctExpr: /* struct-equivalent to OpExpr */ + case T_NullIfExpr: /* struct-equivalent to OpExpr */ + { + /* + * All of these three are structurally equivalent to OpExpr, so + * cast the node to OpExpr and check if the operator function is + * immutable. See PGXC_FQS_TODO item for FuncExpr. + */ + OpExpr *op_expr = (OpExpr *)node; + Oid opfuncid = OidIsValid(op_expr->opfuncid) ? + op_expr->opfuncid : get_opcode(op_expr->opno); + if (!OidIsValid(opfuncid) || + !pgxc_is_func_shippable(opfuncid)) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + + pgxc_set_exprtype_shippability(exprType(node), sc_context); + } + break; + + case T_ScalarArrayOpExpr: + { + /* + * Check if the operator function is shippable to the Datanode + * PGXC_FQS_TODO: see immutability note for FuncExpr above + */ + ScalarArrayOpExpr *sao_expr = (ScalarArrayOpExpr *)node; + Oid opfuncid = OidIsValid(sao_expr->opfuncid) ? + sao_expr->opfuncid : get_opcode(sao_expr->opno); + if (!OidIsValid(opfuncid) || + !pgxc_is_func_shippable(opfuncid)) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + } + break; + + case T_RowCompareExpr: + case T_MinMaxExpr: + { + /* + * PGXCTODO should we be checking the comparision operator + * functions as well, as we did for OpExpr OR that check is + * unnecessary. Operator functions are always shippable? + * Otherwise this node should be treated similar to other + * "shell" nodes. 
+ */ + pgxc_set_exprtype_shippability(exprType(node), sc_context); + } + break; + + case T_Query: + { + Query *query = (Query *)node; + + /* PGXCTODO : If the query has a returning list, it is not shippable as of now */ + if (query->returningList) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + + /* A stand-alone expression containing Query is not shippable */ + if (sc_context->sc_for_expr) + { + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + break; + } + /* + * We are checking shippability of whole query, go ahead. The query + * in the context should be same as the query being checked + */ + Assert(query == sc_context->sc_query); + + /* CREATE TABLE AS is not supported in FQS */ + if (query->commandType == CMD_UTILITY && + IsA(query->utilityStmt, CreateTableAsStmt)) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + + if (query->hasRecursive) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + /* + * If the query needs Coordinator for evaluation or the query can be + * completed on Coordinator itself, we don't ship it to the Datanode + */ + if (pgxc_query_needs_coord(query)) + pgxc_set_shippability_reason(sc_context, SS_NEEDS_COORD); + + /* PGXCTODO: It should be possible to look at the Query and find out + * whether it can be completely evaluated on the Datanode just like SELECT + * queries. But we need to be careful while finding out the Datanodes to + * execute the query on, esp. for the result relations. If one happens to + * remove/change this restriction, make sure you change + * pgxc_FQS_get_relation_nodes appropriately. + * For now DMLs with single rtable entry are candidates for FQS + */ + if (query->commandType != CMD_SELECT && list_length(query->rtable) > 1) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + + /* + * In following conditions query is shippable when there is only one + * Datanode involved + * 1. 
the query has aggregagtes without grouping by distribution + * column + * 2. the query has window functions + * 3. the query has ORDER BY clause + * 4. the query has Distinct clause without distribution column in + * distinct clause + * 5. the query has limit and offset clause + */ + if (query->hasWindowFuncs || query->sortClause || + query->limitOffset || query->limitCount) + pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE); + + /* + * Presence of aggregates or having clause, implies grouping. In + * such cases, the query won't be shippable unless 1. there is only + * a single node involved 2. GROUP BY clause has distribution column + * in it. In the later case aggregates for a given group are entirely + * computable on a single datanode, because all the rows + * participating in particular group reside on that datanode. + * The distribution column can be of any relation + * participating in the query. All the rows of that relation with + * the same value of distribution column reside on same node. + */ + if ((query->hasAggs || query->havingQual) && + !pgxc_query_has_distcolgrouping(query)) + pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE); + + /* + * If distribution column of any relation is present in the distinct + * clause, values for that column across nodes will differ, thus two + * nodes won't be able to produce same result row. Hence in such + * case, we can execute the queries on many nodes managing to have + * distinct result. + */ + if (query->distinctClause && !pgxc_distinct_has_distcol(query)) + pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE); + + + if ((query->commandType == CMD_UPDATE) && + pgxc_targetlist_has_distcol(query)) + pgxc_set_shippability_reason(sc_context, SS_UPDATES_DISTRIBUTION_COLUMN); + + + /* + * walk the entire query tree to analyse the query. We will walk the + * range table, when examining the FROM clause. 
No need to do it + * here + */ + if (query_tree_walker(query, pgxc_shippability_walker, + sc_context, QTW_IGNORE_RANGE_TABLE )) + return true; + + /* + * PGXC_FQS_TODO: + * There is a subquery in this query, which references Vars in the upper + * query. For now stop shipping such queries. We should get rid of this + * condition. + */ + if (sc_context->sc_max_varlevelsup != 0) + pgxc_set_shippability_reason(sc_context, SS_VARLEVEL); + + /* + * Walk the join tree of the query and find the + * Datanodes needed for evaluating this query + */ + sc_context->sc_exec_nodes = pgxc_FQS_find_datanodes(query); + } + break; + + case T_FromExpr: + { + /* We don't expect FromExpr in a stand-alone expression */ + if (sc_context->sc_for_expr) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + + /* + * We will examine the jointree of query separately to determine the + * set of datanodes where to execute the query. + * If this is an INSERT query with quals, resulting from say + * conditional rule, we can not handle those in FQS, since there is + * not SQL representation for such quals. + */ + if (sc_context->sc_query->commandType == CMD_INSERT && + ((FromExpr *)node)->quals) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + + } + break; + + case T_WindowFunc: + { + WindowFunc *winf = (WindowFunc *)node; + /* + * A window function can be evaluated on a Datanode if there is + * only one Datanode involved. + */ + pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE); + + /* + * A window function is not shippable as part of a stand-alone + * expression. If the window function is non-immutable, it can not + * be shipped to the datanodes. 
+ */ + if (sc_context->sc_for_expr || + !pgxc_is_func_shippable(winf->winfnoid)) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + + pgxc_set_exprtype_shippability(exprType(node), sc_context); + } + break; ++ ++ case T_GroupingFunc: ++ pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE); ++ break; + + case T_WindowClause: + { + /* + * A window function can be evaluated on a Datanode if there is + * only one Datanode involved. + */ + pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE); + + /* + * A window function is not shippable as part of a stand-alone + * expression + */ + if (sc_context->sc_for_expr) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + } + break; + + case T_JoinExpr: + /* We don't expect JoinExpr in a stand-alone expression */ + if (sc_context->sc_for_expr) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + + /* + * The shippability of join will be deduced while + * examining the jointree of the query. Nothing to do here + */ + break; + + case T_SubLink: + { + /* + * We need to walk the tree in sublink to check for its + * shippability. We need to call pgxc_is_query_shippable() on Query + * instead of this function so that every subquery gets a different + * context for itself. We should avoid the default expression walker + * getting called on the subquery. At the same time we don't want to + * miss any other member (current or future) of this structure, from + * being scanned. So, copy the SubLink structure with subselect + * being NULL and call expression_tree_walker on the copied + * structure. + */ + SubLink sublink = *(SubLink *)node; + ExecNodes *sublink_en; + /* + * Walk the query and find the nodes where the query should be + * executed and node distribution. Merge this with the existing + * node list obtained for other subqueries. If merging fails, we + * can not ship the whole query. 
+ */ + if (IsA(sublink.subselect, Query)) + sublink_en = pgxc_is_query_shippable((Query *)(sublink.subselect), + sc_context->sc_query_level); + else + sublink_en = NULL; + + /* PGXCTODO free the old sc_subquery_en. */ + /* If we already know that this query does not have a set of nodes + * to evaluate on, don't bother to merge again. + */ + if (!pgxc_test_shippability_reason(sc_context, SS_NO_NODES)) + { + /* + * If this is the first time we are finding out the nodes for + * SubLink, we don't have anything to merge, just assign. + */ + if (!sc_context->sc_subquery_en) + sc_context->sc_subquery_en = sublink_en; + /* + * Merge if only the accumulated SubLink ExecNodes and the + * ExecNodes for this subquery are both replicated. + */ + else if (sublink_en && IsExecNodesReplicated(sublink_en) && + IsExecNodesReplicated(sc_context->sc_subquery_en)) + { + sc_context->sc_subquery_en = pgxc_merge_exec_nodes(sublink_en, + sc_context->sc_subquery_en); + } + else + sc_context->sc_subquery_en = NULL; + + /* + * If we didn't find a cumulative ExecNodes, set shippability + * reason, so that we don't bother merging future sublinks. 
+ */ + if (!sc_context->sc_subquery_en) + pgxc_set_shippability_reason(sc_context, SS_NO_NODES); + } + else + Assert(!sc_context->sc_subquery_en); + + /* Check if the type of sublink result is shippable */ + pgxc_set_exprtype_shippability(exprType(node), sc_context); + + /* Wipe out subselect as explained above and walk the copied tree */ + sublink.subselect = NULL; + return expression_tree_walker((Node *)&sublink, pgxc_shippability_walker, + sc_context); + } + break; + + case T_SubPlan: + case T_AlternativeSubPlan: + case T_CommonTableExpr: + case T_SetOperationStmt: + case T_PlaceHolderVar: + case T_AppendRelInfo: + case T_PlaceHolderInfo: + case T_OnConflictExpr: + case T_WithCheckOption: + { + /* PGXCTODO: till we exhaust this list */ + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + /* + * These expressions are not supported for shippability entirely, so + * there is no need to walk trees underneath those. If we do so, we + * might walk the trees with wrong context there. + */ + return false; + } + break; + + default: + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(node)); + break; + } + + return expression_tree_walker(node, pgxc_shippability_walker, (void *)sc_context); +} + + +/* + * pgxc_query_needs_coord + * Check if the query needs Coordinator for evaluation or it can be completely + * evaluated on Coordinator. Return true if so, otherwise return false. + */ +static bool +pgxc_query_needs_coord(Query *query) +{ + /* + * If the query involves just the catalog tables, and is not an EXEC DIRECT + * statement, it can be evaluated completely on the Coordinator. No need to + * involve Datanodes. + */ + if (pgxc_query_contains_only_pg_catalog(query->rtable)) + return true; + + return false; +} + + +/* + * pgxc_is_var_distrib_column + * Check if given var is a distribution key. 
+ */ +static +bool pgxc_is_var_distrib_column(Var *var, List *rtable) +{ + RangeTblEntry *rte = rt_fetch(var->varno, rtable); + RelationLocInfo *rel_loc_info; + + /* distribution column only applies to the relations */ + if (rte->rtekind != RTE_RELATION || + rte->relkind != RELKIND_RELATION) + return false; + rel_loc_info = GetRelationLocInfo(rte->relid); + if (!rel_loc_info) + return false; + if (var->varattno == rel_loc_info->partAttrNum) + return true; + return false; +} + + +/* + * Returns whether or not the rtable (and its subqueries) + * only contain pg_catalog entries. + */ +static bool +pgxc_query_contains_only_pg_catalog(List *rtable) +{ + ListCell *item; + + /* May be complicated. Before giving up, just check for pg_catalog usage */ + foreach(item, rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(item); + + if (rte->rtekind == RTE_RELATION) + { + if (get_rel_namespace(rte->relid) != PG_CATALOG_NAMESPACE) + return false; + } + else if (rte->rtekind == RTE_SUBQUERY && + !pgxc_query_contains_only_pg_catalog(rte->subquery->rtable)) + return false; + } + return true; +} + + +/* + * pgxc_is_query_shippable + * This function calls the query walker to analyse the query to gather + * information like Constraints under which the query can be shippable, nodes + * on which the query is going to be executed etc. + * Based on the information gathered, it decides whether the query can be + * executed on Datanodes directly without involving Coordinator. + * If the query is shippable this routine also returns the nodes where the query + * should be shipped. If the query is not shippable, it returns NULL. 
+ */ +ExecNodes * +pgxc_is_query_shippable(Query *query, int query_level) +{ + Shippability_context sc_context; + ExecNodes *exec_nodes; + bool canShip = true; + Bitmapset *shippability; + + memset(&sc_context, 0, sizeof(sc_context)); + /* let's assume that by default query is shippable */ + sc_context.sc_query = query; + sc_context.sc_query_level = query_level; + sc_context.sc_for_expr = false; + + /* + * We might have already decided not to ship the query to the Datanodes, but + * still walk it anyway to find out if there are any subqueries which can be + * shipped. + */ + pgxc_shippability_walker((Node *)query, &sc_context); + + exec_nodes = sc_context.sc_exec_nodes; + /* + * The shippability context contains two ExecNodes, one for the subLinks + * involved in the Query and other for the relation involved in FromClause. + * They are computed at different times while scanning the query. Merge both + * of them if they are both replicated. If query doesn't have SubLinks, we + * don't need to consider corresponding ExecNodes. + * PGXC_FQS_TODO: + * Merge the subquery ExecNodes if both of them are replicated. + * The logic to merge node lists with other distribution + * strategy is not clear yet. + */ + if (query->hasSubLinks) + { + if (exec_nodes && IsExecNodesReplicated(exec_nodes) && + sc_context.sc_subquery_en && + IsExecNodesReplicated(sc_context.sc_subquery_en)) + exec_nodes = pgxc_merge_exec_nodes(exec_nodes, + sc_context.sc_subquery_en); + else + exec_nodes = NULL; + } + + /* + * Look at the information gathered by the walker in Shippability_context and that + * in the Query structure to decide whether we should ship this query + * directly to the Datanode or not + */ + + /* + * If the planner was not able to find the Datanodes to the execute the + * query, the query is not completely shippable. So, return NULL + */ + if (!exec_nodes) + return NULL; + + /* Copy the shippability reasons. We modify the copy for easier handling. 
+ * The original can be saved away */ + shippability = bms_copy(sc_context.sc_shippability); + + /* + * If the query has an expression which renders the shippability to single + * node, and query needs to be shipped to more than one node, it can not be + * shipped + */ + if (bms_is_member(SS_NEED_SINGLENODE, shippability)) + { + /* + * if nodeList has no nodes, it ExecNodes will have other means to know + * the nodes where to execute like distribution column expression. We + * can't tell how many nodes the query will be executed on, hence treat + * that as multiple nodes. + */ + if (list_length(exec_nodes->nodeList) != 1) + canShip = false; + + /* We handled the reason here, reset it */ + shippability = bms_del_member(shippability, SS_NEED_SINGLENODE); + } + + /* + * If HAS_AGG_EXPR is set but NEED_SINGLENODE is not set, it means the + * aggregates are entirely shippable, so don't worry about it. + */ + shippability = bms_del_member(shippability, SS_HAS_AGG_EXPR); + + /* Can not ship the query for some reason */ + if (!bms_is_empty(shippability)) + canShip = false; + + /* Always keep this at the end before checking canShip and return */ + if (!canShip && exec_nodes) + FreeExecNodes(&exec_nodes); + /* If query is to be shipped, we should know where to execute the query */ + Assert (!canShip || exec_nodes); + + bms_free(shippability); + shippability = NULL; + + return exec_nodes; +} + + +/* + * pgxc_is_expr_shippable + * Check whether the given expression can be shipped to datanodes. + * + * Note on has_aggs + * The aggregate expressions are not shippable if they can not be completely + * evaluated on a single datanode. But this function does not have enough + * context to determine the set of datanodes where the expression will be + * evaluated. Hence, the caller of this function can handle aggregate + * expressions, it passes a non-NULL value for has_aggs. This function returns + * whether the expression has any aggregates or not through this argument. 
If a + * caller passes NULL value for has_aggs, this function assumes that the caller + * can not handle the aggregates and deems the expression has unshippable. + */ +bool +pgxc_is_expr_shippable(Expr *node, bool *has_aggs) +{ + Shippability_context sc_context; + + /* Create the FQS context */ + memset(&sc_context, 0, sizeof(sc_context)); + sc_context.sc_query = NULL; + sc_context.sc_query_level = 0; + sc_context.sc_for_expr = true; + + /* Walk the expression to check its shippability */ + pgxc_shippability_walker((Node *)node, &sc_context); + + /* + * If caller is interested in knowing, whether the expression has aggregates + * let the caller know about it. The caller is capable of handling such + * expressions. Otherwise assume such an expression as not shippable. + */ + if (has_aggs) + *has_aggs = pgxc_test_shippability_reason(&sc_context, SS_HAS_AGG_EXPR); + else if (pgxc_test_shippability_reason(&sc_context, SS_HAS_AGG_EXPR)) + return false; + /* Done with aggregate expression shippability. Delete the status */ + pgxc_reset_shippability_reason(&sc_context, SS_HAS_AGG_EXPR); + + /* If there are reasons why the expression is unshippable, return false */ + if (!bms_is_empty(sc_context.sc_shippability)) + return false; + + /* If nothing wrong found, the expression is shippable */ + return true; +} + + +/* + * pgxc_is_func_shippable + * Determine if a function is shippable + */ +bool +pgxc_is_func_shippable(Oid funcid) +{ + /* + * For the time being a function is thought as shippable + * only if it is immutable. 
+ */ + return func_volatile(funcid) == PROVOLATILE_IMMUTABLE; +} + + +/* + * pgxc_find_dist_equijoin_qual + * Check equijoin conditions on given relations + */ +Expr * +pgxc_find_dist_equijoin_qual(Relids varnos_1, + Relids varnos_2, Oid distcol_type, Node *quals, List *rtable) +{ + List *lquals; + ListCell *qcell; + + /* If no quals, no equijoin */ + if (!quals) + return false; + /* + * Make a copy of the argument bitmaps, it will be modified by + * bms_first_member(). + */ + varnos_1 = bms_copy(varnos_1); + varnos_2 = bms_copy(varnos_2); + + if (!IsA(quals, List)) + lquals = make_ands_implicit((Expr *)quals); + else + lquals = (List *)quals; + + foreach(qcell, lquals) + { + Expr *qual_expr = (Expr *)lfirst(qcell); + OpExpr *op; + Var *lvar; + Var *rvar; + + if (!IsA(qual_expr, OpExpr)) + continue; + op = (OpExpr *)qual_expr; + /* If not a binary operator, it can not be '='. */ + if (list_length(op->args) != 2) + continue; + + /* + * Check if both operands are Vars, if not check next expression */ + if (IsA(linitial(op->args), Var) && IsA(lsecond(op->args), Var)) + { + lvar = (Var *)linitial(op->args); + rvar = (Var *)lsecond(op->args); + } + else + continue; + + /* + * If the data types of both the columns are not same, continue. Hash + * and Modulo of a the same bytes will be same if the data types are + * same. So, only when the data types of the columns are same, we can + * ship a distributed JOIN to the Datanodes + */ + if (exprType((Node *)lvar) != exprType((Node *)rvar)) + continue; + + /* if the vars do not correspond to the required varnos, continue. */ + if ((bms_is_member(lvar->varno, varnos_1) && bms_is_member(rvar->varno, varnos_2)) || + (bms_is_member(lvar->varno, varnos_2) && bms_is_member(rvar->varno, varnos_1))) + { + if (!pgxc_is_var_distrib_column(lvar, rtable) || + !pgxc_is_var_distrib_column(rvar, rtable)) + continue; + } + else + continue; + /* + * If the operator is not an assignment operator, check next + * constraint. 
An operator is an assignment operator if it's + * mergejoinable or hashjoinable. Beware that not every assignment + * operator is mergejoinable or hashjoinable, so we might leave some + * oportunity. But then we have to rely on the opname which may not + * be something we know to be equality operator as well. + */ + if (!op_mergejoinable(op->opno, exprType((Node *)lvar)) && + !op_hashjoinable(op->opno, exprType((Node *)lvar))) + continue; + /* Found equi-join condition on distribution columns */ + return qual_expr; + } + return NULL; +} + + +/* + * pgxc_merge_exec_nodes + * The routine combines the two exec_nodes passed such that the resultant + * exec_node corresponds to the JOIN of respective relations. + * If both exec_nodes can not be merged, it returns NULL. + */ +ExecNodes * +pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2) +{ + ExecNodes *merged_en = makeNode(ExecNodes); + ExecNodes *tmp_en; + + /* If either of exec_nodes are NULL, return the copy of other one */ + if (!en1) + { + tmp_en = copyObject(en2); + return tmp_en; + } + if (!en2) + { + tmp_en = copyObject(en1); + return tmp_en; + } + + /* Following cases are not handled in this routine */ + /* PGXC_FQS_TODO how should we handle table usage type? */ + if (en1->primarynodelist || en2->primarynodelist || + en1->en_expr || en2->en_expr || + OidIsValid(en1->en_relid) || OidIsValid(en2->en_relid) || + en1->accesstype != RELATION_ACCESS_READ || en2->accesstype != RELATION_ACCESS_READ) + return NULL; + + if (IsExecNodesReplicated(en1) && + IsExecNodesReplicated(en2)) + { + /* + * Replicated/replicated join case + * Check that replicated relation is not disjoint + * with initial relation which is also replicated. + * If there is a common portion of the node list between + * the two relations, other rtables have to be checked on + * this restricted list. 
+		 */
+		merged_en->nodeList = list_intersection_int(en1->nodeList,
+													en2->nodeList);
+		merged_en->baselocatortype = LOCATOR_TYPE_REPLICATED;
+		/*
+		 * No node in common: give up the merged node.
+		 * NOTE(review): the "return merged_en" that follows relies on
+		 * FreeExecNodes() resetting the pointer to NULL -- confirm.
+		 */
+		if (!merged_en->nodeList)
+			FreeExecNodes(&merged_en);
+		return merged_en;
+	}
+
+	if (IsExecNodesReplicated(en1) &&
+		IsExecNodesColumnDistributed(en2))
+	{
+		List	*diff_nodelist = NULL;
+		/*
+		 * Replicated/distributed join case.
+		 * Node list of distributed table has to be included
+		 * in node list of replicated table.
+		 */
+		diff_nodelist = list_difference_int(en2->nodeList, en1->nodeList);
+		/*
+		 * If the difference list is not empty, this means that node list of
+		 * distributed table is not completely mapped by node list of replicated
+		 * table, so go through standard planner.
+		 */
+		if (diff_nodelist)
+			FreeExecNodes(&merged_en);
+		else
+		{
+			/* The join runs wherever the distributed side (en2) lives */
+			merged_en->nodeList = list_copy(en2->nodeList);
+			merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED;
+		}
+		return merged_en;
+	}
+
+	if (IsExecNodesColumnDistributed(en1) &&
+		IsExecNodesReplicated(en2))
+	{
+		List	*diff_nodelist = NULL;
+		/*
+		 * Distributed/replicated join case.
+		 * Node list of distributed table has to be included
+		 * in node list of replicated table.
+		 */
+		diff_nodelist = list_difference_int(en1->nodeList, en2->nodeList);
+
+		/*
+		 * If the difference list is not empty, this means that node list of
+		 * distributed table is not completely mapped by node list of replicated
+		 * table, so go through standard planner.
+		 */
+		if (diff_nodelist)
+			FreeExecNodes(&merged_en);
+		else
+		{
+			/* The join runs wherever the distributed side (en1) lives */
+			merged_en->nodeList = list_copy(en1->nodeList);
+			merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED;
+		}
+		return merged_en;
+	}
+
+	if (IsExecNodesColumnDistributed(en1) &&
+		IsExecNodesColumnDistributed(en2))
+	{
+		/*
+		 * Distributed/distributed case
+		 * If the caller has suggested that this is an equi-join between two
+		 * distributed results, check that they have the same nodes in the distribution
+		 * node list. The caller is expected to fully decide whether to merge
+		 * the nodes or not.
+		 */
+		/* Both differences empty means the two node lists hold the same nodes */
+		if (!list_difference_int(en1->nodeList, en2->nodeList) &&
+			!list_difference_int(en2->nodeList, en1->nodeList))
+		{
+			merged_en->nodeList = list_copy(en1->nodeList);
+			/* Keep the specific locator type only when both sides agree on it */
+			if (en1->baselocatortype == en2->baselocatortype)
+				merged_en->baselocatortype = en1->baselocatortype;
+			else
+				merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED;
+		}
+		else
+			FreeExecNodes(&merged_en);
+		return merged_en;
+	}
+
+	/* Any other combination of distribution types is rejected outright */
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("Postgres-XC does not support this distribution type yet"),
+			 errdetail("The feature is not currently supported")));
+
+	/* Keep compiler happy */
+	return NULL;
+}
+
+
+/*
+ * pgxc_check_index_shippability
+ * Check shippability of index described by given conditions. This generic
+ * function can be called even if the index is not yet defined.
+ *
+ * relLocInfo: locator information of the parent relation; when NULL the
+ *		index is reported as shippable
+ * is_primary, is_unique, is_exclusion: kind of constraint backed by the index
+ * indexAttrs: integer list searched for the parent's partAttrNum
+ * indexExprs: list of index expressions
+ *
+ * Returns true if the index is considered shippable, false otherwise.
+ */
+bool
+pgxc_check_index_shippability(RelationLocInfo *relLocInfo,
+							  bool is_primary,
+							  bool is_unique,
+							  bool is_exclusion,
+							  List *indexAttrs,
+							  List *indexExprs)
+{
+	bool		result = true;
+	ListCell   *lc;
+
+	/*
+	 * Leave if no locator information, in this case shippability has no
+	 * meaning.
+	 */
+	if (!relLocInfo)
+		return result;
+
+	/*
+	 * Scan the expressions used in index and check the shippability of each
+	 * of them. If only one is not-shippable, the index is considered as non
+	 * shippable. It is important to check the shippability of the expressions
+	 * before refining scan on the index columns and distribution type of
+	 * parent relation.
+	 */
+	foreach(lc, indexExprs)
+	{
+		if (!pgxc_is_expr_shippable((Expr *) lfirst(lc), NULL))
+		{
+			/* One of the expressions is not shippable, so leave */
+			result = false;
+			goto finish;
+		}
+	}
+
+	/*
+	 * Check if relation is distributed on a single node, in this case
+	 * the constraint can be shipped in all the cases.
+	 */
+	if (list_length(relLocInfo->nodeList) == 1)
+		return result;
+
+	/*
+	 * Check the case of EXCLUSION index.
+	 * EXCLUSION constraints are shippable only for replicated relations as
+	 * such constraints need that one tuple is checked on all the others, and
+	 * if this tuple is correctly excluded of the others, the constraint is
+	 * verified.
+	 */
+	if (is_exclusion)
+	{
+		if (!IsRelationReplicated(relLocInfo))
+		{
+			result = false;
+			goto finish;
+		}
+	}
+
+	/*
+	 * Check the case of PRIMARY KEY INDEX and UNIQUE index.
+	 * Those constraints are shippable if the parent relation is replicated
+	 * or if the index has no expressions and its column list includes the
+	 * distribution column of a hash/modulo distributed parent.
+	 */
+	if (is_unique ||
+		is_primary)
+	{
+		/*
+		 * Perform different checks depending on distribution type of parent
+		 * relation.
+		 */
+		switch(relLocInfo->locatorType)
+		{
+			case LOCATOR_TYPE_REPLICATED:
+				/* In the replicated case this index is shippable */
+				result = true;
+				break;
+
+			case LOCATOR_TYPE_RROBIN:
+				/*
+				 * Index on roundrobin parent table cannot be safely shipped
+				 * because of the random behavior of data balancing.
+				 */
+				result = false;
+				break;
+
+			case LOCATOR_TYPE_HASH:
+			case LOCATOR_TYPE_MODULO:
+				/*
+				 * Unique indexes on Hash and Modulo tables are shippable if the
+				 * index expression contains all the distribution expressions of
+				 * its parent relation.
+				 *
+				 * Here is a short example with concatenate that cannot be
+				 * shipped:
+				 * CREATE TABLE aa (a text, b text) DISTRIBUTE BY HASH(a);
+				 * CREATE UNIQUE INDEX aap ON aa((a || b));
+				 * INSERT INTO aa VALUES ('a', 'abb');
+				 * INSERT INTO aa VALUES ('aab', 'b'); -- no error ??!
+				 * The output uniqueness is not guaranteed as both INSERT will
+				 * go to different nodes. For such simple reasons unique
+				 * indexes on distributed tables are not shippable.
+				 * Shippability is not even ensured if all the expressions
+				 * used as Var are only distributed columns as the hash output of
+				 * their value combination does not ensure that query will
+				 * be directed to the correct remote node. Uniqueness is not even
+				 * protected if the index expression contains only the distribution
+				 * column like for that with a cluster of 2 Datanodes:
+				 * CREATE TABLE aa (a int) DISTRIBUTE BY HASH(a);
+				 * CREATE UNIQUE INDEX aap ON aa (abs(a));
+				 * INSERT INTO aa VALUES (2); -- to Datanode 1
+				 * INSERT INTO aa VALUES (-2); -- to Datanode 2, breaks uniqueness
+				 *
+				 * PGXCTODO: for the time being distribution key can only be
+				 * defined on a single column, so this will need to be changed
+				 * once a relation distribution will be able to be defined based
+				 * on an expression of multiple columns.
+				 */
+
+				/* Index contains expressions, it cannot be shipped safely */
+				if (indexExprs != NIL)
+				{
+					result = false;
+					break;
+				}
+
+				/* Nothing to do if no attributes */
+				if (indexAttrs == NIL)
+					break;
+
+				/*
+				 * Check that distribution column is included in the list of
+				 * index columns.
+				 */
+				if (!list_member_int(indexAttrs, relLocInfo->partAttrNum))
+				{
+					/*
+					 * Distribution column is not in index column list,
+					 * so the index cannot be enforced remotely.
+					 */
+					result = false;
+					break;
+				}
+
+				/*
+				 * by being here we are now sure that the index can be enforced
+				 * remotely as the distribution column is included in index.
+				 */
+				break;
+
+			/* Those types are not supported yet */
+			case LOCATOR_TYPE_RANGE:
+			case LOCATOR_TYPE_NONE:
+			case LOCATOR_TYPE_DISTRIBUTED:
+			case LOCATOR_TYPE_CUSTOM:
+			default:
+				/*
+				 * Should not come here.
+				 * NOTE(review): when assertions are compiled out, result
+				 * keeps its current value (true) on this path -- confirm
+				 * that this is intended.
+				 */
+				Assert(0);
+		}
+	}
+
+finish:
+	return result;
+}
+
+
+/*
+ * pgxc_check_fk_shippability
+ * Check the shippability of a parent and a child relation based on the
+ * distribution of each and the columns that are used to reference to
+ * parent and child relation. This can be used for inheritance or foreign
+ * key shippability evaluation.
+ */
+bool
+pgxc_check_fk_shippability(RelationLocInfo *parentLocInfo,
+						   RelationLocInfo *childLocInfo,
+						   List *parentRefs,
+						   List *childRefs)
+{
+	bool result = true;
+
+	/* The two reference lists are expected to have the same length */
+	Assert(list_length(parentRefs) == list_length(childRefs));
+
+	/*
+	 * If either child or parent have no relation data, shippability makes
+	 * no sense.
+	 */
+	if (!parentLocInfo || !childLocInfo)
+		return result;
+
+	/*
+	 * In the case of a child referencing to itself, constraint is shippable.
+	 * The test used is IsLocatorInfoEqual() on the two locator infos.
+	 */
+	if (IsLocatorInfoEqual(parentLocInfo, childLocInfo))
+		return result;
+
+	/* Now begin the evaluation */
+	switch (parentLocInfo->locatorType)
+	{
+		case LOCATOR_TYPE_REPLICATED:
+			/*
+			 * If the parent relation is replicated, the child relation can
+			 * always refer to it on all the nodes.
+			 */
+			result = true;
+			break;
+
+		case LOCATOR_TYPE_RROBIN:
+			/*
+			 * If the parent relation is based on roundrobin, the child
+			 * relation cannot be enforced on remote nodes because of the
+			 * random behavior of data balancing.
+			 */
+			result = false;
+			break;
+
+		case LOCATOR_TYPE_HASH:
+		case LOCATOR_TYPE_MODULO:
+			/*
+			 * If parent table is distributed, the child table can reference
+			 * to its parent safely if the following conditions are satisfied:
+			 * - parent and child are both hash-based, or both modulo-based
+			 * - parent reference columns contain the distribution column
+			 *   of the parent relation
+			 * - child reference columns contain the distribution column
+			 *   of the child relation
+			 * - both child and parent map the same nodes for data location
+			 */
+
+			/* A replicated child cannot refer to a distributed parent */
+			if (IsRelationReplicated(childLocInfo))
+			{
+				result = false;
+				break;
+			}
+
+			/*
+			 * Parent and child need to have the same distribution type:
+			 * hash or modulo.
+			 */
+			if (parentLocInfo->locatorType != childLocInfo->locatorType)
+			{
+				result = false;
+				break;
+			}
+
+			/*
+			 * Parent and child need to have their data located exactly
+			 * on the same list of nodes.
+			 */
+			if (list_difference_int(childLocInfo->nodeList, parentLocInfo->nodeList) ||
+				list_difference_int(parentLocInfo->nodeList, childLocInfo->nodeList))
+			{
+				result = false;
+				break;
+			}
+
+			/*
+			 * Check that child and parents are referenced using their
+			 * distribution column.
+			 */
+			if (!list_member_int(childRefs, childLocInfo->partAttrNum) ||
+				!list_member_int(parentRefs, parentLocInfo->partAttrNum))
+			{
+				result = false;
+				break;
+			}
+
+			/* By being here, parent-child constraint can be shipped correctly */
+			break;
+
+		case LOCATOR_TYPE_RANGE:
+		case LOCATOR_TYPE_NONE:
+		case LOCATOR_TYPE_DISTRIBUTED:
+		case LOCATOR_TYPE_CUSTOM:
+		default:
+			/*
+			 * Should not come here.
+			 * NOTE(review): when assertions are compiled out, result keeps
+			 * its initial value (true) on this path -- confirm intended.
+			 */
+			Assert(0);
+	}
+
+	return result;
+}
+
+/*
+ * pgxc_is_join_shippable
+ * The shippability of JOIN is decided in following steps
+ * 1. Are the JOIN conditions shippable?
+ * 	For INNER JOIN it's possible to apply some of the conditions at the
+ * 	Datanodes and others at coordinator. But for other JOINs, JOIN conditions
+ * 	decide which tuples on the OUTER side are appended with NULL columns from
+ * 	INNER side, we need all the join conditions to be shippable for the join to
+ * 	be shippable.
+ * 2. Do the JOIN conditions have quals that will make it shippable?
+ * 	When both sides of JOIN are replicated, irrespective of the quals the JOIN
+ * 	is shippable.
+ * 	INNER joins between replicated and distributed relation are shippable
+ * 	irrespective of the quals. OUTER join between replicated and distributed
+ * 	relation is shippable if distributed relation is the outer relation.
+ * 	All joins between hash/modulo distributed relations are shippable if they
+ * 	have equi-join on the distributed column, such that distribution columns
+ * 	have same datatype and same distribution strategy.
+ * 3. Are datanodes where the joining relations exist, compatible?
+ * 	Joins between replicated relations are shippable if both relations share a
+ * 	datanode.
Joins between distributed relations are shippable if both + * relations are distributed on same set of Datanodes. Join between replicated + * and distributed relations is shippable is replicated relation is replicated + * on all nodes where distributed relation is distributed. + * + * The first step is to be applied by the caller of this function. + */ +ExecNodes * +pgxc_is_join_shippable(ExecNodes *inner_en, ExecNodes *outer_en, Relids in_relids, + Relids out_relids, JoinType jointype, List *join_quals, + List *rtables) +{ + bool merge_nodes = false; + + /* + * If either of inner_en or outer_en is NULL, return NULL. We can't ship the + * join when either of the sides do not have datanodes to ship to. + */ + if (!outer_en || !inner_en) + return NULL; + /* + * We only support reduction of INNER, LEFT [OUTER] and FULL [OUTER] joins. + * RIGHT [OUTER] join is converted to LEFT [OUTER] join during join tree + * deconstruction. + */ + if (jointype != JOIN_INNER && jointype != JOIN_LEFT && jointype != JOIN_FULL) + return NULL; + + /* If both sides are replicated or have single node each, we ship any kind of JOIN */ + if ((IsExecNodesReplicated(inner_en) && IsExecNodesReplicated(outer_en)) || + (list_length(inner_en->nodeList) == 1 && + list_length(outer_en->nodeList) == 1)) + merge_nodes = true; + + /* If both sides are distributed, ... */ + else if (IsExecNodesColumnDistributed(inner_en) && + IsExecNodesColumnDistributed(outer_en)) + { + /* + * If two sides are distributed in the same manner by a value, with an + * equi-join on the distribution column and that condition + * is shippable, ship the join if node lists from both sides can be + * merged. 
+ */ + if (inner_en->baselocatortype == outer_en->baselocatortype && + IsExecNodesDistributedByValue(inner_en)) + { + Expr *equi_join_expr = pgxc_find_dist_equijoin_qual(in_relids, + out_relids, InvalidOid, + (Node *)join_quals, rtables); + if (equi_join_expr && pgxc_is_expr_shippable(equi_join_expr, NULL)) + merge_nodes = true; + } + } + /* + * If outer side is distributed and inner side is replicated, we can ship + * LEFT OUTER and INNER join. + */ + else if (IsExecNodesColumnDistributed(outer_en) && + IsExecNodesReplicated(inner_en) && + (jointype == JOIN_INNER || jointype == JOIN_LEFT)) + merge_nodes = true; + /* + * If outer side is replicated and inner side is distributed, we can ship + * only for INNER join. + */ + else if (IsExecNodesReplicated(outer_en) && + IsExecNodesColumnDistributed(inner_en) && + jointype == JOIN_INNER) + merge_nodes = true; + /* + * If the ExecNodes of inner and outer nodes can be merged, the JOIN is + * shippable + */ + if (merge_nodes) + return pgxc_merge_exec_nodes(inner_en, outer_en); + else + return NULL; +} + +static +bool pgxc_targetlist_has_distcol(Query *query) +{ + RangeTblEntry *rte = rt_fetch(query->resultRelation, query->rtable); + RelationLocInfo *rel_loc_info; + ListCell *lc; + const char *distcol; + + /* distribution column only applies to the relations */ + if (rte->rtekind != RTE_RELATION || + rte->relkind != RELKIND_RELATION) + return false; + rel_loc_info = GetRelationLocInfo(rte->relid); + if (!rel_loc_info) + return false; + + distcol = GetRelationDistribColumn(rel_loc_info); + if (!distcol) + return false; + + foreach(lc, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + + if (tle->resjunk) + continue; + if (strcmp(tle->resname, distcol) == 0) + return true; + } + return false; +} diff --cc src/backend/pgxc/pool/execRemote.c index d3ef43122d,0000000000..d9c89622a1 mode 100644,000000..100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@@ -1,6191 -1,0 
+1,6192 @@@ +/*------------------------------------------------------------------------- + * + * execRemote.c + * + * Functions to execute commands on remote Datanodes + * + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group + * + * + * IDENTIFICATION + * src/backend/pgxc/pool/execRemote.c + * + *------------------------------------------------------------------------- + */ + +#include +#include "postgres.h" +#include "access/twophase.h" +#include "access/gtm.h" +#include "access/sysattr.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/relscan.h" +#include "catalog/pg_type.h" +#include "catalog/pgxc_node.h" +#include "commands/prepare.h" +#include "executor/executor.h" +#include "gtm/gtm_c.h" +#include "libpq/libpq.h" +#include "miscadmin.h" +#include "pgxc/execRemote.h" +#include "tcop/tcopprot.h" +#include "executor/nodeSubplan.h" +#include "nodes/nodeFuncs.h" +#include "pgstat.h" +#include "nodes/nodes.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/var.h" +#include "pgxc/copyops.h" +#include "pgxc/nodemgr.h" +#include "pgxc/poolmgr.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_rusage.h" +#include "utils/tuplesort.h" +#include "utils/snapmgr.h" +#include "utils/builtins.h" +#include "pgxc/locator.h" +#include "pgxc/pgxc.h" +#include "parser/parse_type.h" +#include "parser/parsetree.h" +#include "pgxc/xc_maintenance_mode.h" + +/* Enforce the use of two-phase commit when temporary objects are used */ +bool 
EnforceTwoPhaseCommit = true; +/* + * We do not want it too long, when query is terminating abnormally we just + * want to read in already available data, if datanode connection will reach a + * consistent state after that, we will go normal clean up procedure: send down + * ABORT etc., if data node is not responding we will signal pooler to drop + * the connection. + * It is better to drop and recreate datanode connection then wait for several + * seconds while it being cleaned up when, for example, cancelling query. + */ +#define END_QUERY_TIMEOUT 20 + +typedef struct +{ + xact_callback function; + void *fparams; +} abort_callback_type; + +/* + * Buffer size does not affect performance significantly, just do not allow + * connection buffer grows infinitely + */ +#define COPY_BUFFER_SIZE 8192 +#define PRIMARY_NODE_WRITEAHEAD 1024 * 1024 + +/* + * Flag to track if a temporary object is accessed by the current transaction + */ +static bool temp_object_included = false; +static abort_callback_type dbcleanup_info = { NULL, NULL }; + +static int pgxc_node_begin(int conn_count, PGXCNodeHandle ** connections, + GlobalTransactionId gxid, bool need_tran_block, + bool readOnly, char node_type); + +static PGXCNodeAllHandles *get_exec_connections(RemoteQueryState *planstate, + ExecNodes *exec_nodes, + RemoteQueryExecType exec_type, + bool is_global_session); + + +static bool pgxc_start_command_on_connection(PGXCNodeHandle *connection, + RemoteQueryState *remotestate, Snapshot snapshot); + +static char *pgxc_node_remote_prepare(char *prepareGID, bool localNode); +static bool pgxc_node_remote_finish(char *prepareGID, bool commit, + char *nodestring, GlobalTransactionId gxid, + GlobalTransactionId prepare_gxid); +static void pgxc_node_remote_commit(void); +static void pgxc_node_remote_abort(void); +static void pgxc_connections_cleanup(ResponseCombiner *combiner); + +static void pgxc_node_report_error(ResponseCombiner *combiner); + +#define REMOVE_CURR_CONN(combiner) \ + if 
((combiner)->current_conn < --((combiner)->conn_count)) \ + { \ + (combiner)->connections[(combiner)->current_conn] = \ + (combiner)->connections[(combiner)->conn_count]; \ + } \ + else \ + (combiner)->current_conn = 0 + +#define MAX_STATEMENTS_PER_TRAN 10 + +/* Variables to collect statistics */ +static int total_transactions = 0; +static int total_statements = 0; +static int total_autocommit = 0; +static int nonautocommit_2pc = 0; +static int autocommit_2pc = 0; +static int current_tran_statements = 0; +static int *statements_per_transaction = NULL; +static int *nodes_per_transaction = NULL; + +/* + * statistics collection: count a statement + */ +static void +stat_statement() +{ + total_statements++; + current_tran_statements++; +} + +/* + * To collect statistics: count a transaction + */ +static void +stat_transaction(int node_count) +{ + total_transactions++; + + if (!statements_per_transaction) + { + statements_per_transaction = (int *) malloc((MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int)); + memset(statements_per_transaction, 0, (MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int)); + } + if (current_tran_statements > MAX_STATEMENTS_PER_TRAN) + statements_per_transaction[MAX_STATEMENTS_PER_TRAN]++; + else + statements_per_transaction[current_tran_statements]++; + current_tran_statements = 0; + if (node_count > 0 && node_count <= NumDataNodes) + { + if (!nodes_per_transaction) + { + nodes_per_transaction = (int *) malloc(NumDataNodes * sizeof(int)); + memset(nodes_per_transaction, 0, NumDataNodes * sizeof(int)); + } + nodes_per_transaction[node_count - 1]++; + } +} + + +/* + * Output collected statistics to the log + */ +static void +stat_log() +{ + elog(DEBUG1, "Total Transactions: %d Total Statements: %d", total_transactions, total_statements); + elog(DEBUG1, "Autocommit: %d 2PC for Autocommit: %d 2PC for non-Autocommit: %d", + total_autocommit, autocommit_2pc, nonautocommit_2pc); + if (total_transactions) + { + if (statements_per_transaction) + { + int i; + + for (i 
/*
 * Parse out row count from the command status response and convert it to
 * integer.
 *
 * message	command tag, e.g. "INSERT 0 42"
 * len		length of message INCLUDING the terminating \0
 * rowcount	output: value of the last run of decimal digits in the tag,
 *			or 0 if the tag does not end with digits
 *
 * Returns the number of digits in that trailing run (0 if there is none).
 * Any non-digit character resets the accumulated value, so only the final
 * number of the tag is reported.
 *
 * The scan index is a size_t and the bound is written as "pos + 1 < len":
 * the former "pos < len - 1" wrapped around for len == 0 and walked far
 * past the end of the buffer.
 */
static int
parse_row_count(const char *message, size_t len, uint64 *rowcount)
{
	int			digits = 0;
	size_t		pos;

	*rowcount = 0;

	/* skip \0 string terminator */
	for (pos = 0; pos + 1 < len; pos++)
	{
		if (message[pos] >= '0' && message[pos] <= '9')
		{
			*rowcount = *rowcount * 10 + message[pos] - '0';
			digits++;
		}
		else
		{
			/* not part of the trailing number, start over */
			*rowcount = 0;
			digits = 0;
		}
	}
	return digits;
}
*/ + msg_body += 2; + + /* Get the OID type and mode type from typename */ + parseTypeString(typname, &oidtypeid, NULL, false); + + TupleDescInitEntry(result, attnum, attname, oidtypeid, typmod, 0); + } + return result; +} + +/* + * Handle CopyOutCommandComplete ('c') message from a Datanode connection + */ +static void +HandleCopyOutComplete(ResponseCombiner *combiner) +{ + if (combiner->request_type == REQUEST_TYPE_ERROR) + return; + if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) + combiner->request_type = REQUEST_TYPE_COPY_OUT; + if (combiner->request_type != REQUEST_TYPE_COPY_OUT) + /* Inconsistent responses */ + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("Unexpected response from the Datanodes for 'c' message, current request type %d", combiner->request_type))); + /* Just do nothing, close message is managed by the Coordinator */ + combiner->copy_out_count++; +} + +/* + * Handle CommandComplete ('C') message from a Datanode connection + */ +static void +HandleCommandComplete(ResponseCombiner *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn) +{ + int digits = 0; + EState *estate = combiner->ss.ps.state; + + /* + * If we did not receive description we are having rowcount or OK response + */ + if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) + combiner->request_type = REQUEST_TYPE_COMMAND; + /* Extract rowcount */ + if (combiner->combine_type != COMBINE_TYPE_NONE && estate) + { + uint64 rowcount; + digits = parse_row_count(msg_body, len, &rowcount); + if (digits > 0) + { + /* Replicated write, make sure they are the same */ + if (combiner->combine_type == COMBINE_TYPE_SAME) + { + if (combiner->command_complete_count) + { + /* + * Replicated command may succeed on on node and fail on + * another. The example is if distributed table referenced + * by a foreign key constraint defined on a partitioned + * table. If command deletes rows from the replicated table + * they may be referenced on one Datanode but not on other. 
+ * So, replicated command on each Datanode either affects + * proper number of rows, or returns error. Here if + * combiner got an error already, we allow to report it, + * not the scaring data corruption message. + */ + if (combiner->errorMessage == NULL && rowcount != estate->es_processed) + /* There is a consistency issue in the database with the replicated table */ + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("Write to replicated table returned different results from the Datanodes"))); + } + else + /* first result */ + estate->es_processed = rowcount; + } + else + estate->es_processed += rowcount; + } + else + combiner->combine_type = COMBINE_TYPE_NONE; + } + + /* If response checking is enable only then do further processing */ + if (conn->ck_resp_rollback) + { + if (strcmp(msg_body, "ROLLBACK") == 0) + { + /* + * Subsequent clean up routine will be checking this flag + * to determine nodes where to send ROLLBACK PREPARED. + * On current node PREPARE has failed and the two-phase record + * does not exist, so clean this flag as if PREPARE was not sent + * to that node and avoid erroneous command. + */ + conn->ck_resp_rollback = false; + /* + * Set the error, if none, to force throwing. + * If there is error already, it will be thrown anyway, do not add + * this potentially confusing message + */ + if (combiner->errorMessage == NULL) + { + MemoryContext oldcontext = MemoryContextSwitchTo(ErrorContext); + combiner->errorMessage = + pstrdup("unexpected ROLLBACK from remote node"); + MemoryContextSwitchTo(oldcontext); + /* + * ERRMSG_PRODUCER_ERROR + * Messages with this code are replaced by others, if they are + * received, so if node will send relevant error message that + * one will be replaced. 
+ */ + combiner->errorCode[0] = 'X'; + combiner->errorCode[1] = 'X'; + combiner->errorCode[2] = '0'; + combiner->errorCode[3] = '1'; + combiner->errorCode[4] = '0'; + } + } + } + combiner->command_complete_count++; +} + +/* + * Handle RowDescription ('T') message from a Datanode connection + */ +static bool +HandleRowDescription(ResponseCombiner *combiner, char *msg_body, size_t len) +{ + if (combiner->request_type == REQUEST_TYPE_ERROR) + return false; + if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) + combiner->request_type = REQUEST_TYPE_QUERY; + if (combiner->request_type != REQUEST_TYPE_QUERY) + { + /* Inconsistent responses */ + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("Unexpected response from the Datanodes for 'T' message, current request type %d", combiner->request_type))); + } + /* Increment counter and check if it was first */ + if (combiner->description_count++ == 0) + { + combiner->tuple_desc = create_tuple_desc(msg_body, len); + return true; + } + return false; +} + + +/* + * Handle CopyInResponse ('G') message from a Datanode connection + */ +static void +HandleCopyIn(ResponseCombiner *combiner) +{ + if (combiner->request_type == REQUEST_TYPE_ERROR) + return; + if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) + combiner->request_type = REQUEST_TYPE_COPY_IN; + if (combiner->request_type != REQUEST_TYPE_COPY_IN) + { + /* Inconsistent responses */ + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("Unexpected response from the Datanodes for 'G' message, current request type %d", combiner->request_type))); + } + /* + * The normal PG code will output an G message when it runs in the + * Coordinator, so do not proxy message here, just count it. 
+ */ + combiner->copy_in_count++; +} + +/* + * Handle CopyOutResponse ('H') message from a Datanode connection + */ +static void +HandleCopyOut(ResponseCombiner *combiner) +{ + if (combiner->request_type == REQUEST_TYPE_ERROR) + return; + if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) + combiner->request_type = REQUEST_TYPE_COPY_OUT; + if (combiner->request_type != REQUEST_TYPE_COPY_OUT) + { + /* Inconsistent responses */ + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("Unexpected response from the Datanodes for 'H' message, current request type %d", combiner->request_type))); + } + /* + * The normal PG code will output an H message when it runs in the + * Coordinator, so do not proxy message here, just count it. + */ + combiner->copy_out_count++; +} + +/* + * Handle CopyOutDataRow ('d') message from a Datanode connection + */ +static void +HandleCopyDataRow(ResponseCombiner *combiner, char *msg_body, size_t len) +{ + if (combiner->request_type == REQUEST_TYPE_ERROR) + return; + if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) + combiner->request_type = REQUEST_TYPE_COPY_OUT; + + /* Inconsistent responses */ + if (combiner->request_type != REQUEST_TYPE_COPY_OUT) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("Unexpected response from the Datanodes for 'd' message, current request type %d", combiner->request_type))); + + /* count the row */ + combiner->processed++; + + /* Output remote COPY operation to correct location */ + switch (combiner->remoteCopyType) + { + case REMOTE_COPY_FILE: + /* Write data directly to file */ + fwrite(msg_body, 1, len, combiner->copy_file); + break; + case REMOTE_COPY_STDOUT: + /* Send back data to client */ + pq_putmessage('d', msg_body, len); + break; + case REMOTE_COPY_TUPLESTORE: + /* + * Do not store trailing \n character. + * When tuplestore data are loaded to a table it automatically + * inserts line ends. 
+ */ + tuplestore_putmessage(combiner->tuplestorestate, len-1, msg_body); + break; + case REMOTE_COPY_NONE: + default: + Assert(0); /* Should not happen */ + } +} + +/* + * Handle DataRow ('D') message from a Datanode connection + * The function returns true if data row is accepted and successfully stored + * within the combiner. + */ +static bool +HandleDataRow(ResponseCombiner *combiner, char *msg_body, size_t len, Oid node) +{ + /* We expect previous message is consumed */ + Assert(combiner->currentRow == NULL); + + if (combiner->request_type == REQUEST_TYPE_ERROR) + return false; + + if (combiner->request_type != REQUEST_TYPE_QUERY) + { + /* Inconsistent responses */ + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("Unexpected response from the data nodes for 'D' message, current request type %d", combiner->request_type))); + } + + /* + * If we got an error already ignore incoming data rows from other nodes + * Still we want to continue reading until get CommandComplete + */ + if (combiner->errorMessage) + return false; + + /* + * Replicated INSERT/UPDATE/DELETE with RETURNING: receive only tuples + * from one node, skip others as duplicates + */ + if (combiner->combine_type == COMBINE_TYPE_SAME) + { + /* Do not return rows when probing primary, instead return when doing + * first normal node. Just save some CPU and traffic in case if + * probing fails. 
+ */ + if (combiner->probing_primary) + return false; + if (OidIsValid(combiner->returning_node)) + { + if (combiner->returning_node != node) + return false; + } + else + combiner->returning_node = node; + } + + /* + * We are copying message because it points into connection buffer, and + * will be overwritten on next socket read + */ + combiner->currentRow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + len); + memcpy(combiner->currentRow->msg, msg_body, len); + combiner->currentRow->msglen = len; + combiner->currentRow->msgnode = node; + + return true; +} + +/* + * Handle ErrorResponse ('E') message from a Datanode connection + */ +static void +HandleError(ResponseCombiner *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn) +{ + /* parse error message */ + char *code = NULL; + char *message = NULL; + char *detail = NULL; + char *hint = NULL; + int offset = 0; + + /* + * Scan until point to terminating \0 + */ + while (offset + 1 < len) + { + /* pointer to the field message */ + char *str = msg_body + offset + 1; + + switch (msg_body[offset]) + { + case 'C': /* code */ + code = str; + break; + case 'M': /* message */ + message = str; + break; + case 'D': /* details */ + detail = str; + break; + + case 'H': /* hint */ + hint = str; + break; + + /* Fields not yet in use */ + case 'S': /* severity */ + case 'R': /* routine */ + case 'P': /* position string */ + case 'p': /* position int */ + case 'q': /* int query */ + case 'W': /* where */ + case 'F': /* file */ + case 'L': /* line */ + default: + break; + } + + /* code, message and \0 */ + offset += strlen(str) + 2; + } + + /* + * We may have special handling for some errors, default handling is to + * throw out error with the same message. We can not ereport immediately + * because we should read from this and other connections until + * ReadyForQuery is received, so we just store the error message. + * If multiple connections return errors only first one is reported. 
+ * + * The producer error may be hiding primary error, so if previously received + * error is a producer error allow it to be overwritten. + */ + if (combiner->errorMessage == NULL || + MAKE_SQLSTATE(combiner->errorCode[0], combiner->errorCode[1], + combiner->errorCode[2], combiner->errorCode[3], + combiner->errorCode[4]) == ERRCODE_PRODUCER_ERROR) + { + MemoryContext oldcontext = MemoryContextSwitchTo(ErrorContext); + combiner->errorMessage = pstrdup(message); + /* Error Code is exactly 5 significant bytes */ + if (code) + memcpy(combiner->errorCode, code, 5); + if (detail) + combiner->errorDetail = pstrdup(detail); + if (hint) + combiner->errorHint = pstrdup(hint); + MemoryContextSwitchTo(oldcontext); + } + + /* + * If the PREPARE TRANSACTION command fails for whatever reason, we don't + * want to send down ROLLBACK PREPARED to this node. Otherwise, it may end + * up rolling back an unrelated prepared transaction with the same GID as + * used by this transaction + */ + if (conn->ck_resp_rollback) + conn->ck_resp_rollback = false; + + /* + * If Datanode have sent ErrorResponse it will never send CommandComplete. + * Increment the counter to prevent endless waiting for it. 
+ */ + combiner->command_complete_count++; +} + +/* + * HandleCmdComplete - + * combine deparsed sql statements execution results + * + * Input parameters: + * commandType is dml command type + * combineTag is used to combine the completion result + * msg_body is execution result needed to combine + * len is msg_body size + */ +void +HandleCmdComplete(CmdType commandType, CombineTag *combine, + const char *msg_body, size_t len) +{ + int digits = 0; + uint64 originrowcount = 0; + uint64 rowcount = 0; + uint64 total = 0; + + if (msg_body == NULL) + return; + + /* if there's nothing in combine, just copy the msg_body */ + if (strlen(combine->data) == 0) + { + strcpy(combine->data, msg_body); + combine->cmdType = commandType; + return; + } + else + { + /* commandType is conflict */ + if (combine->cmdType != commandType) + return; + + /* get the processed row number from msg_body */ + digits = parse_row_count(msg_body, len + 1, &rowcount); + elog(DEBUG1, "digits is %d\n", digits); + Assert(digits >= 0); + + /* no need to combine */ + if (digits == 0) + return; + + /* combine the processed row number */ + parse_row_count(combine->data, strlen(combine->data) + 1, &originrowcount); + elog(DEBUG1, "originrowcount is %lu, rowcount is %lu\n", originrowcount, rowcount); + total = originrowcount + rowcount; + + } + + /* output command completion tag */ + switch (commandType) + { + case CMD_SELECT: + strcpy(combine->data, "SELECT"); + break; + case CMD_INSERT: + snprintf(combine->data, COMPLETION_TAG_BUFSIZE, + "INSERT %u %lu", 0, total); + break; + case CMD_UPDATE: + snprintf(combine->data, COMPLETION_TAG_BUFSIZE, + "UPDATE %lu", total); + break; + case CMD_DELETE: + snprintf(combine->data, COMPLETION_TAG_BUFSIZE, + "DELETE %lu", total); + break; + default: + strcpy(combine->data, ""); + break; + } + +} + +/* + * HandleDatanodeCommandId ('M') message from a Datanode connection + */ +static void +HandleDatanodeCommandId(ResponseCombiner *combiner, char *msg_body, size_t len) +{ 
+ uint32 n32; + CommandId cid; + + Assert(msg_body != NULL); + Assert(len >= 2); + + /* Get the command Id */ + memcpy(&n32, &msg_body[0], 4); + cid = ntohl(n32); + + /* If received command Id is higher than current one, set it to a new value */ + if (cid > GetReceivedCommandId()) + SetReceivedCommandId(cid); +} + +/* + * Record waited-for XIDs received from the remote nodes into the transaction + * state + */ +static void +HandleWaitXids(char *msg_body, size_t len) +{ + int xid_count; + uint32 n32; + int cur; + int i; + + /* Get the xid count */ + xid_count = len / sizeof (TransactionId); + + cur = 0; + for (i = 0; i < xid_count; i++) + { + Assert(cur < len); + memcpy(&n32, &msg_body[cur], sizeof (TransactionId)); + cur = cur + sizeof (TransactionId); + TransactionRecordXidWait(ntohl(n32)); + } +} + +static void +HandleGlobalTransactionId(char *msg_body, size_t len) +{ + GlobalTransactionId xid; + + Assert(len == sizeof (GlobalTransactionId)); + memcpy(&xid, &msg_body[0], sizeof (GlobalTransactionId)); + + SetTopTransactionId(xid); +} + +/* + * Examine the specified combiner state and determine if command was completed + * successfully + */ +static bool +validate_combiner(ResponseCombiner *combiner) +{ + /* There was error message while combining */ + if (combiner->errorMessage) + return false; + /* Check if state is defined */ + if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) + return false; + + /* Check all nodes completed */ + if ((combiner->request_type == REQUEST_TYPE_COMMAND + || combiner->request_type == REQUEST_TYPE_QUERY) + && combiner->command_complete_count != combiner->node_count) + return false; + + /* Check count of description responses */ + if (combiner->request_type == REQUEST_TYPE_QUERY + && combiner->description_count != combiner->node_count) + return false; + + /* Check count of copy-in responses */ + if (combiner->request_type == REQUEST_TYPE_COPY_IN + && combiner->copy_in_count != combiner->node_count) + return false; + + /* Check 
count of copy-out responses */ + if (combiner->request_type == REQUEST_TYPE_COPY_OUT + && combiner->copy_out_count != combiner->node_count) + return false; + + /* Add other checks here as needed */ + + /* All is good if we are here */ + return true; +} + +/* + * Close combiner and free allocated memory, if it is not needed + */ +void +CloseCombiner(ResponseCombiner *combiner) +{ + if (combiner->connections) + pfree(combiner->connections); + if (combiner->tuple_desc) + FreeTupleDesc(combiner->tuple_desc); + if (combiner->errorMessage) + pfree(combiner->errorMessage); + if (combiner->errorDetail) + pfree(combiner->errorDetail); + if (combiner->errorHint) + pfree(combiner->errorHint); + if (combiner->cursor_connections) + pfree(combiner->cursor_connections); + if (combiner->tapenodes) + pfree(combiner->tapenodes); + if (combiner->tapemarks) + pfree(combiner->tapemarks); +} + +/* + * Validate combiner and release storage freeing allocated memory + */ +static bool +ValidateAndCloseCombiner(ResponseCombiner *combiner) +{ + bool valid = validate_combiner(combiner); + + CloseCombiner(combiner); + + return valid; +} + +/* + * It is possible if multiple steps share the same Datanode connection, when + * executor is running multi-step query or client is running multiple queries + * using Extended Query Protocol. After returning next tuple ExecRemoteQuery + * function passes execution control to the executor and then it can be given + * to the same RemoteQuery or to different one. It is possible that before + * returning a tuple the function do not read all Datanode responses. In this + * case pending responses should be read in context of original RemoteQueryState + * till ReadyForQuery message and data rows should be stored (buffered) to be + * available when fetch from that RemoteQueryState is requested again. + * BufferConnection function does the job. + * If a RemoteQuery is going to use connection it should check connection state. 
+ * DN_CONNECTION_STATE_QUERY indicates query has data to read and combiner + * points to the original RemoteQueryState. If combiner differs from "this" the + * connection should be buffered. + */ +void +BufferConnection(PGXCNodeHandle *conn) +{ + ResponseCombiner *combiner = conn->combiner; + MemoryContext oldcontext; + + if (combiner == NULL || conn->state != DN_CONNECTION_STATE_QUERY) + return; + + elog(DEBUG2, "Buffer connection %u to step %s", conn->nodeoid, combiner->cursor); + + /* + * When BufferConnection is invoked CurrentContext is related to other + * portal, which is trying to control the connection. + * TODO See if we can find better context to switch to + */ + oldcontext = MemoryContextSwitchTo(combiner->ss.ps.ps_ResultTupleSlot->tts_mcxt); + + /* Verify the connection is in use by the combiner */ + combiner->current_conn = 0; + while (combiner->current_conn < combiner->conn_count) + { + if (combiner->connections[combiner->current_conn] == conn) + break; + combiner->current_conn++; + } + Assert(combiner->current_conn < combiner->conn_count); + + if (combiner->tapemarks == NULL) + combiner->tapemarks = (ListCell**) palloc0(combiner->conn_count * sizeof(ListCell*)); + + /* + * If current bookmark for the current tape is not set it means either + * first row in the buffer is from the current tape or no rows from + * the tape in the buffer, so if first row is not from current + * connection bookmark the last cell in the list. 
+ */ + if (combiner->tapemarks[combiner->current_conn] == NULL && + list_length(combiner->rowBuffer) > 0) + { + RemoteDataRow dataRow = (RemoteDataRow) linitial(combiner->rowBuffer); + if (dataRow->msgnode != conn->nodeoid) + combiner->tapemarks[combiner->current_conn] = list_tail(combiner->rowBuffer); + } + + /* + * Buffer data rows until data node return number of rows specified by the + * fetch_size parameter of last Execute message (PortalSuspended message) + * or end of result set is reached (CommandComplete message) + */ + while (true) + { + int res; + + /* Move to buffer currentRow (received from the data node) */ + if (combiner->currentRow) + { + combiner->rowBuffer = lappend(combiner->rowBuffer, + combiner->currentRow); + combiner->currentRow = NULL; + } + + res = handle_response(conn, combiner); + /* + * If response message is a DataRow it will be handled on the next + * iteration. + * PortalSuspended will cause connection state change and break the loop + * The same is for CommandComplete, but we need additional handling - + * remove connection from the list of active connections. + * We may need to add handling error response + */ + + /* Most often result check first */ + if (res == RESPONSE_DATAROW) + { + /* + * The row is in the combiner->currentRow, on next iteration it will + * be moved to the buffer + */ + continue; + } + + /* incomplete message, read more */ + if (res == RESPONSE_EOF) + { + if (pgxc_node_receive(1, &conn, NULL)) + { + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + add_error_message(conn, "Failed to fetch from data node"); + } + } + + /* + * End of result set is reached, so either set the pointer to the + * connection to NULL (combiner with sort) or remove it from the list + * (combiner without sort) + */ + else if (res == RESPONSE_COMPLETE) + { + /* + * If combiner is doing merge sort we should set reference to the + * current connection to NULL in the array, indicating the end + * of the tape is reached. 
FetchTuple will try to access the buffer + * first anyway. + * Since we remove that reference we can not determine what node + * number was this connection, but we need this info to find proper + * tuple in the buffer if we are doing merge sort. So store node + * number in special array. + * NB: We can not test if combiner->tuplesortstate is set here: + * connection may require buffering inside tuplesort_begin_merge + * - while pre-read rows from the tapes, one of the tapes may be + * the local connection with RemoteSubplan in the tree. The + * combiner->tuplesortstate is set only after tuplesort_begin_merge + * returns. + */ + if (combiner->merge_sort) + { + combiner->connections[combiner->current_conn] = NULL; + if (combiner->tapenodes == NULL) + combiner->tapenodes = (Oid *) + palloc0(combiner->conn_count * sizeof(Oid)); + combiner->tapenodes[combiner->current_conn] = conn->nodeoid; + } + else + { + /* Remove current connection, move last in-place, adjust current_conn */ + if (combiner->current_conn < --combiner->conn_count) + combiner->connections[combiner->current_conn] = combiner->connections[combiner->conn_count]; + else + combiner->current_conn = 0; + } + /* + * If combiner runs Simple Query Protocol we need to read in + * ReadyForQuery. In case of Extended Query Protocol it is not + * sent and we should quit. 
+ */ + if (combiner->extended_query) + break; + } + else if (res == RESPONSE_ERROR) + { + if (combiner->extended_query) + { + /* + * Need to sync connection to enable receiving commands + * by the datanode + */ + if (pgxc_node_send_sync(conn) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to sync msg to node %u", conn->nodeoid))); + } + } + } + else if (res == RESPONSE_SUSPENDED || res == RESPONSE_READY) + { + /* Now it is OK to quit */ + break; + } + } + Assert(conn->state != DN_CONNECTION_STATE_QUERY); + MemoryContextSwitchTo(oldcontext); + conn->combiner = NULL; +} + +/* + * copy the datarow from combiner to the given slot, in the slot's memory + * context + */ +static void +CopyDataRowTupleToSlot(ResponseCombiner *combiner, TupleTableSlot *slot) +{ + RemoteDataRow datarow; + MemoryContext oldcontext; + oldcontext = MemoryContextSwitchTo(slot->tts_mcxt); + datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + combiner->currentRow->msglen); + datarow->msgnode = combiner->currentRow->msgnode; + datarow->msglen = combiner->currentRow->msglen; + memcpy(datarow->msg, combiner->currentRow->msg, datarow->msglen); + ExecStoreDataRowTuple(datarow, slot, true); + pfree(combiner->currentRow); + combiner->currentRow = NULL; + MemoryContextSwitchTo(oldcontext); +} + + +/* + * FetchTuple + * + Get next tuple from one of the datanode connections. + * The connections should be in combiner->connections, if "local" dummy + * connection presents it should be the last active connection in the array. + * If combiner is set up to perform merge sort function returns tuple from + * connection defined by combiner->current_conn, or NULL slot if no more tuple + * are available from the connection. Otherwise it returns tuple from any + * connection or NULL slot if no more available connections. + * Function looks into combiner->rowBuffer before accessing connection + * and return a tuple from there if found. 
+ * Function may wait while more data arrive from the data nodes. If there + * is a locally executed subplan function advance it and buffer resulting rows + * instead of waiting. + */ +TupleTableSlot * +FetchTuple(ResponseCombiner *combiner) +{ + PGXCNodeHandle *conn; + TupleTableSlot *slot; + Oid nodeOid = -1; + + /* + * Case if we run local subplan. + * We do not have remote connections, so just get local tuple and return it + */ + if (outerPlanState(combiner)) + { + RemoteSubplanState *planstate = (RemoteSubplanState *) combiner; + RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan; + /* Advance subplan in a loop until we have something to return */ + for (;;) + { + Datum value = (Datum) 0; + bool isnull; + int numnodes; + int i; + + slot = ExecProcNode(outerPlanState(combiner)); + /* If locator is not defined deliver all the results */ + if (planstate->locator == NULL) + return slot; + + /* + * If NULL tuple is returned we done with the subplan, finish it up and + * return NULL + */ + if (TupIsNull(slot)) + return NULL; + + /* Get partitioning value if defined */ + if (plan->distributionKey != InvalidAttrNumber) + value = slot_getattr(slot, plan->distributionKey, &isnull); + + /* Determine target nodes */ + numnodes = GET_NODES(planstate->locator, value, isnull, NULL); + for (i = 0; i < numnodes; i++) + { + /* Deliver the node */ + if (planstate->dest_nodes[i] == PGXCNodeId-1) + return slot; + } + } + } + + /* + * Get current connection + */ + if (combiner->conn_count > combiner->current_conn) + conn = combiner->connections[combiner->current_conn]; + else + conn = NULL; + + /* + * If doing merge sort determine the node number. + * It may be needed to get buffered row. + */ + if (combiner->merge_sort) + { + Assert(conn || combiner->tapenodes); + nodeOid = conn ? conn->nodeoid : + combiner->tapenodes[combiner->current_conn]; + Assert(OidIsValid(nodeOid)); + } + + /* + * First look into the row buffer. 
+ * When we are performing merge sort we need to get from the buffer record + * from the connection marked as "current". Otherwise get first. + */ + if (list_length(combiner->rowBuffer) > 0) + { + RemoteDataRow dataRow; + + Assert(combiner->currentRow == NULL); + + if (combiner->merge_sort) + { + ListCell *lc; + ListCell *prev; + + elog(DEBUG1, "Getting buffered tuple from node %x", nodeOid); + + prev = combiner->tapemarks[combiner->current_conn]; + if (prev) + { + /* + * Start looking through the list from the bookmark. + * Probably the first cell we check contains row from the needed + * node. Otherwise continue scanning until we encounter one, + * advancing prev pointer as well. + */ + while((lc = lnext(prev)) != NULL) + { + dataRow = (RemoteDataRow) lfirst(lc); + if (dataRow->msgnode == nodeOid) + { + combiner->currentRow = dataRow; + break; + } + prev = lc; + } + } + else + { + /* + * Either needed row is the first in the buffer or no such row + */ + lc = list_head(combiner->rowBuffer); + dataRow = (RemoteDataRow) lfirst(lc); + if (dataRow->msgnode == nodeOid) + combiner->currentRow = dataRow; + else + lc = NULL; + } + if (lc) + { + /* + * Delete cell from the buffer. Before we delete we must check + * the bookmarks, if the cell is a bookmark for any tape. + * If it is the case we are deleting last row of the current + * block from the current tape. That tape should have bookmark + * like current, and current bookmark will be advanced when we + * read the tape once again. 
+ */ + int i; + for (i = 0; i < combiner->conn_count; i++) + { + if (combiner->tapemarks[i] == lc) + combiner->tapemarks[i] = prev; + } + elog(DEBUG1, "Found buffered tuple from node %x", nodeOid); + combiner->rowBuffer = list_delete_cell(combiner->rowBuffer, + lc, prev); + } + elog(DEBUG1, "Update tapemark"); + combiner->tapemarks[combiner->current_conn] = prev; + } + else + { + dataRow = (RemoteDataRow) linitial(combiner->rowBuffer); + combiner->currentRow = dataRow; + combiner->rowBuffer = list_delete_first(combiner->rowBuffer); + } + } + + /* If we have node message in the currentRow slot, and it is from a proper + * node, consume it. */ + if (combiner->currentRow) + { + Assert(!combiner->merge_sort || + combiner->currentRow->msgnode == nodeOid); + slot = combiner->ss.ps.ps_ResultTupleSlot; + CopyDataRowTupleToSlot(combiner, slot); + return slot; + } + + while (conn) + { + int res; + + /* Going to use a connection, buffer it if needed */ + CHECK_OWNERSHIP(conn, combiner); + + /* + * If current connection is idle it means portal on the data node is + * suspended. Request more and try to get it + */ + if (combiner->extended_query && + conn->state == DN_CONNECTION_STATE_IDLE) + { + /* + * We do not allow to suspend if querying primary node, so that + * only may mean the current node is secondary and subplan was not + * executed there yet. Return and go on with second phase. 
+ */ + if (combiner->probing_primary) + { + return NULL; + } + + if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid))); + } + + if (pgxc_node_send_flush(conn) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid))); + } + + if (pgxc_node_receive(1, &conn, NULL)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed receive data from node %u cursor '%s'", conn->nodeoid, combiner->cursor))); + } + } + + /* read messages */ + res = handle_response(conn, combiner); + if (res == RESPONSE_DATAROW) + { + slot = combiner->ss.ps.ps_ResultTupleSlot; + CopyDataRowTupleToSlot(combiner, slot); + return slot; + } + else if (res == RESPONSE_EOF) + { + /* incomplete message, read more */ + if (pgxc_node_receive(1, &conn, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to receive more data from data node %u", conn->nodeoid))); + continue; + } + else if (res == RESPONSE_SUSPENDED) + { + /* + * If we are doing merge sort or probing primary node we should + * remain on the same node, so query next portion immediately. + * Otherwise leave node suspended and fetch lazily. 
+ */ + if (combiner->merge_sort || combiner->probing_primary) + { + if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid))); + if (pgxc_node_send_flush(conn) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid))); + if (pgxc_node_receive(1, &conn, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed receive node from node %u cursor '%s'", conn->nodeoid, combiner->cursor))); + continue; + } + + /* + * Tell the node to fetch data in background, next loop when we + * pgxc_node_receive, data is already there, so we can run faster + * */ + if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid))); + } + + if (pgxc_node_send_flush(conn) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid))); + } + + if (++combiner->current_conn >= combiner->conn_count) + combiner->current_conn = 0; + conn = combiner->connections[combiner->current_conn]; + } + else if (res == RESPONSE_COMPLETE) + { + /* + * In case of Simple Query Protocol we should receive ReadyForQuery + * before removing connection from the list. In case of Extended + * Query Protocol we may remove connection right away. 
+ */ + if (combiner->extended_query) + { + /* If we are doing merge sort clean current connection and return + * NULL, otherwise remove current connection, move last in-place, + * adjust current_conn and continue if it is not last connection */ + if (combiner->merge_sort) + { + combiner->connections[combiner->current_conn] = NULL; + return NULL; + } + REMOVE_CURR_CONN(combiner); + if (combiner->conn_count > 0) + conn = combiner->connections[combiner->current_conn]; + else + return NULL; + } + } + else if (res == RESPONSE_ERROR) + { + /* + * If doing Extended Query Protocol we need to sync connection, + * otherwise subsequent commands will be ignored. + */ + if (combiner->extended_query) + { + if (pgxc_node_send_sync(conn) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to sync msg to node %u", conn->nodeoid))); + } + /* + * Do not wait for response from primary, it needs to wait + * for other nodes to respond. Instead go ahead and send query to + * other nodes. It will fail there, but we can continue with + * normal cleanup. + */ + if (combiner->probing_primary) + { + REMOVE_CURR_CONN(combiner); + return NULL; + } + } + else if (res == RESPONSE_READY) + { + /* If we are doing merge sort clean current connection and return + * NULL, otherwise remove current connection, move last in-place, + * adjust current_conn and continue if it is not last connection */ + if (combiner->merge_sort) + { + combiner->connections[combiner->current_conn] = NULL; + return NULL; + } + REMOVE_CURR_CONN(combiner); + if (combiner->conn_count > 0) + conn = combiner->connections[combiner->current_conn]; + else + return NULL; + } + else if (res == RESPONSE_TUPDESC) + { + ExecSetSlotDescriptor(combiner->ss.ps.ps_ResultTupleSlot, + combiner->tuple_desc); + /* Now slot is responsible for freeng the descriptor */ + combiner->tuple_desc = NULL; + } + else if (res == RESPONSE_ASSIGN_GXID) + { + /* Do nothing. 
It must have been handled in handle_response() */ + } + else + { + // Can not get here? + Assert(false); + } + } + + return NULL; +} + + +/* + * Handle responses from the Datanode connections + */ +static int +pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, + struct timeval * timeout, ResponseCombiner *combiner) +{ + int count = conn_count; + PGXCNodeHandle *to_receive[conn_count]; + + /* make a copy of the pointers to the connections */ + memcpy(to_receive, connections, conn_count * sizeof(PGXCNodeHandle *)); + + /* + * Read results. + * Note we try and read from Datanode connections even if there is an error on one, + * so as to avoid reading incorrect results on the next statement. + * Other safegaurds exist to avoid this, however. + */ + while (count > 0) + { + int i = 0; + + if (pgxc_node_receive(count, to_receive, timeout)) + return EOF; + while (i < count) + { + int result = handle_response(to_receive[i], combiner); + switch (result) + { + case RESPONSE_EOF: /* have something to read, keep receiving */ + i++; + break; + case RESPONSE_COMPLETE: + if (to_receive[i]->state != DN_CONNECTION_STATE_ERROR_FATAL) + /* Continue read until ReadyForQuery */ + break; + /* fallthru */ + case RESPONSE_READY: + /* fallthru */ + case RESPONSE_COPY: + /* Handling is done, do not track this connection */ + count--; + /* Move last connection in place */ + if (i < count) + to_receive[i] = to_receive[count]; + break; + case RESPONSE_ERROR: + /* no handling needed, just wait for ReadyForQuery */ + break; + + case RESPONSE_WAITXIDS: + break; + + case RESPONSE_ASSIGN_GXID: + break; + + default: + /* Inconsistent responses */ + add_error_message(to_receive[i], "Unexpected response from the Datanodes"); + elog(ERROR, "Unexpected response from the Datanodes, result = %d, request type %d", result, combiner->request_type); + /* Stop tracking and move last connection in place */ + count--; + if (i < count) + to_receive[i] = to_receive[count]; + } + } + } 
+ + return 0; +} + +/* + * Read next message from the connection and update the combiner + * and connection state accordingly + * If we are in an error state we just consume the messages, and do not proxy + * Long term, we should look into cancelling executing statements + * and closing the connections. + * It returns if states need to be handled + * Return values: + * RESPONSE_EOF - need to receive more data for the connection + * RESPONSE_READY - got ReadyForQuery + * RESPONSE_COMPLETE - done with the connection, but not yet ready for query. + * Also this result is output in case of error + * RESPONSE_SUSPENDED - got PortalSuspended + * RESPONSE_TUPLEDESC - got tuple description + * RESPONSE_DATAROW - got data row + * RESPONSE_COPY - got copy response + * RESPONSE_BARRIER_OK - barrier command completed successfully + */ +int +handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner) +{ + char *msg; + int msg_len; + char msg_type; + + for (;;) + { + /* + * If we are in the process of shutting down, we + * may be rolling back, and the buffer may contain other messages. + * We want to avoid a procarray exception + * as well as an error stack overflow. + */ + if (proc_exit_inprogress) + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + + /* + * Don't read from from the connection if there is a fatal error. + * We still return RESPONSE_COMPLETE, not RESPONSE_ERROR, since + * Handling of RESPONSE_ERROR assumes sending SYNC message, but + * State DN_CONNECTION_STATE_ERROR_FATAL indicates connection is + * not usable. 
+ */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + return RESPONSE_COMPLETE; + + /* No data available, exit */ + if (!HAS_MESSAGE_BUFFERED(conn)) + return RESPONSE_EOF; + + Assert(conn->combiner == combiner || conn->combiner == NULL); + + /* TODO handle other possible responses */ + msg_type = get_message(conn, &msg_len, &msg); + switch (msg_type) + { + case '\0': /* Not enough data in the buffer */ + return RESPONSE_EOF; + case 'c': /* CopyToCommandComplete */ + HandleCopyOutComplete(combiner); + break; + case 'C': /* CommandComplete */ + HandleCommandComplete(combiner, msg, msg_len, conn); + conn->combiner = NULL; + if (conn->state == DN_CONNECTION_STATE_QUERY) + conn->state = DN_CONNECTION_STATE_IDLE; + return RESPONSE_COMPLETE; + case 'T': /* RowDescription */ +#ifdef DN_CONNECTION_DEBUG + Assert(!conn->have_row_desc); + conn->have_row_desc = true; +#endif + if (HandleRowDescription(combiner, msg, msg_len)) + return RESPONSE_TUPDESC; + break; + case 'D': /* DataRow */ +#ifdef DN_CONNECTION_DEBUG + Assert(conn->have_row_desc); +#endif + /* Do not return if data row has not been actually handled */ + if (HandleDataRow(combiner, msg, msg_len, conn->nodeoid)) + return RESPONSE_DATAROW; + break; + case 's': /* PortalSuspended */ + /* No activity is expected on the connection until next query */ + conn->state = DN_CONNECTION_STATE_IDLE; + conn->combiner = NULL; + return RESPONSE_SUSPENDED; + case '1': /* ParseComplete */ + case '2': /* BindComplete */ + case '3': /* CloseComplete */ + case 'n': /* NoData */ + /* simple notifications, continue reading */ + break; + case 'G': /* CopyInResponse */ + conn->state = DN_CONNECTION_STATE_COPY_IN; + HandleCopyIn(combiner); + /* Done, return to caller to let it know the data can be passed in */ + return RESPONSE_COPY; + case 'H': /* CopyOutResponse */ + conn->state = DN_CONNECTION_STATE_COPY_OUT; + HandleCopyOut(combiner); + return RESPONSE_COPY; + case 'd': /* CopyOutDataRow */ + conn->state = 
DN_CONNECTION_STATE_COPY_OUT; + HandleCopyDataRow(combiner, msg, msg_len); + break; + case 'E': /* ErrorResponse */ + HandleError(combiner, msg, msg_len, conn); + add_error_message(conn, combiner->errorMessage); + return RESPONSE_ERROR; + case 'A': /* NotificationResponse */ + case 'N': /* NoticeResponse */ + case 'S': /* SetCommandComplete */ + /* + * Ignore these to prevent multiple messages, one from each + * node. Coordinator will send one for DDL anyway + */ + break; + case 'Z': /* ReadyForQuery */ + { + /* + * Return result depends on previous connection state. + * If it was PORTAL_SUSPENDED Coordinator want to send down + * another EXECUTE to fetch more rows, otherwise it is done + * with the connection + */ + conn->transaction_status = msg[0]; + conn->state = DN_CONNECTION_STATE_IDLE; + conn->combiner = NULL; +#ifdef DN_CONNECTION_DEBUG + conn->have_row_desc = false; +#endif + return RESPONSE_READY; + } + case 'M': /* Command Id */ + HandleDatanodeCommandId(combiner, msg, msg_len); + break; + case 'b': + conn->state = DN_CONNECTION_STATE_IDLE; + return RESPONSE_BARRIER_OK; + case 'I': /* EmptyQuery */ + return RESPONSE_COMPLETE; + case 'W': + HandleWaitXids(msg, msg_len); + return RESPONSE_WAITXIDS; + case 'x': + HandleGlobalTransactionId(msg, msg_len); + return RESPONSE_ASSIGN_GXID; + default: + /* sync lost? */ + elog(WARNING, "Received unsupported message type: %c", msg_type); + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + /* stop reading */ + return RESPONSE_COMPLETE; + } + } + /* never happen, but keep compiler quiet */ + return RESPONSE_EOF; +} + +/* + * Has the data node sent Ready For Query + */ + +bool +is_data_node_ready(PGXCNodeHandle * conn) +{ + char *msg; + int msg_len; + char msg_type; + + for (;;) + { + /* + * If we are in the process of shutting down, we + * may be rolling back, and the buffer may contain other messages. + * We want to avoid a procarray exception + * as well as an error stack overflow. 
+ */ + if (proc_exit_inprogress) + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + + /* don't read from from the connection if there is a fatal error */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + return true; + + /* No data available, exit */ + if (!HAS_MESSAGE_BUFFERED(conn)) + return false; + + msg_type = get_message(conn, &msg_len, &msg); + if (msg_type == 'Z') + { + /* + * Return result depends on previous connection state. + * If it was PORTAL_SUSPENDED Coordinator want to send down + * another EXECUTE to fetch more rows, otherwise it is done + * with the connection + */ + conn->transaction_status = msg[0]; + conn->state = DN_CONNECTION_STATE_IDLE; + conn->combiner = NULL; + return true; + } + } + /* never happen, but keep compiler quiet */ + return false; +} + + +/* + * Send BEGIN command to the Datanodes or Coordinators and receive responses. + * Also send the GXID for the transaction. + */ +static int +pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, + GlobalTransactionId gxid, bool need_tran_block, + bool readOnly, char node_type) +{ + int i; + struct timeval *timeout = NULL; + ResponseCombiner combiner; + TimestampTz timestamp = GetCurrentGTMStartTimestamp(); + PGXCNodeHandle *new_connections[conn_count]; + int new_count = 0; + char *init_str; + char lxid[13]; + + /* + * If no remote connections, we don't have anything to do + */ + if (conn_count == 0) + return 0; + + for (i = 0; i < conn_count; i++) + { + if (!readOnly && !IsConnFromDatanode()) + connections[i]->read_only = false; + /* + * PGXC TODO - A connection should not be in DN_CONNECTION_STATE_QUERY + * state when we are about to send a BEGIN TRANSACTION command to the + * node. 
We should consider changing the following to an assert and fix + * any bugs reported + */ + if (connections[i]->state == DN_CONNECTION_STATE_QUERY) + BufferConnection(connections[i]); + + /* Send GXID and check for errors */ + if (GlobalTransactionIdIsValid(gxid) && pgxc_node_send_gxid(connections[i], gxid)) + return EOF; + + /* Send timestamp and check for errors */ + if (GlobalTimestampIsValid(timestamp) && pgxc_node_send_timestamp(connections[i], timestamp)) + return EOF; + + if (IS_PGXC_DATANODE && GlobalTransactionIdIsValid(gxid)) + need_tran_block = true; + else if (IS_PGXC_REMOTE_COORDINATOR) + need_tran_block = false; + /* Send BEGIN if not already in transaction */ + if (need_tran_block && connections[i]->transaction_status == 'I') + { + /* Send the BEGIN TRANSACTION command and check for errors */ + if (pgxc_node_send_query(connections[i], "BEGIN")) + return EOF; + + new_connections[new_count++] = connections[i]; + } + } + + /* + * If we did not send a BEGIN command to any node, we are done. 
Otherwise, + * we need to check for any errors and report them + */ + if (new_count == 0) + return 0; + + InitResponseCombiner(&combiner, new_count, COMBINE_TYPE_NONE); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); + + /* Receive responses */ + if (pgxc_node_receive_responses(new_count, new_connections, timeout, &combiner)) + return EOF; + + /* Verify status */ + if (!ValidateAndCloseCombiner(&combiner)) + return EOF; + + /* Send virtualXID to the remote nodes using SET command */ + sprintf(lxid, "%d", MyProc->lxid); + PGXCNodeSetParam(true, "coordinator_lxid", lxid); + + /* after transactions are started send down local set commands */ + init_str = PGXCNodeGetTransactionParamStr(); + if (init_str) + { + for (i = 0; i < new_count; i++) + { + pgxc_node_set_query(new_connections[i], init_str); + } + } + + /* No problem, let's get going */ + return 0; +} + + +/* + * Execute DISCARD ALL command on all allocated nodes to remove all session + * specific stuff before releasing them to pool for reuse by other sessions. + */ +static void +pgxc_node_remote_cleanup_all(void) +{ + PGXCNodeAllHandles *handles = get_current_handles(); + PGXCNodeHandle *new_connections[handles->co_conn_count + handles->dn_conn_count]; + int new_conn_count = 0; + int i; + char *resetcmd = "RESET ALL;RESET SESSION AUTHORIZATION;" + "RESET transaction_isolation;"; + + /* + * We must handle reader and writer connections both since even a read-only + * needs to be cleaned up. + */ + if (handles->co_conn_count + handles->dn_conn_count == 0) + return; + + /* + * Send down snapshot followed by DISCARD ALL command. 
+ */ + for (i = 0; i < handles->co_conn_count; i++) + { + PGXCNodeHandle *handle = handles->coord_handles[i]; + + /* At this point connection should be in IDLE state */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + { + handle->state = DN_CONNECTION_STATE_ERROR_FATAL; + continue; + } + + /* + * We must go ahead and release connections anyway, so do not throw + * an error if we have a problem here. + */ + if (pgxc_node_send_query(handle, resetcmd)) + { + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to clean up data nodes"))); + handle->state = DN_CONNECTION_STATE_ERROR_FATAL; + continue; + } + new_connections[new_conn_count++] = handle; + handle->combiner = NULL; + } + for (i = 0; i < handles->dn_conn_count; i++) + { + PGXCNodeHandle *handle = handles->datanode_handles[i]; + + /* At this point connection should be in IDLE state */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + { + handle->state = DN_CONNECTION_STATE_ERROR_FATAL; + continue; + } + + /* + * We must go ahead and release connections anyway, so do not throw + * an error if we have a problem here. + */ + if (pgxc_node_send_query(handle, resetcmd)) + { + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to clean up data nodes"))); + handle->state = DN_CONNECTION_STATE_ERROR_FATAL; + continue; + } + new_connections[new_conn_count++] = handle; + handle->combiner = NULL; + } + + if (new_conn_count) + { + ResponseCombiner combiner; + InitResponseCombiner(&combiner, new_conn_count, COMBINE_TYPE_NONE); + /* Receive responses */ + pgxc_node_receive_responses(new_conn_count, new_connections, NULL, &combiner); + CloseCombiner(&combiner); + } + pfree_pgxc_all_handles(handles); +} + + +/* + * Prepare nodes which ran write operations during the transaction. + * Read only remote transactions are committed and connections are released + * back to the pool. 
+ * Function returns the list of nodes where transaction is prepared, including + * local node, if requested, in format expected by the GTM server. + * If something went wrong the function tries to abort prepared transactions on + * the nodes where it succeeded and throws error. A warning is emitted if abort + * prepared fails. + * After completion remote connection handles are released. + */ +static char * +pgxc_node_remote_prepare(char *prepareGID, bool localNode) +{ + bool isOK = true; + StringInfoData nodestr; + char prepare_cmd[256]; + char abort_cmd[256]; + GlobalTransactionId auxXid; + char *commit_cmd = "COMMIT TRANSACTION"; + int i; + ResponseCombiner combiner; + PGXCNodeHandle *connections[MaxDataNodes + MaxCoords]; + int conn_count = 0; + PGXCNodeAllHandles *handles = get_current_handles(); + + initStringInfo(&nodestr); + if (localNode) + appendStringInfoString(&nodestr, PGXCNodeName); + + sprintf(prepare_cmd, "PREPARE TRANSACTION '%s'", prepareGID); + + for (i = 0; i < handles->dn_conn_count; i++) + { + PGXCNodeHandle *conn = handles->datanode_handles[i]; + + /* + * If something went wrong already we have nothing to do here. The error + * will be reported at the end of the function, and we will rollback + * remotes as part of the error handling. + * Just skip to clean up section and check if we have already prepared + * somewhere, we should abort that prepared transaction. + */ + if (!isOK) + goto prepare_err; + + /* + * Skip empty slots + */ + if (conn->sock == NO_SOCKET) + continue; + else if (conn->transaction_status == 'T') + { + /* Read in any pending input */ + if (conn->state != DN_CONNECTION_STATE_IDLE) + BufferConnection(conn); + + if (conn->read_only) + { + /* Send down prepare command */ + if (pgxc_node_send_query(conn, commit_cmd)) + { + /* + * not a big deal, it was read only, the connection will be + * abandoned later. 
+ */ + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send COMMIT command to " + "the node %u", conn->nodeoid))); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + else + { + /* Send down prepare command */ + if (pgxc_node_send_query(conn, prepare_cmd)) + { + /* + * That is the trouble, we really want to prepare it. + * Just emit warning so far and go to clean up. + */ + isOK = false; + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send PREPARE TRANSACTION command to " + "the node %u", conn->nodeoid))); + } + else + { + char *nodename = get_pgxc_nodename(conn->nodeoid); + if (nodestr.len > 0) + appendStringInfoChar(&nodestr, ','); + appendStringInfoString(&nodestr, nodename); + /* Read responses from these */ + connections[conn_count++] = conn; + /* + * If it fails on remote node it would just return ROLLBACK. + * Set the flag for the message handler so the response is + * verified. + */ + conn->ck_resp_rollback = true; + } + } + } + else if (conn->transaction_status == 'E') + { + /* + * Probably can not happen, if there was a error the engine would + * abort anyway, even in case of explicit PREPARE. + * Anyway, just in case... + */ + isOK = false; + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("remote node %u is in error state", conn->nodeoid))); + } + } + + for (i = 0; i < handles->co_conn_count; i++) + { + PGXCNodeHandle *conn = handles->coord_handles[i]; + + /* + * If something went wrong already we have nothing to do here. The error + * will be reported at the end of the function, and we will rollback + * remotes as part of the error handling. + * Just skip to clean up section and check if we have already prepared + * somewhere, we should abort that prepared transaction. 
+ */ + if (!isOK) + goto prepare_err; + + /* + * Skip empty slots + */ + if (conn->sock == NO_SOCKET) + continue; + else if (conn->transaction_status == 'T') + { + if (conn->read_only) + { + /* Send down prepare command */ + if (pgxc_node_send_query(conn, commit_cmd)) + { + /* + * not a big deal, it was read only, the connection will be + * abandoned later. + */ + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send COMMIT command to " + "the node %u", conn->nodeoid))); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + else + { + /* Send down prepare command */ + if (pgxc_node_send_query(conn, prepare_cmd)) + { + /* + * That is the trouble, we really want to prepare it. + * Just emit warning so far and go to clean up. + */ + isOK = false; + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send PREPARE TRANSACTION command to " + "the node %u", conn->nodeoid))); + } + else + { + char *nodename = get_pgxc_nodename(conn->nodeoid); + if (nodestr.len > 0) + appendStringInfoChar(&nodestr, ','); + appendStringInfoString(&nodestr, nodename); + /* Read responses from these */ + connections[conn_count++] = conn; + /* + * If it fails on remote node it would just return ROLLBACK. + * Set the flag for the message handler so the response is + * verified. + */ + conn->ck_resp_rollback = true; + } + } + } + else if (conn->transaction_status == 'E') + { + /* + * Probably can not happen, if there was a error the engine would + * abort anyway, even in case of explicit PREPARE. + * Anyway, just in case... + */ + isOK = false; + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("remote node %u is in error state", conn->nodeoid))); + } + } + + SetSendCommandId(false); + + if (!isOK) + goto prepare_err; + + /* exit if nothing has been prepared */ + if (conn_count > 0) + { + int result; + /* + * Receive and check for any errors. In case of errors, we don't bail out + * just yet. 
We first go through the list of connections and look for + * errors on each connection. This is important to ensure that we run + * an appropriate ROLLBACK command later on (prepared transactions must be + * rolled back with ROLLBACK PREPARED commands). + * + * PGXCTODO - There doesn't seem to be a solid mechanism to track errors on + * individual connections. The transaction_status field doesn't get set + * every time there is an error on the connection. The combiner mechanism is + * good for parallel proessing, but I think we should have a leak-proof + * mechanism to track connection status + */ + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + /* Receive responses */ + result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner); + if (result || !validate_combiner(&combiner)) + goto prepare_err; + else + CloseCombiner(&combiner); + + /* Before exit clean the flag, to avoid unnecessary checks */ + for (i = 0; i < conn_count; i++) + connections[i]->ck_resp_rollback = false; + + pfree_pgxc_all_handles(handles); + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(); + } + } + + return nodestr.data; +prepare_err: + sprintf(abort_cmd, "ROLLBACK PREPARED '%s'", prepareGID); + + auxXid = GetAuxilliaryTransactionId(); + conn_count = 0; + for (i = 0; i < handles->dn_conn_count; i++) + { + PGXCNodeHandle *conn = handles->datanode_handles[i]; + + /* + * PREPARE succeeded on that node, roll it back there + */ + if (conn->ck_resp_rollback) + { + conn->ck_resp_rollback = false; + /* sanity checks */ + Assert(conn->sock != NO_SOCKET); + Assert(conn->state == DN_CONNECTION_STATE_IDLE); + /* Send down abort prepared command */ + if (pgxc_node_send_gxid(conn, auxXid)) + { + /* + * Prepared transaction is left on the node, but we can not + * do anything with that except warn the user. 
+ */ + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send xid to " + "the node %u", conn->nodeoid))); + } + if (pgxc_node_send_query(conn, abort_cmd)) + { + /* + * Prepared transaction is left on the node, but we can not + * do anything with that except warn the user. + */ + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send ABORT PREPARED command to " + "the node %u", conn->nodeoid))); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + } + for (i = 0; i < handles->co_conn_count; i++) + { + PGXCNodeHandle *conn = handles->coord_handles[i]; + + if (conn->ck_resp_rollback) + { + conn->ck_resp_rollback = false; + /* sanity checks */ + Assert(conn->sock != NO_SOCKET); + Assert(conn->state = DN_CONNECTION_STATE_IDLE); + /* Send down abort prepared command */ + if (pgxc_node_send_gxid(conn, auxXid)) + { + /* + * Prepared transaction is left on the node, but we can not + * do anything with that except warn the user. + */ + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send xid to " + "the node %u", conn->nodeoid))); + } + if (pgxc_node_send_query(conn, abort_cmd)) + { + /* + * Prepared transaction is left on the node, but we can not + * do anything with that except warn the user. 
+ */ + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send ABORT PREPARED command to " + "the node %u", conn->nodeoid))); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + } + if (conn_count > 0) + { + /* Just read out responses, throw error from the first combiner */ + ResponseCombiner combiner2; + InitResponseCombiner(&combiner2, conn_count, COMBINE_TYPE_NONE); + /* Receive responses */ + pgxc_node_receive_responses(conn_count, connections, NULL, &combiner2); + CloseCombiner(&combiner2); + } + + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(); + } + + pfree_pgxc_all_handles(handles); + + /* + * If the flag is set we are here because combiner carries error message + */ + if (isOK) + pgxc_node_report_error(&combiner); + else + elog(ERROR, "failed to PREPARE transaction on one or more nodes"); + return NULL; +} + + +/* + * Commit transactions on remote nodes. + * If barrier lock is set wait while it is released. + * Release remote connection after completion. + */ +static void +pgxc_node_remote_commit(void) +{ + int result = 0; + char *commitCmd = "COMMIT TRANSACTION"; + int i; + ResponseCombiner combiner; + PGXCNodeHandle *connections[MaxDataNodes + MaxCoords]; + int conn_count = 0; + PGXCNodeAllHandles *handles = get_current_handles(); + + SetSendCommandId(false); + + /* + * Barrier: + * + * We should acquire the BarrierLock in SHARE mode here to ensure that + * there are no in-progress barrier at this point. 
This mechanism would + * work as long as LWLock mechanism does not starve a EXCLUSIVE lock + * requester + */ + LWLockAcquire(BarrierLock, LW_SHARED); + + for (i = 0; i < handles->dn_conn_count; i++) + { + PGXCNodeHandle *conn = handles->datanode_handles[i]; + + /* Skip empty slots */ + if (conn->sock == NO_SOCKET) + continue; + + /* + * We do not need to commit remote node if it is not in transaction. + * If transaction is in error state the commit command will cause + * rollback, that is OK + */ + if (conn->transaction_status != 'I') + { + /* Read in any pending input */ + if (conn->state != DN_CONNECTION_STATE_IDLE) + BufferConnection(conn); + + if (pgxc_node_send_query(conn, commitCmd)) + { + /* + * Do not bother with clean up, just bomb out. The error handler + * will invoke RollbackTransaction which will do the work. + */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send COMMIT command to the node %u", + conn->nodeoid))); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + } + + for (i = 0; i < handles->co_conn_count; i++) + { + PGXCNodeHandle *conn = handles->coord_handles[i]; + + /* Skip empty slots */ + if (conn->sock == NO_SOCKET) + continue; + + /* + * We do not need to commit remote node if it is not in transaction. + * If transaction is in error state the commit command will cause + * rollback, that is OK + */ + if (conn->transaction_status != 'I') + { + if (pgxc_node_send_query(conn, commitCmd)) + { + /* + * Do not bother with clean up, just bomb out. The error handler + * will invoke RollbackTransaction which will do the work. + */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send COMMIT command to the node %u", + conn->nodeoid))); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + } + + /* + * Release the BarrierLock. 
+ */ + LWLockRelease(BarrierLock); + + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + /* Receive responses */ + result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner); + if (result || !validate_combiner(&combiner)) + result = EOF; + else + CloseCombiner(&combiner); + } + + stat_transaction(conn_count); + + if (result) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to COMMIT the transaction on one or more nodes"))); + } + + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(); + } + + pfree_pgxc_all_handles(handles); +} + + +/* + * Rollback transactions on remote nodes. + * Release remote connection after completion. + */ +static void +pgxc_node_remote_abort(void) +{ + int result = 0; + char *rollbackCmd = "ROLLBACK TRANSACTION"; + int i; + ResponseCombiner combiner; + PGXCNodeHandle *connections[MaxDataNodes + MaxCoords]; + int conn_count = 0; + PGXCNodeAllHandles *handles = get_current_handles(); + + SetSendCommandId(false); + + for (i = 0; i < handles->dn_conn_count; i++) + { + PGXCNodeHandle *conn = handles->datanode_handles[i]; + + /* Skip empty slots */ + if (conn->sock == NO_SOCKET) + continue; + + if (conn->transaction_status != 'I') + { + /* Read in any pending input */ + if (conn->state != DN_CONNECTION_STATE_IDLE) + BufferConnection(conn); + + /* + * Do not matter, is there committed or failed transaction, + * just send down rollback to finish it. 
+ */ + if (pgxc_node_send_query(conn, rollbackCmd)) + { + add_error_message(conn, + "failed to send ROLLBACK TRANSACTION command"); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + } + + for (i = 0; i < handles->co_conn_count; i++) + { + PGXCNodeHandle *conn = handles->coord_handles[i]; + + /* Skip empty slots */ + if (conn->sock == NO_SOCKET) + continue; + + if (conn->transaction_status != 'I') + { + /* + * Do not matter, is there committed or failed transaction, + * just send down rollback to finish it. + */ + if (pgxc_node_send_query(conn, rollbackCmd)) + { + add_error_message(conn, + "failed to send ROLLBACK TRANSACTION command"); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + } + + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + /* Receive responses */ + result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner); + if (result || !validate_combiner(&combiner)) + result = EOF; + else + CloseCombiner(&combiner); + } + + stat_transaction(conn_count); + + if (result) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to ROLLBACK the transaction on one or more nodes"))); + } + + pfree_pgxc_all_handles(handles); +} + +/* + * Begin COPY command + * The copy_connections array must have room for NumDataNodes items + */ +void +DataNodeCopyBegin(RemoteCopyData *rcstate) +{ + int i; + List *nodelist = rcstate->rel_loc->nodeList; + PGXCNodeHandle **connections; + bool need_tran_block; + GlobalTransactionId gxid; + ResponseCombiner combiner; + Snapshot snapshot = GetActiveSnapshot(); + int conn_count = list_length(nodelist); + + /* Get needed datanode connections */ + if (!rcstate->is_from && IsLocatorReplicated(rcstate->rel_loc->locatorType)) + { + /* Connections is a single handle to read from */ + connections = (PGXCNodeHandle 
**) palloc(sizeof(PGXCNodeHandle *)); + connections[0] = get_any_handle(nodelist); + conn_count = 1; + } + else + { + PGXCNodeAllHandles *pgxc_handles; + pgxc_handles = get_handles(nodelist, NULL, false, true); + connections = pgxc_handles->datanode_handles; + Assert(pgxc_handles->dn_conn_count == conn_count); + pfree(pgxc_handles); + } + + /* + * If more than one nodes are involved or if we are already in a + * transaction block, we must the remote statements in a transaction block + */ + need_tran_block = (conn_count > 1) || (TransactionBlockStatusCode() == 'T'); + + elog(DEBUG1, "conn_count = %d, need_tran_block = %s", conn_count, + need_tran_block ? "true" : "false"); + + /* Gather statistics */ + stat_statement(); + stat_transaction(conn_count); + + gxid = GetCurrentTransactionId(); + + /* Start transaction on connections where it is not started */ + if (pgxc_node_begin(conn_count, connections, gxid, need_tran_block, false, PGXC_NODE_DATANODE)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data nodes."))); + } + + /* + * COPY TO do not use locator, it just takes connections from it, and + * we do not look up distribution data type in this case. + * So always use LOCATOR_TYPE_RROBIN to avoid errors because of not + * defined partType if real locator type is HASH or MODULO. + * Create locator before sending down query, because createLocator may + * fail and we leave with dirty connections. + * If we get an error now datanode connection will be clean and error + * handler will issue transaction abort. + */ + rcstate->locator = createLocator( + rcstate->is_from ? rcstate->rel_loc->locatorType + : LOCATOR_TYPE_RROBIN, + rcstate->is_from ? 
RELATION_ACCESS_INSERT : RELATION_ACCESS_READ, + rcstate->dist_type, + LOCATOR_LIST_POINTER, + conn_count, + (void *) connections, + NULL, + false); + + /* Send query to nodes */ + for (i = 0; i < conn_count; i++) + { + CHECK_OWNERSHIP(connections[i], NULL); + + if (snapshot && pgxc_node_send_snapshot(connections[i], snapshot)) + { + add_error_message(connections[i], "Can not send request"); + pfree(connections); + freeLocator(rcstate->locator); + rcstate->locator = NULL; + return; + } + if (pgxc_node_send_query(connections[i], rcstate->query_buf.data) != 0) + { + add_error_message(connections[i], "Can not send request"); + pfree(connections); + freeLocator(rcstate->locator); + rcstate->locator = NULL; + return; + } + } + + /* + * We are expecting CopyIn response, but do not want to send it to client, + * caller should take care about this, because here we do not know if + * client runs console or file copy + */ + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); + + /* Receive responses */ + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) + || !ValidateAndCloseCombiner(&combiner)) + { + DataNodeCopyFinish(conn_count, connections); + freeLocator(rcstate->locator); + rcstate->locator = NULL; + return; + } + pfree(connections); +} + + +/* + * Send a data row to the specified nodes + */ +int +DataNodeCopyIn(char *data_row, int len, int conn_count, PGXCNodeHandle** copy_connections) +{ + /* size + data row + \n */ + int msgLen = 4 + len + 1; + int nLen = htonl(msgLen); + int i; + + for(i = 0; i < conn_count; i++) + { + PGXCNodeHandle *handle = copy_connections[i]; + if (handle->state == DN_CONNECTION_STATE_COPY_IN) + { + /* precalculate to speed up access */ + int bytes_needed = handle->outEnd + 1 + msgLen; + + /* flush buffer if it is almost full */ + if (bytes_needed > COPY_BUFFER_SIZE) + { + int to_send = handle->outEnd; + + /* 
First look if data node has sent a error message */ + int read_status = pgxc_node_read_data(handle, true); + if (read_status == EOF || read_status < 0) + { + add_error_message(handle, "failed to read data from data node"); + return EOF; + } + + if (handle->inStart < handle->inEnd) + { + ResponseCombiner combiner; + InitResponseCombiner(&combiner, 1, COMBINE_TYPE_NONE); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); + handle_response(handle, &combiner); + if (!ValidateAndCloseCombiner(&combiner)) + return EOF; + } + + if (DN_CONNECTION_STATE_ERROR(handle)) + return EOF; + + /* + * Try to send down buffered data if we have + */ + if (to_send && send_some(handle, to_send) < 0) + { + add_error_message(handle, "failed to send data to data node"); + return EOF; + } + } + + if (ensure_out_buffer_capacity(bytes_needed, handle) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + handle->outBuffer[handle->outEnd++] = 'd'; + memcpy(handle->outBuffer + handle->outEnd, &nLen, 4); + handle->outEnd += 4; + memcpy(handle->outBuffer + handle->outEnd, data_row, len); + handle->outEnd += len; + handle->outBuffer[handle->outEnd++] = '\n'; + } + else + { + add_error_message(handle, "Invalid data node connection"); + return EOF; + } + } + return 0; +} + +uint64 +DataNodeCopyOut(PGXCNodeHandle** copy_connections, + int conn_count, FILE* copy_file) +{ + ResponseCombiner combiner; + uint64 processed; + bool error; + + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_SUM); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); + combiner.processed = 0; + /* If there is an existing file where to copy data, pass it to combiner */ + if (copy_file) + { + combiner.copy_file = copy_file; + combiner.remoteCopyType = REMOTE_COPY_FILE; + } + else + { + combiner.copy_file = NULL; + combiner.remoteCopyType = REMOTE_COPY_STDOUT; + } + error = 
(pgxc_node_receive_responses(conn_count, copy_connections, NULL, &combiner) != 0); + + processed = combiner.processed; + + if (!ValidateAndCloseCombiner(&combiner) || error) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("Unexpected response from the data nodes when combining, request type %d", combiner.request_type))); + } + + return processed; +} + + +uint64 +DataNodeCopyStore(PGXCNodeHandle** copy_connections, + int conn_count, Tuplestorestate* store) +{ + ResponseCombiner combiner; + uint64 processed; + bool error; + + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_SUM); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); + combiner.processed = 0; + combiner.remoteCopyType = REMOTE_COPY_TUPLESTORE; + combiner.tuplestorestate = store; + + error = (pgxc_node_receive_responses(conn_count, copy_connections, NULL, &combiner) != 0); + + processed = combiner.processed; + + if (!ValidateAndCloseCombiner(&combiner) || error) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("Unexpected response from the data nodes when combining, request type %d", combiner.request_type))); + } + + return processed; +} + + +/* + * Finish copy process on all connections + */ +void +DataNodeCopyFinish(int conn_count, PGXCNodeHandle** connections) +{ + int i; + ResponseCombiner combiner; + bool error = false; + for (i = 0; i < conn_count; i++) + { + PGXCNodeHandle *handle = connections[i]; + + error = true; + if (handle->state == DN_CONNECTION_STATE_COPY_IN || handle->state == DN_CONNECTION_STATE_COPY_OUT) + error = DataNodeCopyEnd(handle, false); + } + + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); + error = (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) != 0) || error; + + if (!validate_combiner(&combiner) || error) + { + if (combiner.errorMessage) + 
pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Error while running COPY"))); + } + else + CloseCombiner(&combiner); +} + +/* + * End copy process on a connection + */ +bool +DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error) +{ + int nLen = htonl(4); + + if (handle == NULL) + return true; + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + 4, handle) != 0) + return true; + + if (is_error) + handle->outBuffer[handle->outEnd++] = 'f'; + else + handle->outBuffer[handle->outEnd++] = 'c'; + + memcpy(handle->outBuffer + handle->outEnd, &nLen, 4); + handle->outEnd += 4; + + /* We need response right away, so send immediately */ + if (pgxc_node_flush(handle) < 0) + return true; + + return false; +} + + +/* + * Get Node connections depending on the connection type: + * Datanodes Only, Coordinators only or both types + */ +static PGXCNodeAllHandles * +get_exec_connections(RemoteQueryState *planstate, + ExecNodes *exec_nodes, + RemoteQueryExecType exec_type, + bool is_global_session) +{ + List *nodelist = NIL; + List *primarynode = NIL; + List *coordlist = NIL; + PGXCNodeHandle *primaryconnection; + int co_conn_count, dn_conn_count; + bool is_query_coord_only = false; + PGXCNodeAllHandles *pgxc_handles = NULL; + + /* + * If query is launched only on Coordinators, we have to inform get_handles + * not to ask for Datanode connections even if list of Datanodes is NIL. 
+ */ + if (exec_type == EXEC_ON_COORDS) + is_query_coord_only = true; + + if (exec_type == EXEC_ON_CURRENT) + return get_current_handles(); + + if (exec_nodes) + { + if (exec_nodes->en_expr) + { + /* execution time determining of target Datanodes */ + bool isnull; + ExprState *estate = ExecInitExpr(exec_nodes->en_expr, + (PlanState *) planstate); + Datum partvalue = ExecEvalExpr(estate, + planstate->combiner.ss.ps.ps_ExprContext, + &isnull, + NULL); + RelationLocInfo *rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid); + /* PGXCTODO what is the type of partvalue here */ + ExecNodes *nodes = GetRelationNodes(rel_loc_info, + partvalue, + isnull, + exec_nodes->accesstype); + /* + * en_expr is set by pgxc_set_en_expr only for distributed + * relations while planning DMLs, hence a select for update + * on a replicated table here is an assertion + */ + Assert(!(exec_nodes->accesstype == RELATION_ACCESS_READ_FOR_UPDATE && + IsRelationReplicated(rel_loc_info))); + + if (nodes) + { + nodelist = nodes->nodeList; + primarynode = nodes->primarynodelist; + pfree(nodes); + } + FreeRelationLocInfo(rel_loc_info); + } + else if (OidIsValid(exec_nodes->en_relid)) + { + RelationLocInfo *rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid); + ExecNodes *nodes = GetRelationNodes(rel_loc_info, 0, true, exec_nodes->accesstype); + + /* + * en_relid is set only for DMLs, hence a select for update on a + * replicated table here is an assertion + */ + Assert(!(exec_nodes->accesstype == RELATION_ACCESS_READ_FOR_UPDATE && + IsRelationReplicated(rel_loc_info))); + + /* Use the obtained list for given table */ + if (nodes) + nodelist = nodes->nodeList; + + /* + * Special handling for ROUND ROBIN distributed tables. 
The target + * node must be determined at the execution time + */ + if (rel_loc_info->locatorType == LOCATOR_TYPE_RROBIN && nodes) + { + nodelist = nodes->nodeList; + primarynode = nodes->primarynodelist; + } + else if (nodes) + { + if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES) + { + nodelist = exec_nodes->nodeList; + primarynode = exec_nodes->primarynodelist; + } + } + + if (nodes) + pfree(nodes); + FreeRelationLocInfo(rel_loc_info); + } + else + { + if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES) + nodelist = exec_nodes->nodeList; + else if (exec_type == EXEC_ON_COORDS) + coordlist = exec_nodes->nodeList; + + primarynode = exec_nodes->primarynodelist; + } + } + + /* Set node list and DN number */ + if (list_length(nodelist) == 0 && + (exec_type == EXEC_ON_ALL_NODES || + exec_type == EXEC_ON_DATANODES)) + { + /* Primary connection is included in this number of connections if it exists */ + dn_conn_count = NumDataNodes; + } + else + { + if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES) + { + if (primarynode) + dn_conn_count = list_length(nodelist) + 1; + else + dn_conn_count = list_length(nodelist); + } + else + dn_conn_count = 0; + } + + /* Set Coordinator list and Coordinator number */ + if ((list_length(nodelist) == 0 && exec_type == EXEC_ON_ALL_NODES) || + (list_length(coordlist) == 0 && exec_type == EXEC_ON_COORDS)) + { + coordlist = GetAllCoordNodes(); + co_conn_count = list_length(coordlist); + } + else + { + if (exec_type == EXEC_ON_COORDS) + co_conn_count = list_length(coordlist); + else + co_conn_count = 0; + } + + /* Get other connections (non-primary) */ + pgxc_handles = get_handles(nodelist, coordlist, is_query_coord_only, is_global_session); + if (!pgxc_handles) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not obtain connection from pool"))); + + /* Get connection for primary node, if used */ + if (primarynode) + { + /* Let's assume primary connection is always a 
Datanode connection for the moment */ + PGXCNodeAllHandles *pgxc_conn_res; + pgxc_conn_res = get_handles(primarynode, NULL, false, is_global_session); + + /* primary connection is unique */ + primaryconnection = pgxc_conn_res->datanode_handles[0]; + + pfree(pgxc_conn_res); + + if (!primaryconnection) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not obtain connection from pool"))); + pgxc_handles->primary_handle = primaryconnection; + } + + /* Depending on the execution type, we still need to save the initial node counts */ + pgxc_handles->dn_conn_count = dn_conn_count; + pgxc_handles->co_conn_count = co_conn_count; + + return pgxc_handles; +} + + +static bool +pgxc_start_command_on_connection(PGXCNodeHandle *connection, + RemoteQueryState *remotestate, + Snapshot snapshot) +{ + CommandId cid; + ResponseCombiner *combiner = (ResponseCombiner *) remotestate; + RemoteQuery *step = (RemoteQuery *) combiner->ss.ps.plan; + CHECK_OWNERSHIP(connection, combiner); + + /* + * Scan descriptor would be valid and would contain a valid snapshot + * in cases when we need to send out of order command id to data node + * e.g. 
in case of a fetch + */ + cid = GetCurrentCommandId(false); + + if (pgxc_node_send_cmd_id(connection, cid) < 0 ) + return false; + + if (snapshot && pgxc_node_send_snapshot(connection, snapshot)) + return false; + if (step->statement || step->cursor || step->remote_param_types) + { + /* need to use Extended Query Protocol */ + int fetch = 0; + bool prepared = false; + char nodetype = PGXC_NODE_DATANODE; + + /* if prepared statement is referenced see if it is already + * exist */ + if (step->statement) + prepared = + ActivateDatanodeStatementOnNode(step->statement, + PGXCNodeGetNodeId(connection->nodeoid, + &nodetype)); + + /* + * execute and fetch rows only if they will be consumed + * immediately by the sorter + */ + if (step->cursor) + fetch = 1; + + combiner->extended_query = true; + + if (pgxc_node_send_query_extended(connection, + prepared ? NULL : step->sql_statement, + step->statement, + step->cursor, + step->remote_num_params, + step->remote_param_types, + remotestate->paramval_len, + remotestate->paramval_data, + step->has_row_marks ? true : step->read_only, + fetch) != 0) + return false; + } + else + { + if (pgxc_node_send_query(connection, step->sql_statement) != 0) + return false; + } + return true; +} + +/* + * Encode parameter values to format of DataRow message (the same format is + * used in Bind) to prepare for sending down to Datanodes. + * The buffer to store encoded value is palloc'ed and returned as the result + * parameter. Function returns size of the result + */ +int +ParamListToDataRow(ParamListInfo params, char** result) +{ + StringInfoData buf; + uint16 n16; + int i; + int real_num_params = 0; + + /* + * It is necessary to fetch parameters + * before looking at the output value. 
+ */ + for (i = 0; i < params->numParams; i++) + { + ParamExternData *param; + + param = ¶ms->params[i]; + + if (!OidIsValid(param->ptype) && params->paramFetch != NULL) + (*params->paramFetch) (params, i + 1); + + /* + * This is the last parameter found as useful, so we need + * to include all the previous ones to keep silent the remote + * nodes. All the parameters prior to the last usable having no + * type available will be considered as NULL entries. + */ + if (OidIsValid(param->ptype)) + real_num_params = i + 1; + } + + /* + * If there are no parameters available, simply leave. + * This is possible in the case of a query called through SPI + * and using no parameters. + */ + if (real_num_params == 0) + { + *result = NULL; + return 0; + } + + initStringInfo(&buf); + + /* Number of parameter values */ + n16 = htons(real_num_params); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + + /* Parameter values */ + for (i = 0; i < real_num_params; i++) + { + ParamExternData *param = ¶ms->params[i]; + uint32 n32; + + /* + * Parameters with no types are considered as NULL and treated as integer + * The same trick is used for dropped columns for remote DML generation. + */ + if (param->isnull || !OidIsValid(param->ptype)) + { + n32 = htonl(-1); + appendBinaryStringInfo(&buf, (char *) &n32, 4); + } + else + { + Oid typOutput; + bool typIsVarlena; + Datum pval; + char *pstring; + int len; + + /* Get info needed to output the value */ + getTypeOutputInfo(param->ptype, &typOutput, &typIsVarlena); + + /* + * If we have a toasted datum, forcibly detoast it here to avoid + * memory leakage inside the type's output routine. 
+ */ + if (typIsVarlena) + pval = PointerGetDatum(PG_DETOAST_DATUM(param->value)); + else + pval = param->value; + + /* Convert Datum to string */ + pstring = OidOutputFunctionCall(typOutput, pval); + + /* copy data to the buffer */ + len = strlen(pstring); + n32 = htonl(len); + appendBinaryStringInfo(&buf, (char *) &n32, 4); + appendBinaryStringInfo(&buf, pstring, len); + } + } + + /* Take data from the buffer */ + *result = palloc(buf.len); + memcpy(*result, buf.data, buf.len); + pfree(buf.data); + return buf.len; +} + +/* + * Execute utility statement on multiple Datanodes + * It does approximately the same as + * + * RemoteQueryState *state = ExecInitRemoteQuery(plan, estate, flags); + * Assert(TupIsNull(ExecRemoteQuery(state)); + * ExecEndRemoteQuery(state) + * + * But does not need an Estate instance and does not do some unnecessary work, + * like allocating tuple slots. + */ +void +ExecRemoteUtility(RemoteQuery *node) +{ + RemoteQueryState *remotestate; + ResponseCombiner *combiner; + bool force_autocommit = node->force_autocommit; + RemoteQueryExecType exec_type = node->exec_type; + GlobalTransactionId gxid = InvalidGlobalTransactionId; + Snapshot snapshot = NULL; + PGXCNodeAllHandles *pgxc_connections; + int co_conn_count; + int dn_conn_count; + bool need_tran_block; + ExecDirectType exec_direct_type = node->exec_direct_type; + int i; + CommandId cid = GetCurrentCommandId(true); + + if (!force_autocommit) + RegisterTransactionLocalNode(true); + + remotestate = makeNode(RemoteQueryState); + combiner = (ResponseCombiner *)remotestate; + InitResponseCombiner(combiner, 0, node->combine_type); + + /* + * Do not set global_session if it is a utility statement. + * Avoids CREATE NODE error on cluster configuration. 
+ */ + pgxc_connections = get_exec_connections(NULL, node->exec_nodes, exec_type, + exec_direct_type != EXEC_DIRECT_UTILITY); + + dn_conn_count = pgxc_connections->dn_conn_count; + co_conn_count = pgxc_connections->co_conn_count; + /* exit right away if no nodes to run command on */ + if (dn_conn_count == 0 && co_conn_count == 0) + { + pfree_pgxc_all_handles(pgxc_connections); + return; + } + + if (force_autocommit) + need_tran_block = false; + else + need_tran_block = true; + + /* Commands launched through EXECUTE DIRECT do not need start a transaction */ + if (exec_direct_type == EXEC_DIRECT_UTILITY) + { + need_tran_block = false; + + /* This check is not done when analyzing to limit dependencies */ + if (IsTransactionBlock()) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + errmsg("cannot run EXECUTE DIRECT with utility inside a transaction block"))); + } + + gxid = GetCurrentTransactionId(); + if (ActiveSnapshotSet()) + snapshot = GetActiveSnapshot(); + if (!GlobalTransactionIdIsValid(gxid)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to get next transaction ID"))); + + { + if (pgxc_node_begin(dn_conn_count, pgxc_connections->datanode_handles, + gxid, need_tran_block, false, PGXC_NODE_DATANODE)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on Datanodes"))); + for (i = 0; i < dn_conn_count; i++) + { + PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i]; + + if (conn->state == DN_CONNECTION_STATE_QUERY) + BufferConnection(conn); + if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send snapshot to Datanodes"))); + } + if (pgxc_node_send_cmd_id(conn, cid) < 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command ID to Datanodes"))); + } + + if (pgxc_node_send_query(conn, node->sql_statement) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), 
+ errmsg("Failed to send command to Datanodes"))); + } + } + } + + { + if (pgxc_node_begin(co_conn_count, pgxc_connections->coord_handles, + gxid, need_tran_block, false, PGXC_NODE_COORDINATOR)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on coordinators"))); + /* Now send it to Coordinators if necessary */ + for (i = 0; i < co_conn_count; i++) + { + if (snapshot && pgxc_node_send_snapshot(pgxc_connections->coord_handles[i], snapshot)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to coordinators"))); + } + if (pgxc_node_send_cmd_id(pgxc_connections->coord_handles[i], cid) < 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command ID to Datanodes"))); + } + + if (pgxc_node_send_query(pgxc_connections->coord_handles[i], node->sql_statement) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to coordinators"))); + } + } + } + + /* + * Stop if all commands are completed or we got a data row and + * initialized state node for subsequent invocations + */ + { + while (dn_conn_count > 0) + { + int i = 0; + + if (pgxc_node_receive(dn_conn_count, pgxc_connections->datanode_handles, NULL)) + break; + /* + * Handle input from the Datanodes. + * We do not expect Datanodes returning tuples when running utility + * command. + * If we got EOF, move to the next connection, will receive more + * data on the next iteration. 
+ */ + while (i < dn_conn_count) + { + PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i]; + int res = handle_response(conn, combiner); + if (res == RESPONSE_EOF) + { + i++; + } + else if (res == RESPONSE_COMPLETE) + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_ERROR) + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_READY) + { + if (i < --dn_conn_count) + pgxc_connections->datanode_handles[i] = + pgxc_connections->datanode_handles[dn_conn_count]; + } + else if (res == RESPONSE_TUPDESC) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from Datanode"))); + } + else if (res == RESPONSE_DATAROW) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from Datanode"))); + } + } + } + } + + /* Make the same for Coordinators */ + { + while (co_conn_count > 0) + { + int i = 0; + + if (pgxc_node_receive(co_conn_count, pgxc_connections->coord_handles, NULL)) + break; + + while (i < co_conn_count) + { + int res = handle_response(pgxc_connections->coord_handles[i], combiner); + if (res == RESPONSE_EOF) + { + i++; + } + else if (res == RESPONSE_COMPLETE) + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_ERROR) + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_READY) + { + if (i < --co_conn_count) + pgxc_connections->coord_handles[i] = + pgxc_connections->coord_handles[co_conn_count]; + } + else if (res == RESPONSE_TUPDESC) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from coordinator"))); + } + else if (res == RESPONSE_DATAROW) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from coordinator"))); + } + } + } + } + + /* + * We have processed all responses from nodes and if we have + * error message pending we can report it. 
All connections should be in + * consistent state now and so they can be released to the pool after ROLLBACK. + */ + pfree_pgxc_all_handles(pgxc_connections); + pgxc_node_report_error(combiner); +} + + +/* + * Called when the backend is ending. + */ +void +PGXCNodeCleanAndRelease(int code, Datum arg) +{ + + /* Disconnect from Pooler, if any connection is still held Pooler close it */ + PoolManagerDisconnect(); + + /* Close connection with GTM */ + CloseGTM(); + + /* Dump collected statistics to the log */ + stat_log(); +} + +void +ExecCloseRemoteStatement(const char *stmt_name, List *nodelist) +{ + PGXCNodeAllHandles *all_handles; + PGXCNodeHandle **connections; + ResponseCombiner combiner; + int conn_count; + int i; + + /* Exit if nodelist is empty */ + if (list_length(nodelist) == 0) + return; + + /* get needed Datanode connections */ + all_handles = get_handles(nodelist, NIL, false, true); + conn_count = all_handles->dn_conn_count; + connections = all_handles->datanode_handles; + + for (i = 0; i < conn_count; i++) + { + if (connections[i]->state == DN_CONNECTION_STATE_QUERY) + BufferConnection(connections[i]); + if (pgxc_node_send_close(connections[i], true, stmt_name) != 0) + { + /* + * statements are not affected by statement end, so consider + * unclosed statement on the Datanode as a fatal issue and + * force connection is discarded + */ + connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL; + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to close Datanode statemrnt"))); + } + if (pgxc_node_send_sync(connections[i]) != 0) + { + connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL; + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to close Datanode statement"))); + } + } + + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); + + while (conn_count > 0) + { + if (pgxc_node_receive(conn_count, 
connections, NULL)) + { + for (i = 0; i <= conn_count; i++) + connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL; + + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to close Datanode statement"))); + } + i = 0; + while (i < conn_count) + { + int res = handle_response(connections[i], &combiner); + if (res == RESPONSE_EOF) + { + i++; + } + else if (res == RESPONSE_READY || + connections[i]->state == DN_CONNECTION_STATE_ERROR_FATAL) + { + if (--conn_count > i) + connections[i] = connections[conn_count]; + } + } + } + + ValidateAndCloseCombiner(&combiner); + pfree_pgxc_all_handles(all_handles); +} + +/* + * DataNodeCopyInBinaryForAll + * + * In a COPY TO, send to all Datanodes PG_HEADER for a COPY TO in binary mode. + */ +int +DataNodeCopyInBinaryForAll(char *msg_buf, int len, int conn_count, + PGXCNodeHandle** connections) +{ + int i; + int msgLen = 4 + len + 1; + int nLen = htonl(msgLen); + + for (i = 0; i < conn_count; i++) + { + PGXCNodeHandle *handle = connections[i]; + if (handle->state == DN_CONNECTION_STATE_COPY_IN) + { + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + handle->outBuffer[handle->outEnd++] = 'd'; + memcpy(handle->outBuffer + handle->outEnd, &nLen, 4); + handle->outEnd += 4; + memcpy(handle->outBuffer + handle->outEnd, msg_buf, len); + handle->outEnd += len; + handle->outBuffer[handle->outEnd++] = '\n'; + } + else + { + add_error_message(handle, "Invalid Datanode connection"); + return EOF; + } + } + + return 0; +} + +/* + * Clear per transaction remote information + */ +void +AtEOXact_Remote(void) +{ + PGXCNodeResetParams(true); +} + +/* + * Invoked when local transaction is about to be committed. + * If nodestring is specified commit specified prepared transaction on remote + * nodes, otherwise commit remote nodes which are in transaction. 
+ */ +void +PreCommit_Remote(char *prepareGID, char *nodestring, bool preparedLocalNode) +{ + struct rusage start_r; + struct timeval start_t; + + if (log_gtm_stats) + ResetUsageCommon(&start_r, &start_t); + + /* + * Made node connections persistent if we are committing transaction + * that touched temporary tables. We never drop that flag, so after some + * transaction has created a temp table the session's remote connections + * become persistent. + * We do not need to set that flag if transaction that has created a temp + * table finally aborts - remote connections are not holding temporary + * objects in this case. + */ + if (IS_PGXC_LOCAL_COORDINATOR && MyXactAccessedTempRel) + temp_object_included = true; + + + /* + * OK, everything went fine. At least one remote node is in PREPARED state + * and the transaction is successfully prepared on all the involved nodes. + * Now we are ready to commit the transaction. We need a new GXID to send + * down the remote nodes to execute the forthcoming COMMIT PREPARED + * command. So grab one from the GTM and track it. It will be closed along + * with the main transaction at the end. + */ + if (nodestring) + { + Assert(preparedLocalNode); + pgxc_node_remote_finish(prepareGID, true, nodestring, + GetAuxilliaryTransactionId(), + GetTopGlobalTransactionId()); + + } + else + pgxc_node_remote_commit(); + + if (log_gtm_stats) + ShowUsageCommon("PreCommit_Remote", &start_r, &start_t); +} + +/* + * Do abort processing for the transaction. We must abort the transaction on + * all the involved nodes. If a node has already prepared a transaction, we run + * ROLLBACK PREPARED command on the node. Otherwise, a simple ROLLBACK command + * is sufficient. + * + * We must guard against the case when a transaction is prepared succefully on + * all the nodes and some error occurs after we send a COMMIT PREPARED message + * to at lease one node. Such a transaction must not be aborted to preserve + * global consistency. 
+ * We handle this case by recording the nodes involved in
+ * the transaction at the GTM and keep the transaction open at the GTM so that
+ * it is reported as "in-progress" on all the nodes until resolved.
+ */
+bool
+PreAbort_Remote(void)
+{
+	/*
+	 * The current transaction is about to be aborted, and an unexpected error
+	 * may have left node connections in a state that needs cleaning up, such
+	 * as a running COPY or pending query results.
+	 * For a running COPY we send down a CopyFail message and read all possible
+	 * incoming messages: copy rows (if running COPY TO), ErrorResponse,
+	 * ReadyForQuery.
+	 * For pending results (connection state is DN_CONNECTION_STATE_QUERY) we
+	 * only need to read them in and discard them, all necessary commands are
+	 * already sent. The end of input could be CommandComplete or
+	 * PortalSuspended, in either case the subsequent ROLLBACK closes the
+	 * portal.
+	 */
+	PGXCNodeAllHandles *handles;
+	PGXCNodeHandle	   *dirty_conns[NumCoords + NumDataNodes];
+	int					dirty_count = 0;
+	int					dn_cancel_idx[NumDataNodes];
+	int					co_cancel_idx[NumCoords];
+	int					dn_cancel_count = 0;
+	int					co_cancel_count = 0;
+	int					idx;
+	struct rusage		usage_at_start;
+	struct timeval		time_at_start;
+
+	if (log_gtm_stats)
+		ResetUsageCommon(&usage_at_start, &time_at_start);
+
+	handles = get_current_handles();
+
+	/*
+	 * Collect "dirty" coordinator connections. COPY never runs on a
+	 * coordinator connection, so only pending query data is checked.
+	 */
+	for (idx = 0; idx < handles->co_conn_count; idx++)
+	{
+		PGXCNodeHandle *conn = handles->coord_handles[idx];
+
+		if (conn->state != DN_CONNECTION_STATE_QUERY)
+			continue;
+
+		/* Input will be consumed by a different combiner from now on */
+		conn->combiner = NULL;
+		dirty_conns[dirty_count++] = conn;
+		co_cancel_idx[co_cancel_count++] = idx;
+	}
+
+	/*
+	 * Collect "dirty" datanode connections the same way, additionally ending
+	 * a COPY if one is running.
+	 */
+	for (idx = 0; idx < handles->dn_conn_count; idx++)
+	{
+		PGXCNodeHandle *conn = handles->datanode_handles[idx];
+		bool			in_copy;
+
+		in_copy = (conn->state == DN_CONNECTION_STATE_COPY_IN ||
+				   conn->state == DN_CONNECTION_STATE_COPY_OUT);
+
+		if (conn->state != DN_CONNECTION_STATE_QUERY && !in_copy)
+			continue;
+
+		if (in_copy)
+			DataNodeCopyEnd(conn, true);
+
+		/* Input will be consumed by a different combiner from now on */
+		conn->combiner = NULL;
+		dirty_conns[dirty_count++] = conn;
+		dn_cancel_idx[dn_cancel_count++] = idx;
+	}
+
+	/* Cancel whatever is still running on the datanodes and coordinators */
+	PoolManagerCancelQuery(dn_cancel_count, dn_cancel_idx, co_cancel_count,
+						   co_cancel_idx);
+
+	/* Read in and throw away data from the connections found "dirty" */
+	if (dirty_count > 0)
+	{
+		ResponseCombiner discard;
+
+		InitResponseCombiner(&discard, dirty_count, COMBINE_TYPE_NONE);
+		/* Zero the leading ScanState-sized part so unused fields are clean */
+		memset(&discard, 0, sizeof(ScanState));
+		discard.connections = dirty_conns;
+		discard.conn_count = dirty_count;
+		discard.request_type = REQUEST_TYPE_ERROR;
+
+		pgxc_connections_cleanup(&discard);
+
+		/* dirty_conns is a local array: keep CloseCombiner from pfree'ing it */
+		discard.connections = NULL;
+
+		CloseCombiner(&discard);
+	}
+
+	pgxc_node_remote_abort();
+
+	/* Remote sessions are cleaned up and released unless they must persist */
+	if (!temp_object_included && !PersistentConnections)
+	{
+		pgxc_node_remote_cleanup_all();
+		release_handles();
+	}
+
+	pfree_pgxc_all_handles(handles);
+
+	if (log_gtm_stats)
+		ShowUsageCommon("PreAbort_Remote", &usage_at_start, &time_at_start);
+
+	return true;
+}
+
+
+/*
+ * Invoked when local transaction is about to be prepared.
+ * If invoked on a Datanode just commit transaction on remote connections,
+ * since secondary sessions are read only and never need to be prepared.
+ * Otherwise run PREPARE on remote connections, where writable commands were
+ * sent (connections marked as not read-only).
+ * If that is explicit PREPARE (issued by client) notify GTM.
+ * In case of implicit PREPARE not involving local node (ex. caused by
+ * INSERT, UPDATE or DELETE) commit prepared transaction immediately.
+ * Return list of node names where transaction was actually prepared, include
+ * the name of the local node if localNode is true.
+ */
+char *
+PrePrepare_Remote(char *prepareGID, bool localNode, bool implicit)
+{
+	char			   *prepared_nodes;
+	struct rusage		usage_at_start;
+	struct timeval		time_at_start;
+
+	if (log_gtm_stats)
+		ResetUsageCommon(&usage_at_start, &time_at_start);
+
+	/*
+	 * On a Datanode the primary session is the one doing 2PC; just commit
+	 * the secondary processes and leave.
+	 */
+	if (IS_PGXC_DATANODE)
+	{
+		pgxc_node_remote_commit();
+		return NULL;
+	}
+
+	/* The local node is always included when running an explicit prepare */
+	prepared_nodes = pgxc_node_remote_prepare(prepareGID,
+											  !(implicit && !localNode));
+
+	if (implicit)
+	{
+		/*
+		 * Implicit prepare that does not involve the local node: nothing to
+		 * wait for, so finish the prepared transaction right away.
+		 */
+		if (!localNode && prepared_nodes)
+		{
+			pgxc_node_remote_finish(prepareGID, true, prepared_nodes,
+									GetAuxilliaryTransactionId(),
+									GetTopGlobalTransactionId());
+			pfree(prepared_nodes);
+			prepared_nodes = NULL;
+		}
+	}
+	else if (IS_PGXC_LOCAL_COORDINATOR)
+	{
+		/* Explicit prepare: save the node list and gid on GTM */
+		StartPreparedTranGTM(GetTopGlobalTransactionId(), prepareGID,
+							 prepared_nodes);
+	}
+
+	if (log_gtm_stats)
+		ShowUsageCommon("PrePrepare_Remote", &usage_at_start, &time_at_start);
+
+	return prepared_nodes;
+}
+
+/*
+ * Invoked immediately after local node is prepared.
+ * Notify GTM about completed prepare.
+ */
+void
+PostPrepare_Remote(char *prepareGID, bool implicit)
+{
+	struct rusage		start_r;
+	struct timeval		start_t;
+
+	if (log_gtm_stats)
+		ResetUsageCommon(&start_r, &start_t);
+
+	/* Only an explicit PREPARE is reported to the GTM; prepareGID is unused */
+	if (!implicit)
+		PrepareTranGTM(GetTopGlobalTransactionId());
+
+	if (log_gtm_stats)
+		ShowUsageCommon("PostPrepare_Remote", &start_r, &start_t);
+}
+
+/*
+ * Returns true if 2PC is required for consistent commit: if there was write
+ * activity on two or more nodes within current transaction.
+ *
+ * localWrite tells whether the local node itself counts as the first writer.
+ * A remote connection counts as a writer when its socket is open, it is not
+ * marked read-only and its transaction status is 'T'.
+ */
+bool
+IsTwoPhaseCommitRequired(bool localWrite)
+{
+	PGXCNodeAllHandles *handles;
+	bool				found = localWrite;
+	bool				need_2pc = false;
+	int					i;
+
+	/* Never run 2PC on Datanode-to-Datanode connection */
+	if (IS_PGXC_DATANODE)
+		return false;
+
+	if (MyXactAccessedTempRel)
+	{
+		elog(DEBUG1, "Transaction accessed temporary objects - "
+				"2PC will not be used and that can lead to data inconsistencies "
+				"in case of failures");
+		return false;
+	}
+
+	handles = get_current_handles();
+
+	/* Stop scanning as soon as a second writer has been seen */
+	for (i = 0; !need_2pc && i < handles->dn_conn_count; i++)
+	{
+		PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+		if (conn->sock != NO_SOCKET && !conn->read_only &&
+				conn->transaction_status == 'T')
+		{
+			if (found)
+				need_2pc = true;	/* second found */
+			else
+				found = true;		/* first found */
+		}
+	}
+	for (i = 0; !need_2pc && i < handles->co_conn_count; i++)
+	{
+		PGXCNodeHandle *conn = handles->coord_handles[i];
+
+		if (conn->sock != NO_SOCKET && !conn->read_only &&
+				conn->transaction_status == 'T')
+		{
+			if (found)
+				need_2pc = true;	/* second found */
+			else
+				found = true;		/* first found */
+		}
+	}
+
+	/*
+	 * Release the handle list on every path; it used to be leaked both on
+	 * the early "second found" returns and on the final return.
+	 */
+	pfree_pgxc_all_handles(handles);
+
+	return need_2pc;
+}
+
+/*
+ * Execute COMMIT/ABORT PREPARED issued by the remote client on remote nodes.
+ * Contacts GTM for the list of involved nodes and for work complete
+ * notification. Returns true if prepared transaction on local node needs to be
+ * finished too.
+ */
+bool
+FinishRemotePreparedTransaction(char *prepareGID, bool commit)
+{
+	char				   *nodestring;
+	GlobalTransactionId		gxid, prepare_gxid;
+	bool					prepared_local = false;
+
+	/*
+	 * Get the list of nodes involved in this transaction.
+	 *
+	 * This function returns the GXID of the prepared transaction. It also
+	 * returns a fresh GXID which can be used for running COMMIT PREPARED
+	 * commands on the remote nodes. Both these GXIDs can then be either
+	 * committed or aborted together.
+	 *
+	 * This lookup must happen before the xc_maintenance_mode branch below:
+	 * that branch hands both GXIDs to the GTM, and they have no defined value
+	 * until GetGIDDataGTM() has filled them in.
+	 *
+	 * XXX While I understand that we get the prepared and a new GXID with a
+	 * single call, it doesn't look nice and creates confusion. We should
+	 * probably split them into two parts. This is used only for explicit 2PC
+	 * which should not be very common in XC
+	 */
+	if (GetGIDDataGTM(prepareGID, &gxid, &prepare_gxid, &nodestring) < 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("prepared transaction with identifier \"%s\" does not exist",
+						prepareGID)));
+
+	/*
+	 * Please note that with xc_maintenance_mode = on, COMMIT/ROLLBACK PREPARED will not
+	 * propagate to remote nodes. Only GTM status is cleaned up.
+	 */
+	if (xc_maintenance_mode)
+	{
+		if (commit)
+		{
+			pgxc_node_remote_commit();
+			CommitPreparedTranGTM(prepare_gxid, gxid, 0, NULL);
+		}
+		else
+		{
+			pgxc_node_remote_abort();
+			RollbackTranGTM(prepare_gxid);
+			RollbackTranGTM(gxid);
+		}
+		return false;
+	}
+
+	/* Run COMMIT/ROLLBACK PREPARED on every remote node in the list */
+	prepared_local = pgxc_node_remote_finish(prepareGID, commit, nodestring,
+											 gxid, prepare_gxid);
+
+	if (commit)
+	{
+		/*
+		 * XXX For explicit 2PC, there will be enough delay for any
+		 * waited-committed transactions to send a final COMMIT message to the
+		 * GTM.
+		 */
+		CommitPreparedTranGTM(prepare_gxid, gxid, 0, NULL);
+	}
+	else
+	{
+		RollbackTranGTM(prepare_gxid);
+		RollbackTranGTM(gxid);
+	}
+
+	return prepared_local;
+}
+
+
+/*
+ * Complete previously prepared transactions on remote nodes.
+ * Release remote connection after completion.
+ *
+ * prepareGID	identifier of the prepared transaction to finish
+ * commit		true to run COMMIT PREPARED, false to run ROLLBACK PREPARED
+ * nodestring	comma-separated node names; it is tokenized in place with
+ *				strtok(), so the caller's buffer is modified
+ * gxid			GXID sent to each node ahead of the finishing command
+ * prepare_gxid	not used in this function
+ *
+ * Returns true if the local coordinator appears in nodestring, meaning the
+ * caller still has to finish the prepared transaction locally.
+ *
+ * NOTE(review): prepareGID is placed between single quotes without escaping;
+ * confirm that callers can never pass an identifier containing a quote.
+ */
+static bool
+pgxc_node_remote_finish(char *prepareGID, bool commit,
+						char *nodestring, GlobalTransactionId gxid,
+						GlobalTransactionId prepare_gxid)
+{
+	char			finish_cmd[256];
+	int				cmd_len;
+	PGXCNodeHandle *connections[MaxCoords + MaxDataNodes];
+	int				conn_count = 0;
+	ResponseCombiner combiner;
+	PGXCNodeAllHandles *pgxc_handles;
+	bool			prepared_local = false;
+	char		   *nodename;
+	List		   *nodelist = NIL;
+	List		   *coordlist = NIL;
+	int				i;
+
+	/*
+	 * Now based on the nodestring, run COMMIT/ROLLBACK PREPARED command on the
+	 * remote nodes and also finish the transaction locally if required
+	 */
+	nodename = strtok(nodestring, ",");
+	while (nodename != NULL)
+	{
+		int		nodeIndex;
+		char	nodetype;
+
+		/* Get node type and index */
+		nodetype = PGXC_NODE_NONE;
+		nodeIndex = PGXCNodeGetNodeIdFromName(nodename, &nodetype);
+		if (nodetype == PGXC_NODE_NONE)
+			ereport(ERROR,
+					(errcode(ERRCODE_UNDEFINED_OBJECT),
+					 errmsg("PGXC Node %s: object not defined",
+							nodename)));
+
+		/* Check if node is requested is the self-node or not */
+		if (nodetype == PGXC_NODE_COORDINATOR)
+		{
+			if (nodeIndex == PGXCNodeId - 1)
+				prepared_local = true;
+			else
+				coordlist = lappend_int(coordlist, nodeIndex);
+		}
+		else
+			nodelist = lappend_int(nodelist, nodeIndex);
+
+		nodename = strtok(NULL, ",");
+	}
+
+	/* Only the local node was involved: nothing to send anywhere */
+	if (nodelist == NIL && coordlist == NIL)
+		return prepared_local;
+
+	/*
+	 * Build the command with a bounded write and refuse an identifier that
+	 * does not fit, rather than overrunning finish_cmd or sending a truncated
+	 * command to the nodes.
+	 */
+	cmd_len = snprintf(finish_cmd, sizeof(finish_cmd), "%s PREPARED '%s'",
+					   commit ? "COMMIT" : "ROLLBACK", prepareGID);
+	if (cmd_len < 0 || cmd_len >= (int) sizeof(finish_cmd))
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("prepared transaction identifier \"%s\" is too long",
+						prepareGID)));
+
+	pgxc_handles = get_handles(nodelist, coordlist, false, true);
+
+	for (i = 0; i < pgxc_handles->dn_conn_count; i++)
+	{
+		PGXCNodeHandle *conn = pgxc_handles->datanode_handles[i];
+
+		if (pgxc_node_send_gxid(conn, gxid))
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("failed to send GXID for %s PREPARED command",
+							commit ? "COMMIT" : "ROLLBACK")));
+		}
+
+		if (pgxc_node_send_query(conn, finish_cmd))
+		{
+			/*
+			 * Do not bother with clean up, just bomb out. The error handler
+			 * will invoke RollbackTransaction which will do the work.
+			 */
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("failed to send %s PREPARED command to the node %u",
+							commit ? "COMMIT" : "ROLLBACK", conn->nodeoid)));
+		}
+		else
+		{
+			/* Read responses from these */
+			connections[conn_count++] = conn;
+		}
+	}
+
+	for (i = 0; i < pgxc_handles->co_conn_count; i++)
+	{
+		PGXCNodeHandle *conn = pgxc_handles->coord_handles[i];
+
+		if (pgxc_node_send_gxid(conn, gxid))
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("failed to send GXID for %s PREPARED command",
+							commit ? "COMMIT" : "ROLLBACK")));
+		}
+
+		if (pgxc_node_send_query(conn, finish_cmd))
+		{
+			/*
+			 * Do not bother with clean up, just bomb out. The error handler
+			 * will invoke RollbackTransaction which will do the work.
+			 */
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("failed to send %s PREPARED command to the node %u",
+							commit ? "COMMIT" : "ROLLBACK", conn->nodeoid)));
+		}
+		else
+		{
+			/* Read responses from these */
+			connections[conn_count++] = conn;
+		}
+	}
+
+	if (conn_count)
+	{
+		InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+		/* Receive responses */
+		if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) ||
+				!validate_combiner(&combiner))
+		{
+			if (combiner.errorMessage)
+				pgxc_node_report_error(&combiner);
+			else
+				/* Name the command that actually failed, not always COMMIT */
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to %s the transaction on one or more nodes",
+								commit ? "COMMIT" : "ROLLBACK")));
+		}
+		else
+			CloseCombiner(&combiner);
+	}
+
+	if (!temp_object_included && !PersistentConnections)
+	{
+		/* Clean up remote sessions */
+		pgxc_node_remote_cleanup_all();
+		release_handles();
+	}
+
+	pfree_pgxc_all_handles(pgxc_handles);
+
+	return prepared_local;
+}
+
+/*****************************************************************************
+ *
+ * Simplified versions of ExecInitRemoteQuery, ExecRemoteQuery and
+ * ExecEndRemoteQuery: in XCP they are only used
+ * to execute simple queries.
+ *
+ *****************************************************************************/
+RemoteQueryState *
+ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags)
+{
+	RemoteQueryState   *rqstate = makeNode(RemoteQueryState);
+	ResponseCombiner   *combiner = (ResponseCombiner *) rqstate;
+	PlanState		   *planstate = &combiner->ss.ps;
+
+	/* Set up the embedded combiner and hook it to the plan and executor */
+	InitResponseCombiner(combiner, 0, node->combine_type);
+	planstate->plan = (Plan *) node;
+	planstate->state = estate;
+	planstate->qual = NIL;
+	combiner->request_type = REQUEST_TYPE_QUERY;
+
+	ExecInitResultTupleSlot(estate, planstate);
+	ExecAssignResultTypeFromTL((PlanState *) rqstate);
+
+	/*
+	 * Supplied parameters are converted into the form that is sent to the
+	 * datanodes with the bind message. This must not have been done already.
+	 */
+	if (estate->es_param_list_info)
+	{
+		Assert(!rqstate->paramval_data);
+		rqstate->paramval_len = ParamListToDataRow(estate->es_param_list_info,
+												   &rqstate->paramval_data);
+	}
+
+	/*
+	 * An expression context is needed to evaluate the node expression, except
+	 * in the special case of a plain TID Var, which needs no evaluation.
+	 */
+	if (node->exec_nodes && node->exec_nodes->en_expr)
+	{
+		Expr	   *node_expr = node->exec_nodes->en_expr;
+
+		if (!(IsA(node_expr, Var) && ((Var *) node_expr)->vartype == TIDOID))
+			ExecAssignExprContext(estate, planstate);
+	}
+
+	return rqstate;
+}
+
+
+/*
+ * Execute step of PGXC plan.
+ * The step specifies a command to be executed on specified nodes.
+ * On first invocation connections to the data nodes are initialized and
+ * command is executed. Further, as well as within subsequent invocations,
+ * responses are received until step is completed or there is a tuple to emit.
+ * If there is a tuple it is returned, otherwise returned NULL. The NULL result
+ * from the function indicates completed step.
+ * The function returns at most one tuple per invocation.
+ */
+TupleTableSlot *
+ExecRemoteQuery(RemoteQueryState *node)
+{
+	ResponseCombiner *combiner = (ResponseCombiner *) node;
+	RemoteQuery    *step = (RemoteQuery *) combiner->ss.ps.plan;
+	TupleTableSlot *resultslot = combiner->ss.ps.ps_ResultTupleSlot;
+
+	/* First invocation: acquire connections and send the command out */
+	if (!node->query_Done)
+	{
+		GlobalTransactionId gxid = InvalidGlobalTransactionId;
+		Snapshot		snapshot = GetActiveSnapshot();
+		PGXCNodeHandle **connections = NULL;
+		PGXCNodeHandle *primaryconnection = NULL;
+		int				i;
+		int				regular_conn_count = 0;
+		int				total_conn_count = 0;
+		bool			need_tran_block;
+		PGXCNodeAllHandles *pgxc_connections;
+
+		/*
+		 * Get connections for Datanodes only, utilities and DDLs
+		 * are launched in ExecRemoteUtility
+		 */
+		pgxc_connections = get_exec_connections(node, step->exec_nodes,
+												step->exec_type,
+												true);
+
+		/*
+		 * For any other exec_type "connections" stays NULL and both counts
+		 * stay zero, so the per-connection loop below does nothing.
+		 */
+		if (step->exec_type == EXEC_ON_DATANODES)
+		{
+			connections = pgxc_connections->datanode_handles;
+			total_conn_count = regular_conn_count = pgxc_connections->dn_conn_count;
+		}
+		else if (step->exec_type == EXEC_ON_COORDS)
+		{
+			connections = pgxc_connections->coord_handles;
+			total_conn_count = regular_conn_count = pgxc_connections->co_conn_count;
+		}
+
+		primaryconnection = pgxc_connections->primary_handle;
+
+		/*
+		 * Primary connection is counted separately but is included in total_conn_count if used.
+		 */
+		if (primaryconnection)
+			regular_conn_count--;
+
+		/*
+		 * We save only regular connections, at the time we exit the function
+		 * we finish with the primary connection and deal only with regular
+		 * connections on subsequent invocations
+		 */
+		combiner->node_count = regular_conn_count;
+
+		/*
+		 * Start transaction on data nodes if we are in explicit transaction
+		 * or going to use extended query protocol or write to multiple nodes
+		 */
+		if (step->force_autocommit)
+			need_tran_block = false;
+		else
+			need_tran_block = step->cursor ||
+					(!step->read_only && total_conn_count > 1) ||
+					(TransactionBlockStatusCode() == 'T');
+
+		stat_statement();
+		stat_transaction(total_conn_count);
+
+		gxid = GetCurrentTransactionIdIfAny();
+		/* See if we have a primary node, execute on it first before the others */
+		if (primaryconnection)
+		{
+			if (pgxc_node_begin(1, &primaryconnection, gxid, need_tran_block,
+								step->read_only, PGXC_NODE_DATANODE))
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Could not begin transaction on data node.")));
+
+			/* If explicit transaction is needed gxid is already sent */
+			if (!pgxc_start_command_on_connection(primaryconnection, node, snapshot))
+			{
+				pgxc_node_remote_abort();
+				pfree_pgxc_all_handles(pgxc_connections);
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to send command to data nodes")));
+			}
+			Assert(combiner->combine_type == COMBINE_TYPE_SAME);
+
+			pgxc_node_receive(1, &primaryconnection, NULL);
+			/* Make sure the command is completed on the primary node */
+			while (true)
+			{
+				int res = handle_response(primaryconnection, combiner);
+				if (res == RESPONSE_READY)
+					break;
+				else if (res == RESPONSE_EOF)
+					/* Buffer exhausted: wait for more input from the node */
+					pgxc_node_receive(1, &primaryconnection, NULL);
+				else if (res == RESPONSE_COMPLETE || res == RESPONSE_ERROR)
+					/* Get ReadyForQuery */
+					continue;
+				else if (res == RESPONSE_ASSIGN_GXID)
+					continue;
+				else
+					ereport(ERROR,
+							(errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Unexpected response from data node")));
+			}
+			/* An error collected from the primary node is reported here */
+			if (combiner->errorMessage)
+				pgxc_node_report_error(combiner);
+		}
+
+		/* Begin and send the command on each regular connection in turn */
+		for (i = 0; i < regular_conn_count; i++)
+		{
+			if (pgxc_node_begin(1, &connections[i], gxid, need_tran_block,
+								step->read_only, PGXC_NODE_DATANODE))
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Could not begin transaction on data node.")));
+
+			/* If explicit transaction is needed gxid is already sent */
+			if (!pgxc_start_command_on_connection(connections[i], node, snapshot))
+			{
+				pgxc_node_remote_abort();
+				pfree_pgxc_all_handles(pgxc_connections);
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to send command to data nodes")));
+			}
+			/* Mark this combiner as the owner of the connection's input */
+			connections[i]->combiner = combiner;
+		}
+
+		/* For a cursor keep a private copy of the connection array */
+		if (step->cursor)
+		{
+			combiner->cursor = step->cursor;
+			combiner->cursor_count = regular_conn_count;
+			combiner->cursor_connections = (PGXCNodeHandle **) palloc(regular_conn_count * sizeof(PGXCNodeHandle *));
+			memcpy(combiner->cursor_connections, connections, regular_conn_count * sizeof(PGXCNodeHandle *));
+		}
+
+		combiner->connections = connections;
+		combiner->conn_count = regular_conn_count;
+		combiner->current_conn = 0;
+
+		/*
+		 * NOTE(review): when a cursor was just set up above, this copies the
+		 * same handles back into "connections"; presumably it matters only
+		 * if cursor_count was set by an earlier execution - confirm.
+		 */
+		if (combiner->cursor_count)
+		{
+			combiner->conn_count = combiner->cursor_count;
+			memcpy(connections, combiner->cursor_connections,
+				   combiner->cursor_count * sizeof(PGXCNodeHandle *));
+			combiner->connections = connections;
+		}
+
+		node->query_Done = true;
+
+		if (step->sort)
+		{
+			SimpleSort *sort = step->sort;
+
+			/*
+			 * First message is already in the buffer
+			 * Further fetch will be under tuplesort control
+			 * If query does not produce rows tuplesort will not
+			 * be initialized
+			 */
+			combiner->tuplesortstate = tuplesort_begin_merge(
+								   resultslot->tts_tupleDescriptor,
+								   sort->numCols,
+								   sort->sortColIdx,
+								   sort->sortOperators,
+								   sort->sortCollations,
+								   sort->nullsFirst,
+								   combiner,
+								   work_mem);
+		}
+	}
+
+	/* Emit one tuple: from the merge sort if there is one, else directly */
+	if (combiner->tuplesortstate)
+	{
+		if (tuplesort_gettupleslot((Tuplesortstate *) combiner->tuplesortstate,
+									  true, resultslot))
+			return resultslot;
+		else
+			ExecClearTuple(resultslot);
+	}
+	else
+	{
+		TupleTableSlot *slot = FetchTuple(combiner);
+		if (!TupIsNull(slot))
+			return slot;
+	}
+
+	/* No tuple to return: report a collected error, otherwise signal the end */
+	if (combiner->errorMessage)
+		pgxc_node_report_error(combiner);
+
+	return NULL;
+}
+
+
+/*
+ * Clean up and discard any data on the data node connections that might not
+ * handled yet, including pending on the remote connection.
+ */
+static void
+pgxc_connections_cleanup(ResponseCombiner *combiner)
+{
+	/* clean up the buffer */
+	list_free_deep(combiner->rowBuffer);
+	combiner->rowBuffer = NIL;
+
+	/*
+	 * Read in and discard remaining data from the connections, if any
+	 */
+	combiner->current_conn = 0;
+	/*
+	 * NOTE(review): the loop ends only when conn_count reaches zero; this
+	 * block decrements nothing itself, so that is presumably done by
+	 * REMOVE_CURR_CONN and handle_response - confirm.
+	 */
+	while (combiner->conn_count > 0)
+	{
+		int res;
+		PGXCNodeHandle *conn = combiner->connections[combiner->current_conn];
+
+		/*
+		 * Possible if we are doing merge sort.
+		 * We can do usual procedure and move connections around since we are
+		 * cleaning up and do not care what connection at what position
+		 */
+		if (conn == NULL)
+		{
+			REMOVE_CURR_CONN(combiner);
+			continue;
+		}
+
+		/* throw away current message that may be in the buffer */
+		if (combiner->currentRow)
+		{
+			pfree(combiner->currentRow);
+			combiner->currentRow = NULL;
+		}
+
+		/* no data is expected */
+		if (conn->state == DN_CONNECTION_STATE_IDLE ||
+				conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
+		{
+			REMOVE_CURR_CONN(combiner);
+			continue;
+		}
+
+		/*
+		 * Connection owner is different, so no our data pending at
+		 * the connection, nothing to read in.
+		 */
+		if (conn->combiner && conn->combiner != combiner)
+		{
+			REMOVE_CURR_CONN(combiner);
+			continue;
+		}
+
+		res = handle_response(conn, combiner);
+		if (res == RESPONSE_EOF)
+		{
+			/* END_QUERY_TIMEOUT is split as milliseconds into sec/usec */
+			struct timeval timeout;
+			timeout.tv_sec = END_QUERY_TIMEOUT / 1000;
+			timeout.tv_usec = (END_QUERY_TIMEOUT % 1000) * 1000;
+
+			/* A failed read is only logged; the loop keeps going */
+			if (pgxc_node_receive(1, &conn, &timeout))
+				elog(LOG, "Failed to read response from data nodes when ending query");
+		}
+	}
+
+	/*
+	 * Release tuplesort resources
+	 */
+	if (combiner->tuplesortstate)
+	{
+		/*
+		 * Free these before tuplesort_end, because these arrays may appear
+		 * in the tuplesort's memory context, tuplesort_end deletes this
+		 * context and may invalidate the memory.
+		 * We still want to free them here, because these may be in different
+		 * context.
+		 */
+		if (combiner->tapenodes)
+		{
+			pfree(combiner->tapenodes);
+			combiner->tapenodes = NULL;
+		}
+		if (combiner->tapemarks)
+		{
+			pfree(combiner->tapemarks);
+			combiner->tapemarks = NULL;
+		}
+		/*
+		 * tuplesort_end invalidates minimal tuple if it is in the slot because
+		 * deletes the TupleSort memory context, causing seg fault later when
+		 * releasing tuple table
+		 */
+		ExecClearTuple(combiner->ss.ps.ps_ResultTupleSlot);
+		tuplesort_end((Tuplesortstate *) combiner->tuplesortstate);
+		combiner->tuplesortstate = NULL;
+	}
+}
+
+
+/*
+ * End the remote query
+ */
+void
+ExecEndRemoteQuery(RemoteQueryState *node)
+{
+	ResponseCombiner *combiner = (ResponseCombiner *) node;
+
+	/*
+	 * Clean up remote connections
+	 */
+	pgxc_connections_cleanup(combiner);
+
+	/*
+	 * Clean up parameters if they were set, since plan may be reused
+	 */
+	if (node->paramval_data)
+	{
+		pfree(node->paramval_data);
+		node->paramval_data = NULL;
+		node->paramval_len = 0;
+	}
+
+	/* The state node itself is freed last; "node" is invalid afterwards */
+	CloseCombiner(combiner);
+	pfree(node);
+}
+
+
+/**********************************************
+ *
+ * Routines to support RemoteSubplan plan node
+ *
+ **********************************************/
+
+
+/*
+ * The routine walks recursively over
+ * the plan tree and changes cursor names of
+ * RemoteSubplan nodes to make them different from launched from the other
+ * datanodes. The routine changes cursor names in place, so caller should
+ * take writable copy of the plan tree.
+ */
+void
+RemoteSubplanMakeUnique(Node *plan, int unique)
+{
+	if (plan == NULL)
+		return;
+
+	/* A list of plans: process every member and stop */
+	if (IsA(plan, List))
+	{
+		ListCell *lc;
+		foreach(lc, (List *) plan)
+		{
+			RemoteSubplanMakeUnique(lfirst(lc), unique);
+		}
+		return;
+	}
+
+	/*
+	 * Transform SharedQueue name
+	 */
+	if (IsA(plan, RemoteSubplan))
+	{
+		((RemoteSubplan *)plan)->unique = unique;
+	}
+	/* Otherwise it is a Plan descendant */
+	RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->lefttree, unique);
+	RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->righttree, unique);
+	/* Transform special cases: nodes that keep children outside left/right */
+	switch (nodeTag(plan))
+	{
+		case T_Append:
+			RemoteSubplanMakeUnique((Node *) ((Append *) plan)->appendplans,
+									unique);
+			break;
+		case T_MergeAppend:
+			RemoteSubplanMakeUnique((Node *) ((MergeAppend *) plan)->mergeplans,
+									unique);
+			break;
+		case T_BitmapAnd:
+			RemoteSubplanMakeUnique((Node *) ((BitmapAnd *) plan)->bitmapplans,
+									unique);
+			break;
+		case T_BitmapOr:
+			RemoteSubplanMakeUnique((Node *) ((BitmapOr *) plan)->bitmapplans,
+									unique);
+			break;
+		case T_SubqueryScan:
+			RemoteSubplanMakeUnique((Node *) ((SubqueryScan *) plan)->subplan,
+									unique);
+			break;
+		default:
+			break;
+	}
+}
+
+/*
+ * Working state for determine_param_types: the remote parameter array to
+ * fill in and the set of PARAM_EXEC ids whose type is still unknown.
+ */
+struct find_params_context
+{
+	RemoteParam *rparams;
+	Bitmapset *defineParams;
+};
+
+/*
+ * Expression walker: when a PARAM_EXEC Param listed in defineParams is met,
+ * copy its type into the matching rparams entry and drop it from the set.
+ * Returns true, which stops the walk, once the set becomes empty.
+ */
+static bool
+determine_param_types_walker(Node *node, struct find_params_context *context)
+{
+	if (node == NULL)
+		return false;
+
+	if (IsA(node, Param))
+	{
+		Param *param = (Param *) node;
+		int paramno = param->paramid;
+
+		if (param->paramkind == PARAM_EXEC &&
+				bms_is_member(paramno, context->defineParams))
+		{
+			RemoteParam *cur = context->rparams;
+			/* The scan has no bound: a matching entry must be in rparams */
+			while (cur->paramkind != PARAM_EXEC || cur->paramid != paramno)
+				cur++;
+			cur->paramtype = param->paramtype;
+			context->defineParams = bms_del_member(context->defineParams,
+												   paramno);
+			return bms_is_empty(context->defineParams);
+		}
+	}
+	return expression_tree_walker(node, determine_param_types_walker,
+								  (void *) context);
+
+}
+
+/*
+ * Scan expressions in the plan tree to find Param nodes and get data types
+ * from them
+ *
+ * Returns true as soon as the type of every parameter in
+ * context->defineParams has been found, false if the whole subtree was
+ * scanned with some still undefined. A node type not listed in the switch
+ * raises an error.
+ */
+static bool
+determine_param_types(Plan *plan, struct find_params_context *context)
+{
+	Bitmapset *intersect;
+
+	if (plan == NULL)
+		return false;
+
+	intersect = bms_intersect(plan->allParam, context->defineParams);
+	if (bms_is_empty(intersect))
+	{
+		/* the subplan does not depend on params we are interested in */
+		bms_free(intersect);
+		return false;
+	}
+	bms_free(intersect);
+
+	/* scan target list */
+	if (expression_tree_walker((Node *) plan->targetlist,
+							   determine_param_types_walker,
+							   (void *) context))
+		return true;
+	/* scan qual */
+	if (expression_tree_walker((Node *) plan->qual,
+							   determine_param_types_walker,
+							   (void *) context))
+		return true;
+
+	/* Check additional node-type-specific fields */
+	switch (nodeTag(plan))
+	{
+		case T_Result:
+			if (expression_tree_walker((Node *) ((Result *) plan)->resconstantqual,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			break;
+
+		case T_SeqScan:
++		case T_SampleScan:
+			break;
+
+		case T_IndexScan:
+			if (expression_tree_walker((Node *) ((IndexScan *) plan)->indexqual,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			break;
+
+		case T_IndexOnlyScan:
+			if (expression_tree_walker((Node *) ((IndexOnlyScan *) plan)->indexqual,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			break;
+
+		case T_BitmapIndexScan:
+			if (expression_tree_walker((Node *) ((BitmapIndexScan *) plan)->indexqual,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			break;
+
+		case T_BitmapHeapScan:
+			if (expression_tree_walker((Node *) ((BitmapHeapScan *) plan)->bitmapqualorig,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			break;
+
+		case T_TidScan:
+			if (expression_tree_walker((Node *) ((TidScan *) plan)->tidquals,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			break;
+
+		case T_SubqueryScan:
+			if (determine_param_types(((SubqueryScan *) plan)->subplan, context))
+				return true;
+			break;
+
+		case T_FunctionScan:
+			if (expression_tree_walker((Node *) ((FunctionScan *) plan)->functions,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			break;
+
+		case T_ValuesScan:
+			if (expression_tree_walker((Node *) ((ValuesScan *) plan)->values_lists,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			break;
+
+		case T_ModifyTable:
+			{
+				ListCell   *l;
+
+				foreach(l, ((ModifyTable *) plan)->plans)
+				{
+					if (determine_param_types((Plan *) lfirst(l), context))
+						return true;
+				}
+			}
+			break;
+
+		case T_RemoteSubplan:
+			break;
+
+		case T_Append:
+			{
+				ListCell   *l;
+
+				foreach(l, ((Append *) plan)->appendplans)
+				{
+					if (determine_param_types((Plan *) lfirst(l), context))
+						return true;
+				}
+			}
+			break;
+
+		case T_BitmapAnd:
+			{
+				ListCell   *l;
+
+				foreach(l, ((BitmapAnd *) plan)->bitmapplans)
+				{
+					if (determine_param_types((Plan *) lfirst(l), context))
+						return true;
+				}
+			}
+			break;
+
+		case T_BitmapOr:
+			{
+				ListCell   *l;
+
+				foreach(l, ((BitmapOr *) plan)->bitmapplans)
+				{
+					if (determine_param_types((Plan *) lfirst(l), context))
+						return true;
+				}
+			}
+			break;
+
+		case T_NestLoop:
+			if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			break;
+
+		case T_MergeJoin:
+			if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			if (expression_tree_walker((Node *) ((MergeJoin *) plan)->mergeclauses,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			break;
+
+		case T_HashJoin:
+			if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			if (expression_tree_walker((Node *) ((HashJoin *) plan)->hashclauses,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			break;
+
+		case T_Limit:
+			if (expression_tree_walker((Node *) ((Limit *) plan)->limitOffset,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			if (expression_tree_walker((Node *) ((Limit *) plan)->limitCount,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			break;
+
+		case T_RecursiveUnion:
+			break;
+
+		case T_LockRows:
+			break;
+
+		case T_WindowAgg:
+			/*
+			 * Each offset expression is checked on its own and a hit is
+			 * propagated, like every other case. Previously the two tests
+			 * were nested with no "return true", so a hit was discarded.
+			 */
+			if (expression_tree_walker((Node *) ((WindowAgg *) plan)->startOffset,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			if (expression_tree_walker((Node *) ((WindowAgg *) plan)->endOffset,
+									   determine_param_types_walker,
+									   (void *) context))
+				return true;
+			break;
+
+		case T_Hash:
+		case T_Agg:
+		case T_Material:
+		case T_Sort:
+		case T_Unique:
+		case T_SetOp:
+		case T_Group:
+			break;
+
+		default:
+			elog(ERROR, "unrecognized node type: %d",
+				 (int) nodeTag(plan));
+	}
+
+
+	/* recurse into subplans */
+	return determine_param_types(plan->lefttree, context) ||
+			determine_param_types(plan->righttree, context);
+}
+
+
+RemoteSubplanState *
+ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags)
+{
+	RemoteStmt			rstmt;
+	RemoteSubplanState *remotestate;
+	ResponseCombiner   *combiner;
+	CombineType			combineType;
+	struct rusage		start_r;
+	struct timeval		start_t;
+
+	if (log_remotesubplan_stats)
+		ResetUsageCommon(&start_r, &start_t);
+
+	remotestate = makeNode(RemoteSubplanState);
+	combiner = (ResponseCombiner *) remotestate;
+	/*
+	 * We do not need to combine row counts if we will receive intermediate
+	 * results or if we won't return row count.
+ */ + if (IS_PGXC_DATANODE || estate->es_plannedstmt->commandType == CMD_SELECT) + { + combineType = COMBINE_TYPE_NONE; + remotestate->execOnAll = node->execOnAll; + } + else + { + if (node->execOnAll) + combineType = COMBINE_TYPE_SUM; + else + combineType = COMBINE_TYPE_SAME; + /* + * If we are updating replicated table we should run plan on all nodes. + * We are choosing single node only to read + */ + remotestate->execOnAll = true; + } + remotestate->execNodes = list_copy(node->nodeList); + InitResponseCombiner(combiner, 0, combineType); + combiner->ss.ps.plan = (Plan *) node; + combiner->ss.ps.state = estate; + + combiner->ss.ps.qual = NIL; + + combiner->request_type = REQUEST_TYPE_QUERY; + + ExecInitResultTupleSlot(estate, &combiner->ss.ps); + ExecAssignResultTypeFromTL((PlanState *) remotestate); + + /* + * We optimize execution if we going to send down query to next level + */ + remotestate->local_exec = false; + if (IS_PGXC_DATANODE) + { + if (remotestate->execNodes == NIL) + { + /* + * Special case, if subplan is not distributed, like Result, or + * query against catalog tables only. + * We are only interested in filtering out the subplan results and + * get only those we are interested in. + * XXX we may want to prevent multiple executions in this case + * either, to achieve this we will set single execNode on planning + * time and this case would never happen, this code branch could + * be removed. + */ + remotestate->local_exec = true; + } + else if (!remotestate->execOnAll) + { + /* + * XXX We should change planner and remove this flag. + * We want only one node is producing the replicated result set, + * and planner should choose that node - it is too hard to determine + * right node at execution time, because it should be guaranteed + * that all consumers make the same decision. + * For now always execute replicated plan on local node to save + * resources. 
+ */ + + /* + * Make sure local node is in execution list + */ + if (list_member_int(remotestate->execNodes, PGXCNodeId-1)) + { + list_free(remotestate->execNodes); + remotestate->execNodes = NIL; + remotestate->local_exec = true; + } + else + { + /* + * To support, we need to connect to some producer, so + * each producer should be prepared to serve rows for random + * number of consumers. It is hard, because new consumer may + * connect after producing is started, on the other hand, + * absence of expected consumer is a problem too. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Getting replicated results from remote node is not supported"))); + } + } + } + + /* + * If we are going to execute subplan locally or doing explain initialize + * the subplan. Otherwise have remote node doing that. + */ + if (remotestate->local_exec || (eflags & EXEC_FLAG_EXPLAIN_ONLY)) + { + outerPlanState(remotestate) = ExecInitNode(outerPlan(node), estate, + eflags); + if (node->distributionNodes) + { + Oid distributionType = InvalidOid; + TupleDesc typeInfo; + + typeInfo = combiner->ss.ps.ps_ResultTupleSlot->tts_tupleDescriptor; + if (node->distributionKey != InvalidAttrNumber) + { + Form_pg_attribute attr; + attr = typeInfo->attrs[node->distributionKey - 1]; + distributionType = attr->atttypid; + } + /* Set up locator */ + remotestate->locator = createLocator(node->distributionType, + RELATION_ACCESS_INSERT, + distributionType, + LOCATOR_LIST_LIST, + 0, + (void *) node->distributionNodes, + (void **) &remotestate->dest_nodes, + false); + } + else + remotestate->locator = NULL; + } + + /* + * Encode subplan if it will be sent to remote nodes + */ + if (remotestate->execNodes && !(eflags & EXEC_FLAG_EXPLAIN_ONLY)) + { + ParamListInfo ext_params; + /* Encode plan if we are going to execute it on other nodes */ + rstmt.type = T_RemoteStmt; + if (node->distributionType == LOCATOR_TYPE_NONE && IS_PGXC_DATANODE) + { + /* + * There are cases when planner can not 
determine distribution of a + * subplan, in particular it does not determine distribution of + * subquery nodes. Such subplans executed from current location + * (node) and combine all results, like from coordinator nodes. + * However, if there are multiple locations where distributed + * executor is running this node, and there are more of + * RemoteSubplan plan nodes in the subtree there will be a problem - + * Instances of the inner RemoteSubplan nodes will be using the same + * SharedQueue, causing error. To avoid this problem we should + * traverse the subtree and change SharedQueue name to make it + * unique. + */ + RemoteSubplanMakeUnique((Node *) outerPlan(node), PGXCNodeId); + } + rstmt.planTree = outerPlan(node); + /* + * If datanode launch further execution of a command it should tell + * it is a SELECT, otherwise secondary data nodes won't return tuples + * expecting there will be nothing to return. + */ + if (IsA(outerPlan(node), ModifyTable)) + { + rstmt.commandType = estate->es_plannedstmt->commandType; + rstmt.hasReturning = estate->es_plannedstmt->hasReturning; + rstmt.resultRelations = estate->es_plannedstmt->resultRelations; + } + else + { + rstmt.commandType = CMD_SELECT; + rstmt.hasReturning = false; + rstmt.resultRelations = NIL; + } + rstmt.rtable = estate->es_range_table; + rstmt.subplans = estate->es_plannedstmt->subplans; + rstmt.nParamExec = estate->es_plannedstmt->nParamExec; + ext_params = estate->es_param_list_info; + rstmt.nParamRemote = (ext_params ? 
ext_params->numParams : 0) + + bms_num_members(node->scan.plan.allParam); + if (rstmt.nParamRemote > 0) + { + Bitmapset *tmpset; + int i; + int paramno; + + /* Allocate enough space */ + rstmt.remoteparams = (RemoteParam *) palloc(rstmt.nParamRemote * + sizeof(RemoteParam)); + paramno = 0; + if (ext_params) + { + for (i = 0; i < ext_params->numParams; i++) + { + ParamExternData *param = &ext_params->params[i]; + /* + * If parameter type is not yet defined but can be defined + * do that + */ + if (!OidIsValid(param->ptype) && ext_params->paramFetch) + (*ext_params->paramFetch) (ext_params, i + 1); + /* + * If parameter type is still not defined assume it is + * unused + */ + if (!OidIsValid(param->ptype)) + continue; + + rstmt.remoteparams[paramno].paramkind = PARAM_EXTERN; + rstmt.remoteparams[paramno].paramid = i + 1; + rstmt.remoteparams[paramno].paramtype = param->ptype; + paramno++; + } + /* store actual number of parameters */ + rstmt.nParamRemote = paramno; + } + + if (!bms_is_empty(node->scan.plan.allParam)) + { + Bitmapset *defineParams = NULL; + tmpset = bms_copy(node->scan.plan.allParam); + while ((i = bms_first_member(tmpset)) >= 0) + { + ParamExecData *prmdata; + + prmdata = &(estate->es_param_exec_vals[i]); + rstmt.remoteparams[paramno].paramkind = PARAM_EXEC; + rstmt.remoteparams[paramno].paramid = i; + rstmt.remoteparams[paramno].paramtype = prmdata->ptype; + /* Will scan plan tree to find out data type of the param */ + if (prmdata->ptype == InvalidOid) + defineParams = bms_add_member(defineParams, i); + paramno++; + } + /* store actual number of parameters */ + rstmt.nParamRemote = paramno; + bms_free(tmpset); + if (!bms_is_empty(defineParams)) + { + struct find_params_context context; + bool all_found; + + context.rparams = rstmt.remoteparams; + context.defineParams = defineParams; + + all_found = determine_param_types(node->scan.plan.lefttree, + &context); + /* + * Remove not defined params from the list of remote params. 
+ * If they are not referenced no need to send them down + */ + if (!all_found) + { + for (i = 0; i < rstmt.nParamRemote; i++) + { + if (rstmt.remoteparams[i].paramkind == PARAM_EXEC && + bms_is_member(rstmt.remoteparams[i].paramid, + context.defineParams)) + { + /* Copy last parameter inplace */ + rstmt.nParamRemote--; + if (i < rstmt.nParamRemote) + rstmt.remoteparams[i] = + rstmt.remoteparams[rstmt.nParamRemote]; + /* keep current in the same position */ + i--; + } + } + } + bms_free(context.defineParams); + } + } + remotestate->nParamRemote = rstmt.nParamRemote; + remotestate->remoteparams = rstmt.remoteparams; + } + else + rstmt.remoteparams = NULL; + rstmt.rowMarks = estate->es_plannedstmt->rowMarks; + rstmt.distributionKey = node->distributionKey; + rstmt.distributionType = node->distributionType; + rstmt.distributionNodes = node->distributionNodes; + rstmt.distributionRestrict = node->distributionRestrict; + + set_portable_output(true); + remotestate->subplanstr = nodeToString(&rstmt); + set_portable_output(false); + + /* + * Connect to remote nodes and send down subplan + */ + if (!(eflags & EXEC_FLAG_SUBPLAN)) + ExecFinishInitRemoteSubplan(remotestate); + } + remotestate->bound = false; + /* + * It does not makes sense to merge sort if there is only one tuple source. 
+ * By the contract it is already sorted + */ + if (node->sort && remotestate->execOnAll && + list_length(remotestate->execNodes) > 1) + combiner->merge_sort = true; + + if (log_remotesubplan_stats) + ShowUsageCommon("ExecInitRemoteSubplan", &start_r, &start_t); + + return remotestate; +} + + +void +ExecFinishInitRemoteSubplan(RemoteSubplanState *node) +{ + ResponseCombiner *combiner = (ResponseCombiner *) node; + RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan; + EState *estate = combiner->ss.ps.state; + Oid *paramtypes = NULL; + GlobalTransactionId gxid = InvalidGlobalTransactionId; + Snapshot snapshot; + TimestampTz timestamp; + int i; + bool is_read_only; + char cursor[NAMEDATALEN]; + + /* + * Name is required to store plan as a statement + */ + Assert(plan->cursor); + + if (plan->unique) + snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + else + strncpy(cursor, plan->cursor, NAMEDATALEN); + + /* If it is already fully initialized nothing to do */ + if (combiner->connections) + return; + + /* local only or explain only execution */ + if (node->subplanstr == NULL) + return; + + /* + * Check if any results are planned to be received here. + * Otherwise it does not make sense to send out the subplan. + */ + if (IS_PGXC_DATANODE && plan->distributionRestrict && + !list_member_int(plan->distributionRestrict, PGXCNodeId - 1)) + return; + + /* + * Acquire connections and send down subplan where it will be stored + * as a prepared statement. + * That does not require transaction id or snapshot, so does not send them + * here, postpone till bind.
+ */ + if (node->execOnAll) + { + PGXCNodeAllHandles *pgxc_connections; + pgxc_connections = get_handles(node->execNodes, NIL, false, true); + combiner->conn_count = pgxc_connections->dn_conn_count; + combiner->connections = pgxc_connections->datanode_handles; + combiner->current_conn = 0; + pfree(pgxc_connections); + } + else + { + combiner->connections = (PGXCNodeHandle **) palloc(sizeof(PGXCNodeHandle *)); + combiner->connections[0] = get_any_handle(node->execNodes); + combiner->conn_count = 1; + combiner->current_conn = 0; + } + + gxid = GetCurrentTransactionIdIfAny(); + + /* extract parameter data types */ + if (node->nParamRemote > 0) + { + paramtypes = (Oid *) palloc(node->nParamRemote * sizeof(Oid)); + for (i = 0; i < node->nParamRemote; i++) + paramtypes[i] = node->remoteparams[i].paramtype; + } + /* send down subplan */ + snapshot = GetActiveSnapshot(); + timestamp = GetCurrentGTMStartTimestamp(); + /* + * Datanode should not send down statements that may modify + * the database. Postgres assumes that all sessions under the same + * postmaster have different xids. That may cause a locking problem. + * Shared locks acquired for reading still work fine.
+ */ + is_read_only = IS_PGXC_DATANODE || + !IsA(outerPlan(plan), ModifyTable); + + for (i = 0; i < combiner->conn_count; i++) + { + PGXCNodeHandle *connection = combiner->connections[i]; + + if (pgxc_node_begin(1, &connection, gxid, true, + is_read_only, PGXC_NODE_DATANODE)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data node."))); + + if (pgxc_node_send_timestamp(connection, timestamp)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + if (snapshot && pgxc_node_send_snapshot(connection, snapshot)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send snapshot to data nodes"))); + } + if (pgxc_node_send_cmd_id(connection, estate->es_snapshot->curcid) < 0 ) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command ID to data nodes"))); + } + pgxc_node_send_plan(connection, cursor, "Remote Subplan", + node->subplanstr, node->nParamRemote, paramtypes); + if (pgxc_node_flush(connection)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send subplan to data nodes"))); + } + } +} + + +static void +append_param_data(StringInfo buf, Oid ptype, Datum value, bool isnull) +{ + uint32 n32; + + if (isnull) + { + n32 = htonl(-1); + appendBinaryStringInfo(buf, (char *) &n32, 4); + } + else + { + Oid typOutput; + bool typIsVarlena; + Datum pval; + char *pstring; + int len; + + /* Get info needed to output the value */ + getTypeOutputInfo(ptype, &typOutput, &typIsVarlena); + + /* + * If we have a toasted datum, forcibly detoast it here to avoid + * memory leakage inside the type's output routine. 
+ */ + if (typIsVarlena) + pval = PointerGetDatum(PG_DETOAST_DATUM(value)); + else + pval = value; + + /* Convert Datum to string */ + pstring = OidOutputFunctionCall(typOutput, pval); + + /* copy data to the buffer */ + len = strlen(pstring); + n32 = htonl(len); + appendBinaryStringInfo(buf, (char *) &n32, 4); + appendBinaryStringInfo(buf, pstring, len); + } +} + + +static int encode_parameters(int nparams, RemoteParam *remoteparams, + PlanState *planstate, char** result) +{ + EState *estate = planstate->state; + StringInfoData buf; + uint16 n16; + int i; + ExprContext *econtext; + MemoryContext oldcontext; + + if (planstate->ps_ExprContext == NULL) + ExecAssignExprContext(estate, planstate); + + econtext = planstate->ps_ExprContext; + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + initStringInfo(&buf); + + /* Number of parameter values */ + n16 = htons(nparams); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + + /* Parameter values */ + for (i = 0; i < nparams; i++) + { + RemoteParam *rparam = &remoteparams[i]; + int ptype = rparam->paramtype; + if (rparam->paramkind == PARAM_EXTERN) + { + ParamExternData *param; + param = &(estate->es_param_list_info->params[rparam->paramid - 1]); + append_param_data(&buf, ptype, param->value, param->isnull); + } + else + { + ParamExecData *param; + param = &(estate->es_param_exec_vals[rparam->paramid]); + if (param->execPlan) + { + /* Parameter not evaluated yet, so go do it */ + ExecSetParamPlan((SubPlanState *) param->execPlan, + planstate->ps_ExprContext); + /* ExecSetParamPlan should have processed this param... 
*/ + Assert(param->execPlan == NULL); + } + append_param_data(&buf, ptype, param->value, param->isnull); + } + } + + /* Take data from the buffer */ + *result = palloc(buf.len); + memcpy(*result, buf.data, buf.len); + MemoryContextSwitchTo(oldcontext); + return buf.len; +} + + +TupleTableSlot * +ExecRemoteSubplan(RemoteSubplanState *node) +{ + ResponseCombiner *combiner = (ResponseCombiner *) node; + RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan; + EState *estate = combiner->ss.ps.state; + TupleTableSlot *resultslot = combiner->ss.ps.ps_ResultTupleSlot; + struct rusage start_r; + struct timeval start_t; + + /* + * We allow combiner->conn_count == 0 after node initialization + * if we figured out that current node won't receive any result + * because of distributionRestrict is set by planner. + * But we should distinguish this case from others, when conn_count is 0. + * That is possible if local execution is chosen or data are buffered + * at the coordinator or data are exhausted and node was reset. + * in last two cases connections are saved to cursor_connections and we + * can check their presence. 
+ */ + if (!node->local_exec && combiner->conn_count == 0 && + combiner->cursor_count == 0) + return NULL; + + if (log_remotesubplan_stats) + ResetUsageCommon(&start_r, &start_t); + +primary_mode_phase_two: + if (!node->bound) + { + int fetch = 0; + int paramlen = 0; + char *paramdata = NULL; + /* + * Conditions when we want to execute query on the primary node first: + * Coordinator running replicated ModifyTable on multiple nodes + */ + bool primary_mode = combiner->probing_primary || + (IS_PGXC_COORDINATOR && + combiner->combine_type == COMBINE_TYPE_SAME && + OidIsValid(primary_data_node) && + combiner->conn_count > 1); + char cursor[NAMEDATALEN]; + + if (plan->cursor) + { + fetch = 1000; + if (plan->unique) + snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + else + strncpy(cursor, plan->cursor, NAMEDATALEN); + } + else + cursor[0] = '\0'; + + /* + * Send down all available parameters, if any is used by the plan + */ + if (estate->es_param_list_info || + !bms_is_empty(plan->scan.plan.allParam)) + paramlen = encode_parameters(node->nParamRemote, + node->remoteparams, + &combiner->ss.ps, + &paramdata); + + /* + * The subplan being rescanned, need to restore connections and + * re-bind the portal + */ + if (combiner->cursor) + { + int i; + + /* + * On second phase of primary mode connections are properly set, + * so do not copy.
+ */ + if (!combiner->probing_primary) + { + combiner->conn_count = combiner->cursor_count; + memcpy(combiner->connections, combiner->cursor_connections, + combiner->cursor_count * sizeof(PGXCNodeHandle *)); + } + + for (i = 0; i < combiner->conn_count; i++) + { + PGXCNodeHandle *conn = combiner->connections[i]; + + CHECK_OWNERSHIP(conn, combiner); + + /* close previous cursor only on phase 1 */ + if (!primary_mode || !combiner->probing_primary) + pgxc_node_send_close(conn, false, combiner->cursor); + + /* + * If we now should probe primary, skip execution on non-primary + * nodes + */ + if (primary_mode && !combiner->probing_primary && + conn->nodeoid != primary_data_node) + continue; + + /* rebind */ + pgxc_node_send_bind(conn, combiner->cursor, combiner->cursor, + paramlen, paramdata); + /* execute */ + pgxc_node_send_execute(conn, combiner->cursor, fetch); + /* submit */ + if (pgxc_node_send_flush(conn)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + + /* + * There could be only one primary node, but can not leave the + * loop now, because we need to close cursors. + */ + if (primary_mode && !combiner->probing_primary) + { + combiner->current_conn = i; + } + } + } + else if (node->execNodes) + { + CommandId cid; + int i; + + /* + * There are prepared statement, connections should be already here + */ + Assert(combiner->conn_count > 0); + + combiner->extended_query = true; + cid = estate->es_snapshot->curcid; + + for (i = 0; i < combiner->conn_count; i++) + { + PGXCNodeHandle *conn = combiner->connections[i]; + + CHECK_OWNERSHIP(conn, combiner); + + /* + * If we now should probe primary, skip execution on non-primary + * nodes + */ + if (primary_mode && !combiner->probing_primary && + conn->nodeoid != primary_data_node) + continue; + + /* + * Update Command Id. Other command may be executed after we + * prepare and advanced Command Id. 
We should use one that + * was active at the moment when command started. + */ + if (pgxc_node_send_cmd_id(conn, cid)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command ID to data nodes"))); + } + + /* + * Resend the snapshot as well since the connection may have + * been buffered and use by other commands, with different + * snapshot. Set the snapshot back to what it was + */ + if (pgxc_node_send_snapshot(conn, estate->es_snapshot)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send snapshot to data nodes"))); + } + + /* bind */ + pgxc_node_send_bind(conn, cursor, cursor, paramlen, paramdata); + /* execute */ + pgxc_node_send_execute(conn, cursor, fetch); + /* submit */ + if (pgxc_node_send_flush(conn)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + + /* + * There could be only one primary node, so if we executed + * subquery on the phase one of primary mode we can leave the + * loop now. + */ + if (primary_mode && !combiner->probing_primary) + { + combiner->current_conn = i; + break; + } + } + + /* + * On second phase of primary mode connections are backed up + * already, so do not copy. 
+ */ + if (primary_mode) + { + if (combiner->probing_primary) + { + combiner->cursor = pstrdup(cursor); + } + else + { + combiner->cursor_count = combiner->conn_count; + combiner->cursor_connections = (PGXCNodeHandle **) palloc( + combiner->conn_count * sizeof(PGXCNodeHandle *)); + memcpy(combiner->cursor_connections, combiner->connections, + combiner->conn_count * sizeof(PGXCNodeHandle *)); + } + } + else + { + combiner->cursor = pstrdup(cursor); + combiner->cursor_count = combiner->conn_count; + combiner->cursor_connections = (PGXCNodeHandle **) palloc( + combiner->conn_count * sizeof(PGXCNodeHandle *)); + memcpy(combiner->cursor_connections, combiner->connections, + combiner->conn_count * sizeof(PGXCNodeHandle *)); + } + } + + if (combiner->merge_sort) + { + /* + * Requests are already made and sorter can fetch tuples to populate + * sort buffer. + */ + combiner->tuplesortstate = tuplesort_begin_merge( + resultslot->tts_tupleDescriptor, + plan->sort->numCols, + plan->sort->sortColIdx, + plan->sort->sortOperators, + plan->sort->sortCollations, + plan->sort->nullsFirst, + combiner, + work_mem); + } + if (primary_mode) + { + if (combiner->probing_primary) + { + combiner->probing_primary = false; + node->bound = true; + } + else + combiner->probing_primary = true; + } + else + node->bound = true; + } + + if (combiner->tuplesortstate) + { + if (tuplesort_gettupleslot((Tuplesortstate *) combiner->tuplesortstate, + true, resultslot)) + { + if (log_remotesubplan_stats) + ShowUsageCommon("ExecRemoteSubplan", &start_r, &start_t); + return resultslot; + } + } + else + { + TupleTableSlot *slot = FetchTuple(combiner); + if (!TupIsNull(slot)) + { + if (log_remotesubplan_stats) + ShowUsageCommon("ExecRemoteSubplan", &start_r, &start_t); + return slot; + } + else if (combiner->probing_primary) + /* phase1 is successfully completed, run on other nodes */ + goto primary_mode_phase_two; + } + if (combiner->errorMessage) + pgxc_node_report_error(combiner); + + if 
(log_remotesubplan_stats) + ShowUsageCommon("ExecRemoteSubplan", &start_r, &start_t); + + return NULL; +} + + +void +ExecReScanRemoteSubplan(RemoteSubplanState *node) +{ + ResponseCombiner *combiner = (ResponseCombiner *)node; + + /* + * If we haven't queried remote nodes yet, just return. If outerplan' + * chgParam is not NULL then it will be re-scanned by ExecProcNode, + * else - no reason to re-scan it at all. + */ + if (!node->bound) + return; + + /* + * If we execute locally rescan local copy of the plan + */ + if (outerPlanState(node)) + ExecReScan(outerPlanState(node)); + + /* + * Consume any possible pending input + */ + pgxc_connections_cleanup(combiner); + + /* misc cleanup */ + combiner->command_complete_count = 0; + combiner->description_count = 0; + + /* + * Force query is re-bound with new parameters + */ + node->bound = false; +} + + +void +ExecEndRemoteSubplan(RemoteSubplanState *node) +{ + ResponseCombiner *combiner = (ResponseCombiner *)node; + RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan; + int i; + struct rusage start_r; + struct timeval start_t; + + if (log_remotesubplan_stats) + ResetUsageCommon(&start_r, &start_t); + + if (outerPlanState(node)) + ExecEndNode(outerPlanState(node)); + if (node->locator) + freeLocator(node->locator); + + /* + * Consume any possible pending input + */ + if (node->bound) + pgxc_connections_cleanup(combiner); + + /* + * Update coordinator statistics + */ + if (IS_PGXC_COORDINATOR) + { + EState *estate = combiner->ss.ps.state; + + if (estate->es_num_result_relations > 0 && estate->es_processed > 0) + { + switch (estate->es_plannedstmt->commandType) + { + case CMD_INSERT: + /* One statement can insert into only one relation */ + pgstat_count_remote_insert( + estate->es_result_relations[0].ri_RelationDesc, + estate->es_processed); + break; + case CMD_UPDATE: + case CMD_DELETE: + { + /* + * We can not determine here how many row were updated + * or delete in each table, so assume same number of + * 
affected row in each table. + * If resulting number of rows is 0 because of rounding, + * increment each counter at least on 1. + */ + int i; + int n; + bool update; + + update = (estate->es_plannedstmt->commandType == CMD_UPDATE); + n = estate->es_processed / estate->es_num_result_relations; + if (n == 0) + n = 1; + for (i = 0; i < estate->es_num_result_relations; i++) + { + Relation r; + r = estate->es_result_relations[i].ri_RelationDesc; + if (update) + pgstat_count_remote_update(r, n); + else + pgstat_count_remote_delete(r, n); + } + } + break; + default: + /* nothing to count */ + break; + } + } + } + + /* + * Close portals. While cursors_connections exist there are open portals + */ + if (combiner->cursor) + { + /* Restore connections where there are active statements */ + combiner->conn_count = combiner->cursor_count; + memcpy(combiner->connections, combiner->cursor_connections, + combiner->cursor_count * sizeof(PGXCNodeHandle *)); + for (i = 0; i < combiner->cursor_count; i++) + { + PGXCNodeHandle *conn; + + conn = combiner->cursor_connections[i]; + + CHECK_OWNERSHIP(conn, combiner); + + if (pgxc_node_send_close(conn, false, combiner->cursor) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to close data node cursor"))); + } + /* The cursor stuff is not needed */ + combiner->cursor = NULL; + combiner->cursor_count = 0; + pfree(combiner->cursor_connections); + combiner->cursor_connections = NULL; + } + + /* Close statements, even if they never were bound */ + for (i = 0; i < combiner->conn_count; i++) + { + PGXCNodeHandle *conn; + char cursor[NAMEDATALEN]; + + if (plan->cursor) + { + if (plan->unique) + snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + else + strncpy(cursor, plan->cursor, NAMEDATALEN); + } + else + cursor[0] = '\0'; + + conn = combiner->connections[i]; + + CHECK_OWNERSHIP(conn, combiner); + + if (pgxc_node_send_close(conn, true, cursor) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), 
+ errmsg("Failed to close data node statement"))); + /* Send SYNC and wait for ReadyForQuery */ + if (pgxc_node_send_sync(conn) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to synchronize data node"))); + /* + * Formally connection is not in QUERY state, we set the state to read + * CloseDone and ReadyForQuery responses. Upon receiving ReadyForQuery + * state will be changed back to IDLE and conn->coordinator will be + * cleared. + */ + conn->state = DN_CONNECTION_STATE_CLOSE; + } + + while (combiner->conn_count > 0) + { + if (pgxc_node_receive(combiner->conn_count, + combiner->connections, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to close remote subplan"))); + i = 0; + while (i < combiner->conn_count) + { + int res = handle_response(combiner->connections[i], combiner); + if (res == RESPONSE_EOF) + { + i++; + } + else if (res == RESPONSE_READY) + { + /* Done, connection is ready for query */ + if (--combiner->conn_count > i) + combiner->connections[i] = + combiner->connections[combiner->conn_count]; + } + else if (res == RESPONSE_DATAROW) + { + /* + * If we are finishing slowly running remote subplan while it + * is still working (because of Limit, for example) it may + * produce one or more tuples between connection cleanup and + * handling Close command. One tuple does not cause any problem, + * but if it will not be read the next tuple will trigger + * assertion failure. So if we got a tuple, just read and + * discard it here. + */ + pfree(combiner->currentRow); + combiner->currentRow = NULL; + } + /* Ignore other possible responses */ + } + } + + ValidateAndCloseCombiner(combiner); + pfree(node); + + if (log_remotesubplan_stats) + ShowUsageCommon("ExecEndRemoteSubplan", &start_r, &start_t); +} + +/* + * pgxc_node_report_error + * Throw error from Datanode if any.
+ */ +static void +pgxc_node_report_error(ResponseCombiner *combiner) +{ + /* If no combiner, nothing to do */ + if (!combiner) + return; + if (combiner->errorMessage) + { + char *code = combiner->errorCode; + if ((combiner->errorDetail == NULL) && (combiner->errorHint == NULL)) + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", combiner->errorMessage))); + else if ((combiner->errorDetail != NULL) && (combiner->errorHint != NULL)) + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", combiner->errorMessage), + errdetail("%s", combiner->errorDetail), + errhint("%s", combiner->errorHint))); + else if (combiner->errorDetail != NULL) + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", combiner->errorMessage), + errdetail("%s", combiner->errorDetail))); + else + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", combiner->errorMessage), + errhint("%s", combiner->errorHint))); + } +} + + +/* + * get_success_nodes: + * Currently called to print a user-friendly message about + * which nodes the query failed. + * Gets all the nodes where no 'E' (error) messages were received; i.e. where the + * query ran successfully. 
+ */ +static ExecNodes * +get_success_nodes(int node_count, PGXCNodeHandle **handles, char node_type, StringInfo failednodes) +{ + ExecNodes *success_nodes = NULL; + int i; + + for (i = 0; i < node_count; i++) + { + PGXCNodeHandle *handle = handles[i]; + int nodenum = PGXCNodeGetNodeId(handle->nodeoid, &node_type); + + if (!handle->error) + { + if (!success_nodes) + success_nodes = makeNode(ExecNodes); + success_nodes->nodeList = lappend_int(success_nodes->nodeList, nodenum); + } + else + { + if (failednodes->len == 0) + appendStringInfo(failednodes, "Error message received from nodes:"); + appendStringInfo(failednodes, " %s#%d", + (node_type == PGXC_NODE_COORDINATOR ? "coordinator" : "datanode"), + nodenum + 1); + } + } + return success_nodes; +} + +/* + * pgxc_all_success_nodes: Uses get_success_nodes() to collect the + * user-friendly message from coordinator as well as datanode. + */ +void +pgxc_all_success_nodes(ExecNodes **d_nodes, ExecNodes **c_nodes, char **failednodes_msg) +{ + PGXCNodeAllHandles *connections = get_exec_connections(NULL, NULL, EXEC_ON_ALL_NODES, true); + StringInfoData failednodes; + initStringInfo(&failednodes); + + *d_nodes = get_success_nodes(connections->dn_conn_count, + connections->datanode_handles, + PGXC_NODE_DATANODE, + &failednodes); + + *c_nodes = get_success_nodes(connections->co_conn_count, + connections->coord_handles, + PGXC_NODE_COORDINATOR, + &failednodes); + + if (failednodes.len == 0) + *failednodes_msg = NULL; + else + *failednodes_msg = failednodes.data; + + pfree_pgxc_all_handles(connections); +} + + +/* + * set_dbcleanup_callback: + * Register a callback function which does some non-critical cleanup tasks + * on xact success or abort, such as tablespace/database directory cleanup. 
+ */ +void set_dbcleanup_callback(xact_callback function, void *paraminfo, int paraminfo_size) +{ + void *fparams; + + fparams = MemoryContextAlloc(TopMemoryContext, paraminfo_size); + memcpy(fparams, paraminfo, paraminfo_size); + + dbcleanup_info.function = function; + dbcleanup_info.fparams = fparams; +} + +/* + * AtEOXact_DBCleanup: To be called at post-commit or pre-abort. + * Calls the cleanup function registered during this transaction, if any. + */ +void AtEOXact_DBCleanup(bool isCommit) +{ + if (dbcleanup_info.function) + (*dbcleanup_info.function)(isCommit, dbcleanup_info.fparams); + + /* + * Just reset the callbackinfo. We anyway don't want this to be called again, + * until explicitly set. + */ + dbcleanup_info.function = NULL; + if (dbcleanup_info.fparams) + { + pfree(dbcleanup_info.fparams); + dbcleanup_info.fparams = NULL; + } +} diff --cc src/backend/postmaster/postmaster.c index 86ebaac5f0,aab2f4ca70..5214e459ba --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@@ -1351,19 -1283,9 +1388,20 @@@ PostmasterMain(int argc, char *argv[] */ StartupPID = StartupDataBase(); Assert(StartupPID != 0); + StartupStatus = STARTUP_RUNNING; pmState = PM_STARTUP; +#ifdef PGXC /* PGXC_COORD */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Initialize the Data Node connection pool + */ + PgPoolerPID = StartPoolManager(); + + MemoryContextSwitchTo(oldcontext); +#endif /* PGXC */ + /* Some workers may be scheduled to start now */ maybe_start_bgworker(); @@@ -1780,21 -1739,9 +1855,21 @@@ ServerLoop(void if (PgStatPID == 0 && pmState == PM_RUN) PgStatPID = pgstat_start(); +#ifdef PGXC + /* If we have lost the pooler, try to start a new one */ + if (PgPoolerPID == 0 && pmState == PM_RUN) + PgPoolerPID = StartPoolManager(); +#endif /* PGXC */ + +#ifdef XCP + /* If we have lost the cluster monitor, try to start a new one */ + if (ClusterMonPID == 0 && pmState == PM_RUN) + ClusterMonPID = StartClusterMonitor(); +#endif + 
/* If we have lost the archiver, try to start a new one. */ if (PgArchPID == 0 && PgArchStartupAllowed()) - PgArchPID = pgarch_start(); + PgArchPID = pgarch_start(); /* If we need to signal the autovacuum launcher, do so now */ if (avlauncher_needs_signal) @@@ -2486,17 -2462,8 +2590,16 @@@ SIGHUP_handler(SIGNAL_ARGS (errmsg("received SIGHUP, reloading configuration files"))); ProcessConfigFile(PGC_SIGHUP); SignalChildren(SIGHUP); - SignalUnconnectedWorkers(SIGHUP); if (StartupPID != 0) signal_child(StartupPID, SIGHUP); +#ifdef PGXC /* PGXC_COORD */ + if (PgPoolerPID != 0) + signal_child(PgPoolerPID, SIGHUP); +#endif /* PGXC */ +#ifdef XCP + if (ClusterMonPID != 0) + signal_child(ClusterMonPID, SIGHUP); +#endif if (BgWriterPID != 0) signal_child(BgWriterPID, SIGHUP); if (CheckpointerPID != 0) @@@ -2631,19 -2589,11 +2733,19 @@@ pmdie(SIGNAL_ARGS signal_child(BgWriterPID, SIGTERM); if (WalReceiverPID != 0) signal_child(WalReceiverPID, SIGTERM); +#ifdef XCP + /* and the pool manager too */ + if (PgPoolerPID != 0) + signal_child(PgPoolerPID, SIGTERM); + /* and the cluster monitor too */ + if (ClusterMonPID != 0) + signal_child(ClusterMonPID, SIGTERM); +#endif /* XCP */ - SignalUnconnectedWorkers(SIGTERM); if (pmState == PM_RECOVERY) { + SignalSomeChildren(SIGTERM, BACKEND_TYPE_BGWORKER); /* - * Only startup, bgwriter, walreceiver, unconnected bgworkers, + * Only startup, bgwriter, walreceiver, possibly bgworkers, * and/or checkpointer should be active in this state; we just * signaled the first four, and we don't want to kill * checkpointer yet. @@@ -3657,14 -3544,7 +3761,13 @@@ PostmasterStateMachine(void * process. 
*/ if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_WORKER) == 0 && - CountUnconnectedWorkers() == 0 && StartupPID == 0 && +#ifdef PGXC + PgPoolerPID == 0 && +#endif +#ifdef XCP + ClusterMonPID == 0 && +#endif WalReceiverPID == 0 && BgWriterPID == 0 && (CheckpointerPID == 0 || @@@ -3975,15 -3818,11 +4047,19 @@@ TerminateChildren(int signal { SignalChildren(signal); if (StartupPID != 0) + { signal_child(StartupPID, signal); + if (signal == SIGQUIT || signal == SIGKILL) + StartupStatus = STARTUP_SIGNALED; + } +#ifdef PGXC /* PGXC_COORD */ + if (PgPoolerPID != 0) + signal_child(PgPoolerPID, SIGQUIT); +#endif +#ifdef XCP + if (ClusterMonPID != 0) + signal_child(ClusterMonPID, signal); +#endif if (BgWriterPID != 0) signal_child(BgWriterPID, signal); if (CheckpointerPID != 0) diff --cc src/backend/storage/lmgr/lwlock.c index c918f090da,7b8fb71ead..d8294fd342 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@@ -10,19 -10,16 +10,21 @@@ * locking should be done with the full lock manager --- which depends on * LWLocks to protect its shared state. * - * In addition to exclusive and shared modes, lightweight locks can be used - * to wait until a variable changes value. The variable is initially set - * when the lock is acquired with LWLockAcquireWithVar, and can be updated + * In addition to exclusive and shared modes, lightweight locks can be used to + * wait until a variable changes value. The variable is initially not set + * when the lock is acquired with LWLockAcquire, i.e. it remains set to the + * value it was set to when the lock was released last, and can be updated * without releasing the lock by calling LWLockUpdateVar. LWLockWaitForVar - * waits for the variable to be updated, or until the lock is free. The - * meaning of the variable is up to the caller, the lightweight lock code - * just assigns and compares it. + * waits for the variable to be updated, or until the lock is free. 
When + * releasing the lock with LWLockReleaseClearVar() the value can be set to an + * appropriate value for a free lock. The meaning of the variable is up to + * the caller, the lightweight lock code just assigns and compares it. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * diff --cc src/backend/utils/adt/ruleutils.c index f16854306d,51391f6a4e..9fe7ec2987 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@@ -380,15 -348,9 +379,13 @@@ static void make_ruledef(StringInfo buf int prettyFlags); static void make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, int prettyFlags, int wrapColumn); - static void get_tablesample_def(TableSampleClause *tablesample, - deparse_context *context); static void get_query_def(Query *query, StringInfo buf, List *parentnamespace, TupleDesc resultDesc, - int prettyFlags, int wrapColumn, int startIndent); + int prettyFlags, int wrapColumn, int startIndent +#ifdef PGXC + , bool finalise_aggregates, bool sortgroup_colno +#endif /* PGXC */ + ); static void get_values_def(List *values_lists, deparse_context *context); static void get_with_clause(Query *query, deparse_context *context); static void get_select_query_def(Query *query, deparse_context *context, @@@ -4416,224 -4234,7 +4415,180 @@@ make_viewdef(StringInfo buf, HeapTuple heap_close(ev_relation, AccessShareLock); } - /* ---------- - * get_tablesample_def - Convert TableSampleClause back to SQL - * ---------- - */ - static void - get_tablesample_def(TableSampleClause *tablesample, deparse_context *context) - { - StringInfo buf = 
context->buf; - HeapTuple tuple; - Form_pg_tablesample_method tsm; - char *tsmname; - int nargs; - ListCell *l; - - /* Load the tablesample method */ - tuple = SearchSysCache1(TABLESAMPLEMETHODOID, ObjectIdGetDatum(tablesample->tsmid)); - if (!HeapTupleIsValid(tuple)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("cache lookup failed for tablesample method %u", - tablesample->tsmid))); - - tsm = (Form_pg_tablesample_method) GETSTRUCT(tuple); - tsmname = NameStr(tsm->tsmname); - appendStringInfo(buf, " TABLESAMPLE %s (", quote_identifier(tsmname)); - - ReleaseSysCache(tuple); - - nargs = 0; - foreach(l, tablesample->args) - { - if (nargs++ > 0) - appendStringInfoString(buf, ", "); - get_rule_expr((Node *) lfirst(l), context, true); - } - appendStringInfoChar(buf, ')'); - - if (tablesample->repeatable != NULL) - { - appendStringInfoString(buf, " REPEATABLE ("); - get_rule_expr(tablesample->repeatable, context, true); - appendStringInfoChar(buf, ')'); - } - } +#ifdef PGXC +/* ---------- + * deparse_query - Parse back one query parsetree + * + * Purpose of this function is to build up statement for a RemoteQuery + * It just calls get_query_def without pretty print flags + * ---------- + */ +void +deparse_query(Query *query, StringInfo buf, List *parentnamespace, + bool finalise_aggs, bool sortgroup_colno) +{ + get_query_def(query, buf, parentnamespace, NULL, 0, 0, 0, finalise_aggs, + sortgroup_colno); +} + +/* code borrowed from get_insert_query_def */ +void +get_query_def_from_valuesList(Query *query, StringInfo buf) +{ + + RangeTblEntry *select_rte = NULL; + RangeTblEntry *values_rte = NULL; + RangeTblEntry *rte; + char *sep; + ListCell *values_cell; + ListCell *l; + List *strippedexprs; + deparse_context context; + deparse_namespace dpns; + + /* + * Before we begin to examine the query, acquire locks on referenced + * relations, and fix up deleted columns in JOIN RTEs. This ensures + * consistent results. 
Note we assume it's OK to scribble on the passed + * querytree! + */ + AcquireRewriteLocks(query, false, false); + + context.buf = buf; + context.namespaces = NIL; + context.windowClause = NIL; + context.windowTList = NIL; + context.varprefix = (list_length(query->rtable) != 1); + context.prettyFlags = 0; + context.indentLevel = 0; + context.wrapColumn = 0; + + dpns.rtable = query->rtable; + dpns.ctes = query->cteList; + dpns.planstate = NULL; + dpns.ancestors = NIL; + dpns.outer_planstate = dpns.inner_planstate = NULL; + dpns.remotequery = false; + + /* + * If it's an INSERT ... SELECT or VALUES (...), (...), ... there will be + * a single RTE for the SELECT or VALUES. + */ + foreach(l, query->rtable) + { + rte = (RangeTblEntry *) lfirst(l); + + if (rte->rtekind == RTE_SUBQUERY) + { + if (select_rte) + elog(ERROR, "too many subquery RTEs in INSERT"); + select_rte = rte; + } + + if (rte->rtekind == RTE_VALUES) + { + if (values_rte) + elog(ERROR, "too many values RTEs in INSERT"); + values_rte = rte; + } + } + if (select_rte && values_rte) + elog(ERROR, "both subquery and values RTEs in INSERT"); + + /* + * Start the query with INSERT INTO relname + */ + rte = rt_fetch(query->resultRelation, query->rtable); + Assert(rte->rtekind == RTE_RELATION); + + appendStringInfo(buf, "INSERT INTO %s (", + generate_relation_name(rte->relid, NIL)); + + /* + * Add the insert-column-names list. To handle indirection properly, we + * need to look for indirection nodes in the top targetlist (if it's + * INSERT ... SELECT or INSERT ... single VALUES), or in the first + * expression list of the VALUES RTE (if it's INSERT ... multi VALUES). We + * assume that all the expression lists will have similar indirection in + * the latter case. 
+ */ + if (values_rte) + values_cell = list_head((List *) linitial(values_rte->values_lists)); + else + values_cell = NULL; + strippedexprs = NIL; + sep = ""; + foreach(l, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + + elog(DEBUG1, "targetEntry type is %d\n)", tle->expr->type); + if (tle->resjunk || !IsA(tle->expr, Var)) + continue; /* ignore junk entries */ + + appendStringInfoString(buf, sep); + sep = ", "; + + /* + * Put out name of target column; look in the catalogs, not at + * tle->resname, since resname will fail to track RENAME. + */ + appendStringInfoString(buf,quote_identifier(get_relid_attribute_name(rte->relid, tle->resno))); + + /* + * Print any indirection needed (subfields or subscripts), and strip + * off the top-level nodes representing the indirection assignments. + */ + if (values_cell) + { + /* we discard the stripped expression in this case */ + processIndirection((Node *) lfirst(values_cell), &context, true); + values_cell = lnext(values_cell); + } + else + { + /* we keep a list of the stripped expressions in this case */ + strippedexprs = lappend(strippedexprs, processIndirection((Node *) tle->expr, &context, true)); + } + } + appendStringInfo(buf, ") "); + + if (select_rte) + { + /* Add the SELECT */ + get_query_def(select_rte->subquery, buf, NIL, NULL, + context.prettyFlags, context.wrapColumn, + context.indentLevel, + context.finalise_aggs, context.sortgroup_colno); + } + else if (values_rte) + { + /* A WITH clause is possible here */ + get_with_clause(query, &context); + /* Add the multi-VALUES expression lists */ + get_values_def(values_rte->values_lists, &context); + } + else + { + /* A WITH clause is possible here */ + get_with_clause(query, &context); + /* Add the single-VALUES expression list */ + appendContextKeyword(&context, "VALUES (", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); + get_rule_expr((Node *) strippedexprs, &context, false); + appendStringInfoChar(buf, ')'); + } + + /* Add RETURNING if 
present */ + if (query->returningList) + { + appendContextKeyword(&context, " RETURNING", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_target_list(query->returningList, &context, NULL); + } +} +#endif /* ---------- * get_query_def - Parse back one query parsetree * diff --cc src/backend/utils/cache/lsyscache.c index f60f20fdf4,1dc293297d..ad2fc92b52 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@@ -37,13 -32,7 +37,12 @@@ #include "catalog/pg_range.h" #include "catalog/pg_statistic.h" #include "catalog/pg_transform.h" - #include "catalog/pg_tablesample_method.h" #include "catalog/pg_type.h" +#ifdef PGXC +#include "catalog/pgxc_class.h" +#include "catalog/pgxc_node.h" +#include "catalog/pgxc_group.h" +#endif #include "miscadmin.h" #include "nodes/makefuncs.h" #include "utils/array.h" diff --cc src/backend/utils/init/miscinit.c index 136b81cb71,fb3cb6eb3d..cd6536f25a --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@@ -1138,15 -1022,13 +1154,19 @@@ CreateLockFile(const char *filename, bo if (lock_files == NIL) on_proc_exit(UnlinkLockFiles, 0); - lock_files = lappend(lock_files, pstrdup(filename)); + /* + * Use lcons so that the lock files are unlinked in reverse order of + * creation; this is critical! + */ + lock_files = lcons(pstrdup(filename), lock_files); } +void +ForgetLockFiles() +{ + lock_files = NIL; +} + /* * Create the data directory lockfile. * diff --cc src/backend/utils/misc/guc.c index bf80a0ed60,ada3e1af93..e735c0ac17 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@@ -429,35 -379,6 +429,19 @@@ static const struct config_enum_entry h {NULL, 0, false} }; - /* - * Although only "on", "off", and "force" are documented, we - * accept all the likely variants of "on" and "off". 
- */ - static const struct config_enum_entry row_security_options[] = { - {"on", ROW_SECURITY_ON, false}, - {"off", ROW_SECURITY_OFF, false}, - {"force", ROW_SECURITY_FORCE, false}, - {"true", ROW_SECURITY_ON, true}, - {"false", ROW_SECURITY_OFF, true}, - {"yes", ROW_SECURITY_ON, true}, - {"no", ROW_SECURITY_OFF, true}, - {"1", ROW_SECURITY_ON, true}, - {"0", ROW_SECURITY_OFF, true}, - {NULL, 0, false} - }; + +#ifdef XCP +/* + * Set global-snapshot source. 'gtm' is default, but user can choose + * 'coordinator' for performance improvement at the cost of reduced consistency + */ +static const struct config_enum_entry global_snapshot_source_options[] = { + {"gtm", GLOBAL_SNAPSHOT_SOURCE_GTM, true}, + {"coordinator", GLOBAL_SNAPSHOT_SOURCE_COORDINATOR, true}, + {NULL, 0, false} +}; +#endif + /* * Options for enum values stored in other modules */ @@@ -519,11 -436,6 +504,9 @@@ int tcp_keepalives_idle int tcp_keepalives_interval; int tcp_keepalives_count; +#ifdef XCP +char *storm_catalog_remap_string; +#endif - int row_security; - /* * This really belongs in pg_shmem.c, but is defined here so that it doesn't * need to be duplicated in all the different implementations of pg_shmem.c. @@@ -4054,30 -3624,6 +4037,20 @@@ static struct config_enum ConfigureName NULL, NULL, NULL }, - { - {"row_security", PGC_USERSET, CONN_AUTH_SECURITY, - gettext_noop("Enable row security."), - gettext_noop("When enabled, row security will be applied to all users.") - }, - &row_security, - ROW_SECURITY_ON, row_security_options, - NULL, NULL, NULL - }, - +#ifdef XCP + { + {"global_snapshot_source", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("Set preferred source of a snapshot."), + gettext_noop("When set to 'coordinator', a snapshot is taken at " + "the coordinator at the risk of reduced consistency. 
" + "Default is 'gtm'") + }, + &GlobalSnapshotSource, + GLOBAL_SNAPSHOT_SOURCE_GTM, global_snapshot_source_options, + NULL, NULL, NULL + }, +#endif + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL diff --cc src/bin/pgbench/pgbench.c index 687becdead,f7125d25b0..4b959399d0 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@@ -431,13 -388,9 +431,12 @@@ usage(void " -D, --define=VARNAME=VALUE\n" " define variable for use by custom script\n" " -f, --file=FILENAME read transaction script from FILENAME\n" +#ifdef PGXC + " -k query with default key and additional key branch id (bid)\n" +#endif " -j, --jobs=NUM number of threads (default: 1)\n" " -l, --log write transaction times to log file\n" - " -L, --latency-limit=NUM count transactions lasting more than NUM ms\n" - " as late.\n" + " -L, --latency-limit=NUM count transactions lasting more than NUM ms as late\n" " -M, --protocol=simple|extended|prepared\n" " protocol for submitting queries (default: simple)\n" " -n, --no-vacuum do not run VACUUM before tests\n" diff --cc src/include/catalog/catversion.h index c03ebbbb2a,e3b567a0e3..33c27d5d48 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@@ -53,6 -53,6 +53,6 @@@ */ /* yyyymmddN */ - #define CATALOG_VERSION_NO 201509041 -#define CATALOG_VERSION_NO 201510051 ++#define CATALOG_VERSION_NO 201512101 #endif diff --cc src/include/catalog/dependency.h index 523639ebeb,fbcf904432..ddd268d40a --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@@ -142,14 -146,9 +147,12 @@@ typedef enum ObjectClas OCLASS_ROLE, /* pg_authid */ OCLASS_DATABASE, /* pg_database */ OCLASS_TBLSPACE, /* pg_tablespace */ - OCLASS_FDW, /* pg_foreign_data_wrapper */ - OCLASS_FOREIGN_SERVER, /* pg_foreign_server */ - OCLASS_USER_MAPPING, /* pg_user_mapping */ + OCLASS_FDW, /* pg_foreign_data_wrapper */ + OCLASS_FOREIGN_SERVER, /* pg_foreign_server */ + OCLASS_USER_MAPPING, /* pg_user_mapping */ 
+#ifdef PGXC + OCLASS_PGXC_CLASS, /* pgxc_class */ - OCLASS_PGXC_NODE, /* pgxc_node */ - OCLASS_PGXC_GROUP, /* pgxc_group */ +#endif OCLASS_DEFACL, /* pg_default_acl */ OCLASS_EXTENSION, /* pg_extension */ OCLASS_EVENT_TRIGGER, /* pg_event_trigger */ diff --cc src/include/pg_config.h.win32 index c50889b5fc,9664e1f1ed..8bc34c7e4f --- a/src/include/pg_config.h.win32 +++ b/src/include/pg_config.h.win32 @@@ -557,16 -557,16 +557,16 @@@ #define MEMSET_LOOP_LIMIT 1024 /* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "pgsql-bugs@postgresql.org" +#define PACKAGE_BUGREPORT "postgres-xl-bugs@lists.sourceforge.net" /* Define to the full name of this package. */ -#define PACKAGE_NAME "PostgreSQL" +#define PACKAGE_NAME "Postgres-XL" /* Define to the full name and version of this package. */ - #define PACKAGE_STRING "Postgres-XL 9.5alpha1" -#define PACKAGE_STRING "PostgreSQL 9.5beta1" ++#define PACKAGE_STRING "Postgres-XL 9.5beta1" /* Define to the version of this package. */ - #define PACKAGE_VERSION "9.5alpha1" + #define PACKAGE_VERSION "9.5beta1" /* Define to the name of a signed 128-bit integer type. 
*/ #undef PG_INT128_TYPE diff --cc src/include/storage/lwlock.h index 68bd919d0e,e360fc0393..9b10bac013 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@@ -137,25 -132,11 +137,27 @@@ extern PGDLLIMPORT LWLockPadded *MainLW #define AutoFileLock (&MainLWLockArray[35].lock) #define ReplicationSlotAllocationLock (&MainLWLockArray[36].lock) #define ReplicationSlotControlLock (&MainLWLockArray[37].lock) +#ifdef PGXC +#define BarrierLock (&MainLWLockArray[38].lock) +#define NodeTableLock (&MainLWLockArray[39].lock) +#define SQueuesLock (&MainLWLockArray[40].lock) +#define ClusterMonitorLock (&MainLWLockArray[41].lock) +#define CommitTsControlLock (&MainLWLockArray[42].lock) +#define CommitTsLock (&MainLWLockArray[43].lock) +#define ReplicationOriginLock (&MainLWLockArray[44].lock) ++#define MultiXactTruncationLock (&MainLWLockArray[45].lock) +#else #define CommitTsControlLock (&MainLWLockArray[38].lock) #define CommitTsLock (&MainLWLockArray[39].lock) #define ReplicationOriginLock (&MainLWLockArray[40].lock) + #define MultiXactTruncationLock (&MainLWLockArray[41].lock) +#endif + +#ifdef PGXC - #define NUM_INDIVIDUAL_LWLOCKS 45 ++#define NUM_INDIVIDUAL_LWLOCKS 46 +#else - #define NUM_INDIVIDUAL_LWLOCKS 41 + #define NUM_INDIVIDUAL_LWLOCKS 42 +#endif /* * It's a bit odd to declare NUM_BUFFER_PARTITIONS and NUM_LOCK_PARTITIONS diff --cc src/include/utils/lsyscache.h index 819383a16b,9711538432..641bc7ecc4 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@@ -181,23 -154,9 +181,22 @@@ extern void free_attstatsslot(Oid attty Datum *values, int nvalues, float4 *numbers, int nnumbers); extern char *get_namespace_name(Oid nspid); +#ifdef XCP +extern Oid get_namespaceid(const char *nspname); +extern char *get_typ_name(Oid typid); +extern Oid get_typ_namespace(Oid typid); +extern Oid get_typname_typid(const char *typname, Oid typnamespace); +extern Oid get_funcid(const char *funcname, oidvector *argtypes, Oid funcnsp); +extern Oid 
get_opnamespace(Oid opno); +extern Oid get_operid(const char *oprname, Oid oprleft, Oid oprright, Oid oprnsp); +#endif extern char *get_namespace_name_or_temp(Oid nspid); extern Oid get_range_subtype(Oid rangeOid); - extern char *get_tablesample_method_name(Oid tsmid); +#ifdef XCP +extern Oid get_tablesample_method_id(const char *methodname); +#endif + #define type_is_array(typid) (get_element_type(typid) != InvalidOid) /* type_is_array_domain accepts both plain arrays and domains over arrays */ #define type_is_array_domain(typid) (get_base_element_type(typid) != InvalidOid) diff --cc src/include/utils/plancache.h index 5a5759ff54,4b9a0c68ce..9cfe1a1622 --- a/src/include/utils/plancache.h +++ b/src/include/utils/plancache.h @@@ -114,12 -109,8 +114,11 @@@ typedef struct CachedPlanSourc double generic_cost; /* cost of generic plan, or -1 if not known */ double total_custom_cost; /* total cost of custom plans so far */ int num_custom_plans; /* number of plans included in total */ +#ifdef PGXC + char *stmt_name; /* If set, this is a copy of prepared stmt name */ +#endif - bool hasRowSecurity; /* planned with row security? */ - int row_security_env; /* row security setting when planned */ - bool rowSecurityDisabled; /* is row security disabled? */ + bool hasRowSecurity; /* planned with row security? 
*/ + bool row_security_env; /* row security setting when planned */ } CachedPlanSource; /* diff --cc src/test/regress/expected/alter_table.out index 8fb2894b1c,f78a4314e7..67f375ad1f --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@@ -433,21 -426,31 +433,34 @@@ explain (costs off, nodes off) select -- after validation, the constraint should be used alter table nv_child_2011 VALIDATE CONSTRAINT nv_child_2011_d_check; -explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; - QUERY PLAN ---------------------------------------------------------------------------- +explain (costs off, nodes off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------------- Append - -> Seq Scan on nv_parent - Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) - -> Seq Scan on nv_child_2010 - Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) - -> Seq Scan on nv_child_2009 - Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) -(7 rows) + -> Remote Subquery Scan on all + -> Seq Scan on nv_parent + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Remote Subquery Scan on all + -> Seq Scan on nv_child_2010 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Remote Subquery Scan on all + -> Seq Scan on nv_child_2009 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) +(10 rows) + -- add an inherited NOT VALID constraint + alter table nv_parent add check (d between '2001-01-01'::date and '2099-12-31'::date) not valid; + \d nv_child_2009 + Table "public.nv_child_2009" + Column | Type | Modifiers + --------+------+----------- + d | date | + Check constraints: + "nv_child_2009_d_check" CHECK (d >= '01-01-2009'::date AND d <= '12-31-2009'::date) + "nv_parent_d_check" CHECK (d >= 
'01-01-2001'::date AND d <= '12-31-2099'::date) NOT VALID + Inherits: nv_parent + + -- we leave nv_parent and children around to help test pg_dump logic -- Foreign key adding test with mixed types -- Note: these tables are TEMP to avoid name conflicts when this test -- is run in parallel with foreign_key.sql. diff --cc src/test/regress/expected/gist.out index 31e6be3733,c7181b0397..cf0fb39462 --- a/src/test/regress/expected/gist.out +++ b/src/test/regress/expected/gist.out @@@ -62,17 -61,16 +62,17 @@@ select p from gist_tbl where p <@ box(p -- Also test an index-only knn-search explain (costs off) select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)) -order by p <-> point(0.201, 0.201); - QUERY PLAN --------------------------------------------------------- - Index Only Scan using gist_tbl_point_index on gist_tbl - Index Cond: (p <@ '(0.5,0.5),(0,0)'::box) - Order By: (p <-> '(0.201,0.201)'::point) -(3 rows) +order by p <-> point(0.2, 0.2); + QUERY PLAN +-------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Index Only Scan using gist_tbl_point_index on gist_tbl + Index Cond: (p <@ '(0.5,0.5),(0,0)'::box) - Order By: (p <-> '(0.2,0.2)'::point) ++ Order By: (p <-> '(0.201,0.201)'::point) +(4 rows) select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)) - order by p <-> point(0.2, 0.2); + order by p <-> point(0.201, 0.201); p ------------- (0.2,0.2) @@@ -91,17 -89,16 +91,17 @@@ -- Check commuted case as well explain (costs off) select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)) -order by point(0.101, 0.101) <-> p; - QUERY PLAN --------------------------------------------------------- - Index Only Scan using gist_tbl_point_index on gist_tbl - Index Cond: (p <@ '(0.5,0.5),(0,0)'::box) - Order By: (p <-> '(0.101,0.101)'::point) -(3 rows) +order by point(0.1, 0.1) <-> p; + QUERY PLAN +-------------------------------------------------------------- + Remote Subquery 
Scan on all (datanode_1,datanode_2) + -> Index Only Scan using gist_tbl_point_index on gist_tbl + Index Cond: (p <@ '(0.5,0.5),(0,0)'::box) - Order By: (p <-> '(0.1,0.1)'::point) ++ Order By: (p <-> '(0.101,0.101)'::point) +(4 rows) select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)) - order by point(0.1, 0.1) <-> p; + order by point(0.101, 0.101) <-> p; p ------------- (0.1,0.1) diff --cc src/test/regress/expected/join.out index 1d3ea7b806,087cddf03a..e012bd540c --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@@ -2906,19 -2907,89 +2978,90 @@@ explain (costs off select * from tenk1, int8_tbl a, int8_tbl b where thousand = a.q1 and tenthous = b.q1 and a.q2 = 1 and b.q2 = 2; - QUERY PLAN ---------------------------------------------------------------------- - Nested Loop - -> Seq Scan on int8_tbl b - Filter: (q2 = 2) + QUERY PLAN +--------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) -> Nested Loop - -> Seq Scan on int8_tbl a - Filter: (q2 = 1) - -> Index Scan using tenk1_thous_tenthous on tenk1 - Index Cond: ((thousand = a.q1) AND (tenthous = b.q1)) -(8 rows) + -> Seq Scan on int8_tbl b + Filter: (q2 = 2) + -> Nested Loop + -> Seq Scan on int8_tbl a + Filter: (q2 = 1) + -> Index Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand = a.q1) AND (tenthous = b.q1)) +(9 rows) + -- + -- test a corner case in which we shouldn't apply the star-schema optimization + -- + explain (costs off) + select t1.unique2, t1.stringu1, t2.unique1, t2.stringu2 from + tenk1 t1 + inner join int4_tbl i1 + left join (select v1.x2, v2.y1, 11 AS d1 + from (values(1,0)) v1(x1,x2) + left join (values(3,1)) v2(y1,y2) + on v1.x1 = v2.y2) subq1 + on (i1.f1 = subq1.x2) + on (t1.unique2 = subq1.d1) + left join tenk1 t2 + on (subq1.y1 = t2.unique1) + where t1.unique2 < 42 and t1.stringu1 > t2.stringu2; + QUERY PLAN + 
----------------------------------------------------------------------- + Nested Loop + Join Filter: (t1.stringu1 > t2.stringu2) + -> Nested Loop + Join Filter: ((0) = i1.f1) + -> Nested Loop + -> Nested Loop + Join Filter: ((1) = (1)) + -> Result + -> Result + -> Index Scan using tenk1_unique2 on tenk1 t1 + Index Cond: ((unique2 = (11)) AND (unique2 < 42)) + -> Seq Scan on int4_tbl i1 + -> Index Scan using tenk1_unique1 on tenk1 t2 + Index Cond: (unique1 = (3)) + (14 rows) + + select t1.unique2, t1.stringu1, t2.unique1, t2.stringu2 from + tenk1 t1 + inner join int4_tbl i1 + left join (select v1.x2, v2.y1, 11 AS d1 + from (values(1,0)) v1(x1,x2) + left join (values(3,1)) v2(y1,y2) + on v1.x1 = v2.y2) subq1 + on (i1.f1 = subq1.x2) + on (t1.unique2 = subq1.d1) + left join tenk1 t2 + on (subq1.y1 = t2.unique1) + where t1.unique2 < 42 and t1.stringu1 > t2.stringu2; + unique2 | stringu1 | unique1 | stringu2 + ---------+----------+---------+---------- + 11 | WFAAAA | 3 | LKIAAA + (1 row) + + -- variant that isn't quite a star-schema case + select ss1.d1 from + tenk1 as t1 + inner join tenk1 as t2 + on t1.tenthous = t2.ten + inner join + int8_tbl as i8 + left join int4_tbl as i4 + inner join (select 64::information_schema.cardinal_number as d1 + from tenk1 t3, + lateral (select abs(t3.unique1) + random()) ss0(x) + where t3.fivethous < 0) as ss1 + on i4.f1 = ss1.d1 + on i8.q1 = i4.f1 + on t1.tenthous = ss1.d1 + where t1.unique1 < i4.f1; + d1 + ---- + (0 rows) + -- -- test extraction of restriction OR clauses from join OR clause -- (we used to only do this for indexable clauses) @@@ -3699,11 -3893,68 +4087,68 @@@ SELECT * FRO ---+------------------+-------------------+------------------ 1 | 123 | 456 | 123 1 | 123 | 4567890123456789 | 123 + 1 | 4567890123456789 | -4567890123456789 | 4567890123456789 1 | 4567890123456789 | 123 | 42 1 | 4567890123456789 | 4567890123456789 | 4567890123456789 - 1 | 4567890123456789 | -4567890123456789 | 4567890123456789 (5 rows) + rollback; + 
-- another join removal bug: we must clean up correctly when removing a PHV + begin; + create temp table uniquetbl (f1 text unique); + explain (costs off) + select t1.* from + uniquetbl as t1 + left join (select *, '***'::text as d1 from uniquetbl) t2 + on t1.f1 = t2.f1 + left join uniquetbl t3 + on t2.d1 = t3.f1; + QUERY PLAN + -------------------------- + Seq Scan on uniquetbl t1 + (1 row) + + explain (costs off) + select t0.* + from + text_tbl t0 + left join + (select case t1.ten when 0 then 'doh!'::text else null::text end as case1, + t1.stringu2 + from tenk1 t1 + join int4_tbl i4 ON i4.f1 = t1.unique2 + left join uniquetbl u1 ON u1.f1 = t1.string4) ss + on t0.f1 = ss.case1 + where ss.stringu2 !~* ss.case1; + QUERY PLAN + -------------------------------------------------------------------------------------------- + Nested Loop + Join Filter: (CASE t1.ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END = t0.f1) + -> Nested Loop + -> Seq Scan on int4_tbl i4 + -> Index Scan using tenk1_unique2 on tenk1 t1 + Index Cond: (unique2 = i4.f1) + Filter: (stringu2 !~* CASE ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END) + -> Materialize + -> Seq Scan on text_tbl t0 + (9 rows) + + select t0.* + from + text_tbl t0 + left join + (select case t1.ten when 0 then 'doh!'::text else null::text end as case1, + t1.stringu2 + from tenk1 t1 + join int4_tbl i4 ON i4.f1 = t1.unique2 + left join uniquetbl u1 ON u1.f1 = t1.string4) ss + on t0.f1 = ss.case1 + where ss.stringu2 !~* ss.case1; + f1 + ------ + doh! 
+ (1 row) + rollback; -- bug #8444: we've historically allowed duplicate aliases within aliased JOINs select * from diff --cc src/test/regress/expected/tablesample.out index 12987e0be3,727a835439..628b3cfb55 --- a/src/test/regress/expected/tablesample.out +++ b/src/test/regress/expected/tablesample.out @@@ -1,89 -1,95 +1,93 @@@ - CREATE TABLE test_tablesample (id int, name text) WITH (fillfactor=10); -- force smaller pages so we don't have to load too much data to get multiple pages - INSERT INTO test_tablesample SELECT i, repeat(i::text, 200) FROM generate_series(0, 9) s(i) ORDER BY i; - SELECT t.id FROM test_tablesample AS t TABLESAMPLE SYSTEM (50) REPEATABLE (10); + CREATE TABLE test_tablesample (id int, name text) WITH (fillfactor=10); + -- use fillfactor so we don't have to load too much data to get multiple pages + INSERT INTO test_tablesample + SELECT i, repeat(i::text, 200) FROM generate_series(0, 9) s(i); + SELECT t.id FROM test_tablesample AS t TABLESAMPLE SYSTEM (50) REPEATABLE (0); id ---- - 1 - 2 + 3 + 4 5 6 - 7 8 - 9 - 0 - 3 - 4 - 7 - (10 rows) + (6 rows) - SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (100.0/11) REPEATABLE (9999); + SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (100.0/11) REPEATABLE (0); id ---- - 6 - 8 - 9 - 7 - (4 rows) - - SELECT count(*) FROM test_tablesample TABLESAMPLE SYSTEM (100); - count - ------- - 10 - (1 row) + (0 rows) - SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (100); + SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (0); id ---- - 1 - 2 + 3 + 4 5 6 - 7 8 - 9 - 0 - 3 - 4 - 7 - (10 rows) + (6 rows) - SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (50) REPEATABLE (100); + SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (50) REPEATABLE (0); id ---- - 1 - 2 + 4 + 5 6 - 8 - 9 - 0 - 3 7 - (8 rows) + 8 + (5 rows) - SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (5.5) REPEATABLE (1); + SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (5.5) REPEATABLE 
(0); id ---- - 1 - 9 - 0 - (3 rows) + 7 + (1 row) + + -- 100% should give repeatable count results (ie, all rows) in any case + SELECT count(*) FROM test_tablesample TABLESAMPLE SYSTEM (100); + count + ------- + 10 + (1 row) - CREATE VIEW test_tablesample_v1 AS SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (10*2) REPEATABLE (2); - CREATE VIEW test_tablesample_v2 AS SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (99); - SELECT pg_get_viewdef('test_tablesample_v1'::regclass); - pg_get_viewdef - -------------------------------------------------------------------------------- - SELECT test_tablesample.id + - FROM test_tablesample TABLESAMPLE system (((10 * 2))::real) REPEATABLE (2); + SELECT count(*) FROM test_tablesample TABLESAMPLE SYSTEM (100) REPEATABLE (1+2); + count + ------- + 10 (1 row) - SELECT pg_get_viewdef('test_tablesample_v2'::regclass); - pg_get_viewdef - ----------------------------------------------------------- - SELECT test_tablesample.id + - FROM test_tablesample TABLESAMPLE system ((99)::real); + SELECT count(*) FROM test_tablesample TABLESAMPLE SYSTEM (100) REPEATABLE (0.4); + count + ------- + 10 (1 row) + CREATE VIEW test_tablesample_v1 AS + SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (10*2) REPEATABLE (2); + CREATE VIEW test_tablesample_v2 AS + SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (99); + \d+ test_tablesample_v1 + View "public.test_tablesample_v1" + Column | Type | Modifiers | Storage | Description + --------+---------+-----------+---------+------------- + id | integer | | plain | + View definition: + SELECT test_tablesample.id + FROM test_tablesample TABLESAMPLE system ((10 * 2)) REPEATABLE (2); + + \d+ test_tablesample_v2 + View "public.test_tablesample_v2" + Column | Type | Modifiers | Storage | Description + --------+---------+-----------+---------+------------- + id | integer | | plain | + View definition: + SELECT test_tablesample.id + FROM test_tablesample TABLESAMPLE system (99); + + -- check a sampled 
query doesn't affect cursor in progress BEGIN; - DECLARE tablesample_cur CURSOR FOR SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (100); + DECLARE tablesample_cur CURSOR FOR + SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (0); FETCH FIRST FROM tablesample_cur; id ---- diff --cc src/test/regress/serial_schedule index 59f4012a29,985f6c9f02..2568d6d641 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@@ -158,30 -158,3 +161,29 @@@ test: wit test: xml test: event_trigger test: stats - test: tablesample +test: xc_create_function +test: xc_groupby +test: xc_distkey +test: xc_having +test: xc_temp +test: xc_remote +test: xc_node +test: xc_FQS +test: xc_FQS_join +test: xc_misc +test: xc_copy +#test: xc_for_update +# crash when locking the rows. To be investigated and probably block a feature with "not supported" +test: xc_alter_table +test: xc_sequence +test: xc_prepared_xacts +test: xc_notrans_block +test: xl_primary_key +test: xl_foreign_key +test: xl_distribution_column_types +test: xl_alter_table +test: xl_distribution_column_types_modulo +test: xl_plan_pushdown +test: xl_functions +test: xl_limitations +test: xl_user_defined_functions diff --cc src/test/regress/sql/alter_table.sql index 7a04b883ef,ace5b6a25c..140eee93f6 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@@ -332,14 -332,18 +332,18 @@@ create table nv_child_2010 () inherits create table nv_child_2011 () inherits (nv_parent); alter table nv_child_2010 add check (d between '2010-01-01'::date and '2010-12-31'::date) not valid; alter table nv_child_2011 add check (d between '2011-01-01'::date and '2011-12-31'::date) not valid; -explain (costs off) select * from nv_parent where d between '2011-08-01' and '2011-08-31'; +explain (costs off, nodes off) select * from nv_parent where d between '2011-08-01' and '2011-08-31'; create table nv_child_2009 (check (d between '2009-01-01'::date and 
'2009-12-31'::date)) inherits (nv_parent); -explain (costs off) select * from nv_parent where d between '2011-08-01'::date and '2011-08-31'::date; -explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; +explain (costs off, nodes off) select * from nv_parent where d between '2011-08-01'::date and '2011-08-31'::date; +explain (costs off, nodes off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; -- after validation, the constraint should be used alter table nv_child_2011 VALIDATE CONSTRAINT nv_child_2011_d_check; -explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; +explain (costs off, nodes off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; + -- add an inherited NOT VALID constraint + alter table nv_parent add check (d between '2001-01-01'::date and '2099-12-31'::date) not valid; + \d nv_child_2009 + -- we leave nv_parent and children around to help test pg_dump logic -- Foreign key adding test with mixed types diff --cc src/test/regress/sql/arrays.sql index 3ea341476a,b1dd651440..640eab1355 --- a/src/test/regress/sql/arrays.sql +++ b/src/test/regress/sql/arrays.sql @@@ -315,8 -304,21 +315,21 @@@ insert into arr_tbl values ('{1,2,10}') set enable_seqscan to off; set enable_bitmapscan to off; -select * from arr_tbl where f1 > '{1,2,3}' and f1 <= '{1,5,3}'; -select * from arr_tbl where f1 >= '{1,2,3}' and f1 < '{1,5,3}'; +select * from arr_tbl where f1 > '{1,2,3}' and f1 <= '{1,5,3}' ORDER BY 1; +select * from arr_tbl where f1 >= '{1,2,3}' and f1 < '{1,5,3}' ORDER BY 1; + + -- test ON CONFLICT DO UPDATE with arrays + create temp table arr_pk_tbl (pk int4 primary key, f1 int[]); + insert into arr_pk_tbl values (1, '{1,2,3}'); + insert into arr_pk_tbl values (1, '{3,4,5}') on conflict (pk) + do update set f1[1] = excluded.f1[1], f1[3] = excluded.f1[3] + returning pk, f1; + insert into arr_pk_tbl(pk, 
f1[1:2]) values (1, '{6,7,8}') on conflict (pk) + do update set f1[1] = excluded.f1[1], + f1[2] = excluded.f1[2], + f1[3] = excluded.f1[3] + returning pk, f1; + -- note: if above selects don't produce the expected tuple order, -- then you didn't get an indexscan plan, and something is busted. reset enable_seqscan; diff --cc src/test/regress/sql/numeric.sql index 0eaaf99e55,5405ffadc7..081b421ab6 --- a/src/test/regress/sql/numeric.sql +++ b/src/test/regress/sql/numeric.sql @@@ -809,8 -809,20 +809,20 @@@ INSERT INTO num_input_test(n1) VALUES ( INSERT INTO num_input_test(n1) VALUES (''); INSERT INTO num_input_test(n1) VALUES (' N aN '); -SELECT * FROM num_input_test; +SELECT * FROM num_input_test ORDER BY n1; + -- + -- Test some corner cases for multiplication + -- + + select 4790999999999999999999999999999999999999999999999999999999999999999999999999999999999999 * 9999999999999999999999999999999999999999999999999999999999999999999999999999999999999999; + + select 4789999999999999999999999999999999999999999999999999999999999999999999999999999999999999 * 9999999999999999999999999999999999999999999999999999999999999999999999999999999999999999; + + select 4770999999999999999999999999999999999999999999999999999999999999999999999999999999999999 * 9999999999999999999999999999999999999999999999999999999999999999999999999999999999999999; + + select 4769999999999999999999999999999999999999999999999999999999999999999999999999999999999999 * 9999999999999999999999999999999999999999999999999999999999999999999999999999999999999999; + -- -- Test some corner cases for division --