-PostgreSQL Database Management System
-(formerly known as Postgres, then as Postgres95)
+Postgres-XL Cluster Database Management System
- Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+Portions Copyright (c) 2012-2014, TransLattice, Inc.
+Portions Copyright (c) 2010-2013, Postgres-XC Development Group
+ Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+Portions Copyright (c) 2015-1016, 2ndQuadrant Limited
Portions Copyright (c) 1994, The Regents of the University of California
-PostgreSQL Database Management System
-=====================================
+Postgres-XL Database Management System
+======================================
-This directory contains the source code distribution of the PostgreSQL
+This directory contains the source code distribution of the Postgres-XL
database management system.
-PostgreSQL is an advanced object-relational database management system
-that supports an extended subset of the SQL standard, including
-transactions, foreign keys, subqueries, triggers, user-defined types
-and functions. This distribution also contains C language bindings.
+Postgres-XL is an advanced object-relational cluster database management
+ system that supports an extended subset of the SQL standard, including
+transactions, foreign keys, user-defined types and functions. This
+distribution also contains C language bindings.
-PostgreSQL has many language interfaces, many of which are listed here:
+Postgres-XL has many language interfaces similar to PostgreSQL, many of
+which are listed here:
- http://www.postgresql.org/download
+ https://www.postgresql.org/download
See the file INSTALL for instructions on how to build and install
-PostgreSQL. That file also lists supported operating systems and
+Postgres-XL. That file also lists supported operating systems and
hardware platforms and contains information regarding any other
-software packages that are required to build or run the PostgreSQL
+software packages that are required to build or run the PostgreSQL-XL
system. Copyright and license information can be found in the
file COPYRIGHT. A comprehensive documentation set is included in this
distribution; it can be read as described in the installation
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
- # Generated by GNU Autoconf 2.69 for PostgreSQL 9.6beta4 (Postgres-XL 9.6alpha1).
-# Generated by GNU Autoconf 2.69 for PostgreSQL 10beta1.
++# Generated by GNU Autoconf 2.69 for PostgreSQL 10beta1 (Postgres-XL 10alpha1).
#
-# Report bugs to <pgsql-bugs@postgresql.org>.
+# Report bugs to <bugs@postgres-xl.org>.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
# Identity of this package.
PACKAGE_NAME='PostgreSQL'
PACKAGE_TARNAME='postgresql'
- PACKAGE_VERSION='9.6beta4 (Postgres-XL 9.6alpha1)'
- PACKAGE_XC_VERSION='9.6alpha1'
- PACKAGE_STRING='PostgreSQL 9.6beta4 (Postgres-XL 9.6alpha1)'
-PACKAGE_VERSION='10beta1'
-PACKAGE_STRING='PostgreSQL 10beta1'
++PACKAGE_VERSION='10beta1 (Postgres-XL 10alpha1)'
++PACKAGE_XC_VERSION='10alpha1'
++PACKAGE_STRING='PostgreSQL 10beta1 (Postgres-XL 10alpha1)'
PACKAGE_URL=''
ac_unique_file="src/backend/access/common/heaptuple.c"
with_wal_segsize
with_CC
enable_depend
+enable_genmsgids
enable_cassert
enable_thread_safety
+ with_icu
with_tcl
with_tclconfig
with_perl
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
- \`configure' configures PostgreSQL 9.6beta4 (Postgres-XL 9.6alpha1) to adapt to many kinds of systems.
-\`configure' configures PostgreSQL 10beta1 to adapt to many kinds of systems.
++\`configure' configures PostgreSQL 10beta1 (Postgres-XL 10alpha1) to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of PostgreSQL 9.6beta4 (Postgres-XL 9.6alpha1):";;
- short | recursive ) echo "Configuration of PostgreSQL 10beta1:";;
++ short | recursive ) echo "Configuration of PostgreSQL 10beta1 (Postgres-XL 10alpha1):";;
esac
cat <<\_ACEOF
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
- PostgreSQL configure 9.6beta4 (Postgres-XL 9.6alpha1)
-PostgreSQL configure 10beta1
++PostgreSQL configure 10beta1 (Postgres-XL 10alpha1)
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
- It was created by PostgreSQL $as_me 9.6beta4 (Postgres-XL 9.6alpha1), which was
-It was created by PostgreSQL $as_me 10beta1, which was
++It was created by PostgreSQL $as_me 10beta1 (Postgres-XL 10alpha1), which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
- This file was extended by PostgreSQL $as_me 9.6beta4 (Postgres-XL 9.6alpha1), which was
-This file was extended by PostgreSQL $as_me 10beta1, which was
++This file was extended by PostgreSQL $as_me 10beta1 (Postgres-XL 10alpha1), which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
- PostgreSQL config.status 9.6beta4 (Postgres-XL 9.6alpha1)
-PostgreSQL config.status 10beta1
++PostgreSQL config.status 10beta1 (Postgres-XL 10alpha1)
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
dnl
m4_pattern_forbid(^PGAC_)dnl to catch undefined macros
m4_if(m4_defn([m4_PACKAGE_VERSION]), [2.69], [], [m4_fatal([Autoconf version 2.69 is required.
Untested combinations of 'autoconf' and PostgreSQL versions are not
test_decoding \
tsm_system_rows \
tsm_system_time \
- tsearch2 \
unaccent \
- vacuumlo
+ vacuumlo \
+ stormstats
ifeq ($(with_openssl),yes)
SUBDIRS += sslinfo
static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags);
static void pgss_ExecutorRun(QueryDesc *queryDesc,
ScanDirection direction,
- uint64 count);
+ uint64 count, bool execute_once);
static void pgss_ExecutorFinish(QueryDesc *queryDesc);
static void pgss_ExecutorEnd(QueryDesc *queryDesc);
- static void pgss_ProcessUtility(Node *parsetree, const char *queryString,
+ static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
ProcessUtilityContext context, ParamListInfo params,
- DestReceiver *dest, char *completionTag);
+ QueryEnvironment *queryEnv,
+ DestReceiver *dest,
+#ifdef PGXC
+ bool sentToRemote,
+#endif /* PGXC */
+ char *completionTag);
static uint32 pgss_hash_fn(const void *key, Size keysize);
static int pgss_match_fn(const void *key1, const void *key2, Size keysize);
- static uint32 pgss_hash_string(const char *str);
+ static uint32 pgss_hash_string(const char *str, int len);
static void pgss_store(const char *query, uint32 queryId,
+ int query_location, int query_len,
double total_time, uint64 rows,
const BufferUsage *bufusage,
pgssJumbleState *jstate);
* ProcessUtility hook
*/
static void
- pgss_ProcessUtility(Node *parsetree, const char *queryString,
+ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
- ProcessUtilityContext context,
- ParamListInfo params, QueryEnvironment *queryEnv,
- DestReceiver *dest, char *completionTag)
+ ProcessUtilityContext context, ParamListInfo params,
++ QueryEnvironment *queryEnv,
+ DestReceiver *dest,
+#ifdef PGXC
+ bool sentToRemote,
+#endif /* PGXC */
+ char *completionTag)
{
+ Node *parsetree = pstmt->utilityStmt;
+
/*
* If it's an EXECUTE statement, we don't track it and don't increment the
* nesting level. This allows the cycles to be charged to the underlying
PG_TRY();
{
if (prev_ProcessUtility)
- prev_ProcessUtility(parsetree, queryString,
- context, params,
+ prev_ProcessUtility(pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif /* PGXC */
+ completionTag);
else
- standard_ProcessUtility(parsetree, queryString,
- context, params,
+ standard_ProcessUtility(pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif /* PGXC */
+ completionTag);
nested_level--;
}
PG_CATCH();
else
{
if (prev_ProcessUtility)
- prev_ProcessUtility(parsetree, queryString,
- context, params,
+ prev_ProcessUtility(pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif /* PGXC */
+ completionTag);
else
- standard_ProcessUtility(parsetree, queryString,
- context, params,
+ standard_ProcessUtility(pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif /* PGXC */
+ completionTag);
}
}
const char *queryString,
ProcessUtilityContext context,
ParamListInfo params,
+ QueryEnvironment *queryEnv,
DestReceiver *dest,
+#ifdef PGXC
+ bool sentToRemote,
+#endif /* PGXC */
char *completionTag)
{
+ Node *parsetree = pstmt->utilityStmt;
sepgsql_context_info_t saved_context_info = sepgsql_context_info;
ListCell *cell;
}
if (next_ProcessUtility_hook)
- (*next_ProcessUtility_hook) (parsetree, queryString,
- context, params,
+ (*next_ProcessUtility_hook) (pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif
+ completionTag);
else
- standard_ProcessUtility(parsetree, queryString,
- context, params,
+ standard_ProcessUtility(pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif
+ completionTag);
}
PG_CATCH();
{
Operating System (example: Linux 2.4.18) :
- PostgreSQL version (example: PostgreSQL 9.6beta4): Postgres-XL 9.6alpha1
- PostgreSQL version (example: PostgreSQL 10beta1): PostgreSQL 10beta1
++ PostgreSQL version (example: PostgreSQL 10beta1): Postgres-XL 10alpha1
Compiler used (example: gcc 3.3.5) :
<!ENTITY config SYSTEM "config.sgml">
<!ENTITY user-manag SYSTEM "user-manag.sgml">
<!ENTITY wal SYSTEM "wal.sgml">
+<!ENTITY add-node SYSTEM "add-node.sgml">
+<!ENTITY remove-node SYSTEM "remove-node.sgml">
+ <!ENTITY logical-replication SYSTEM "logical-replication.sgml">
<!-- programmer's guide -->
<!ENTITY bgworker SYSTEM "bgworker.sgml">
<!ENTITY sourcerepo SYSTEM "sourcerepo.sgml">
<!ENTITY release SYSTEM "release.sgml">
+ <!ENTITY release-10 SYSTEM "release-10.sgml">
<!ENTITY release-9.6 SYSTEM "release-9.6.sgml">
+<!ENTITY release-xl-9.5r1 SYSTEM "release-xl-9.5r1.sgml">
<!ENTITY release-9.5 SYSTEM "release-9.5.sgml">
<!ENTITY release-9.4 SYSTEM "release-9.4.sgml">
<!ENTITY release-9.3 SYSTEM "release-9.3.sgml">
needs to be archived.
</para>
+ <para>
+ Please note that these functions works just locally. To issue
+ these functions to another Coordinators or Datanodes, you should
+ issue these functions through <type>EXECUTE DIRECT</> statement.
+ </para>
+
<para>
- <function>pg_xlog_location_diff</> calculates the difference in bytes
- between two transaction log locations. It can be used with
+ <function>pg_wal_lsn_diff</> calculates the difference in bytes
+ between two write-ahead log locations. It can be used with
<structname>pg_stat_replication</structname> or some functions shown in
<xref linkend="functions-admin-backup-table"> to get the replication lag.
</para>
<term>Web Site</term>
<listitem>
<para>
- The <productname>Postgres-XL</productname>
- <ulink url="http://www.postgres-xl.org/">web site</ulink>
- The <productname>PostgreSQL</productname>
- <ulink url="https://www.postgresql.org">web site</ulink>
++ The <productname>Postgres-XL</productname>
++ <ulink url="http://www.postgres-xl.org">web site</ulink>
carries details on the latest release and other
information to make your work or play with
- <productname>PostgreSQL</productname> more productive.
+ <productname>Postgres-XL</productname> more productive.
</para>
</listitem>
</varlistentry>
<!-- doc/src/sgml/legal.sgml -->
- <date>2016</date>
+ <date>2017</date>
<copyright>
- <year>1996-2016</year>
+ <year>1996-2017</year>
<holder>The PostgreSQL Global Development Group</holder>
</copyright>
+<copyright>
+ <year>2014-2016</year>
+ <holder>Postgres-XL Development Group</holder>
+</copyright>
+<copyright>
+ <year>2009-2012</year>
+ <holder>Postgres-XC Development Group</holder>
+</copyright>
+<copyright>
+ <year>2012-2014</year>
+ <holder>TransLattice, Inc.</holder>
+</copyright>
+<copyright>
+ <year>2015-2016</year>
+ <holder>2ndQuadrant Ltd</holder>
+</copyright>
<legalnotice id="legalnotice">
<title>Legal Notice</title>
<secondary>of transaction IDs</secondary>
</indexterm>
+ <para>
+ Please note that this section describes the tasks of individual
+ Coordinators and Datanodes. It should be done for each of them.
+ </para>
+
<para>
- <productname>PostgreSQL</productname>'s MVCC transaction semantics
+ <productname>PostgreSQL</productname>'s
+ <link linkend="mvcc-intro">MVCC</link> transaction semantics
depend on being able to compare transaction ID (<acronym>XID</>)
numbers: a row version with an insertion XID greater than the current
transaction's XID is <quote>in the future</> and should not be visible
debugging purposes. All of these functions may be used only by superusers.
</para>
+ <para>
+ Functions of this module returns information about connecting Coordinators
+ locally. To get information from a specific a Datanode, you can use EXECUTE
+ DIRECT from a Coordinator.
+ </para>
+
<sect2>
- <title>Functions</title>
+ <title>General Functions</title>
<variablelist>
<varlistentry>
</para>
<para>
- By default public access is revoked from both of these, just in case there
- are security issues lurking.
+ By default use is restricted to superusers and members of the
+ <literal>pg_read_all_stats</literal> role. Access may be granted to others
+ using <command>GRANT</command>.
</para>
+ <para>
+ <filename>pg_buffercache</filename> returns information local to the
+ connecting Coordinator. To inquire information local to other node,
+ use <command>EXECUTE DIRECT</command>.
+ </para>
+
<sect2>
<title>The <structname>pg_buffercache</structname> View</title>
</para>
<para>
- By default public access is revoked from the functions, just in case
- there are security issues lurking.
+ By default use is restricted to superusers and members of the
+ <literal>pg_stat_scan_tables</literal> role. Access may be granted to others
+ using <command>GRANT</command>.
</para>
+ <para>
+ Functions of this module return information from the Coordinator that the
+ session is currently connected to. To get information about a Datanode, you
+ can use <command>EXECUTE DIRECT</command>.
+ </para>
+
<sect2>
<title>Functions</title>
locking information for a specified table.
</para>
+ <para>
+ By default use is restricted to superusers, members of the
+ <literal>pg_stat_scan_tables</literal> role, and users with
+ <literal>SELECT</literal> permissions on the table.
+ </para>
+
+ <para>
+ Functions of this module return information from the
+ Coordinator that the session is currently connect to.
+ To get information about a Datanode, you can
+ use <command>EXECUTE DIRECT</command>.
+ </para>
<sect2>
<title>Overview</title>
obtain tuple-level statistics.
</para>
+ <para>
+ As these functions return detailed page-level information, only the superuser
+ has EXECUTE privileges on them upon installation. After the functions have
+ been installed, users may issue <command>GRANT</command> commands to change
+ the privileges on the functions to allow non-superusers to execute them. Members
+ of the <literal>pg_stat_scan_tables</literal> role are granted access by default. See
+ the description of the <xref linkend="sql-grant"> command for specifics.
+ </para>
+ <para>
+ Functions of this module return information from the Coordinator that the
+ session is currently connected to. To get information about a Datanode, you
+ can use <command>EXECUTE DIRECT</command>.
+ </para>
<sect2>
<title>Functions</title>
By default, recovery will recover to the end of the WAL log. The
following parameters can be used to specify an earlier stopping point.
At most one of <varname>recovery_target</>,
- <varname>recovery_target_name</>, <varname>recovery_target_time</>,
- <varname>recovery_target_lsn</>, <varname>recovery_target_name</>,
- <varname>recovery_target_time</>, or <varname>recovery_target_xid</>
- can be used; if more than one of these is specified in the configuration
- file, the last entry will be used.
++ <varname>recovery_target_lsn</>, <varname>recovery_target_name</>, <varname>recovery_target_time</>,
+ <varname>recovery_target_xid</> or <varname>recovery_target_barrier</> can be used; if more than one of these
+ is specified in the configuration file, the last entry will be used.
</para>
<variablelist>
</para>
</listitem>
</varlistentry>
+ <varlistentry id="recovery-target-barrier" xreflabel="recovery_target_barrier">
+ <term><varname>recovery_target_barrier</varname> (<type>string</type>)
+ <indexterm>
+ <primary><varname>recovery_target_barrier</> recovery parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ This parameter specifies the barrier ID up to which recovery
+ will proceed. A global consistency is guaranteed when recovery is
+ stopped at a previously successfully completed barrier. At most
+ one of <varname>recovery_target_xid</>,
+ <xref linkend="recovery-target-time"> and
+ <varname>recovery_target_barrier</> can be specified.
++ </varlistentry>
+
+ <varlistentry id="recovery-target-lsn" xreflabel="recovery_target_lsn">
+ <term><varname>recovery_target_lsn</varname> (<type>pg_lsn</type>)
+ <indexterm>
+ <primary><varname>recovery_target_lsn</> recovery parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ This parameter specifies the LSN of the write-ahead log location up
+ to which recovery will proceed. The precise stopping point is also
+ influenced by <xref linkend="recovery-target-inclusive">. This
+ parameter is parsed using the system data type
+ <link linkend="datatype-pg-lsn"><type>pg_lsn</></link>.
</para>
</listitem>
</varlistentry>
<!-- applications and utilities -->
<!ENTITY clusterdb SYSTEM "clusterdb.sgml">
<!ENTITY createdb SYSTEM "createdb.sgml">
- <!ENTITY createlang SYSTEM "createlang.sgml">
<!ENTITY createuser SYSTEM "createuser.sgml">
<!ENTITY dropdb SYSTEM "dropdb.sgml">
- <!ENTITY droplang SYSTEM "droplang.sgml">
<!ENTITY dropuser SYSTEM "dropuser.sgml">
<!ENTITY ecpgRef SYSTEM "ecpg-ref.sgml">
+<!ENTITY gtm system "gtm.sgml">
+<!ENTITY gtmPxy system "gtm_proxy.sgml">
+<!ENTITY gtmCtl system "gtm_ctl.sgml">
<!ENTITY initdb SYSTEM "initdb.sgml">
+<!ENTITY initgtm SYSTEM "initgtm.sgml">
<!ENTITY pgarchivecleanup SYSTEM "pgarchivecleanup.sgml">
<!ENTITY pgBasebackup SYSTEM "pg_basebackup.sgml">
<!ENTITY pgbench SYSTEM "pgbench.sgml">
</varlistentry>
<varlistentry>
+ <term><literal>DISTRIBUTE BY</literal></term>
+ <listitem>
+ <para>
+ This clause specifies how the table is distributed or replicated among Datanodes.
+ </para>
+
+ <variablelist>
+
+ <varlistentry>
+ <term><literal>REPLICATION</literal></term>
+ <listitem>
+ <para>
+ Each row of the table will be replicated into all the
+ Datanodes of the <productname>Postgres-XL</> database
+ cluster.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>ROUNDROBIN</literal></term>
+ <listitem>
+ <para>
+ Each row of the table will be placed in one of the Datanodes in a
+ round-robin manner. The value of the row will not be needed to
+ determine what Datanode to go.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>HASH ( <replaceable class="PARAMETER">column_name</> )</literal></term>
+ <listitem>
+ <para>
+ Each row of the table will be placed based on the hash value
+ of the specified column. Following type is allowed as
+ distribution column: INT8, INT2, OID, INT4, BOOL, INT2VECTOR,
+ OIDVECTOR, CHAR, NAME, TEXT, BPCHAR, BYTEA, VARCHAR, NUMERIC, MONEY,
+ ABSTIME, RELTIME, DATE, TIME,TIMESTAMP, TIMESTAMPTZ, INTERVAL, and TIMETZ.
+ </para>
+ <para>
+ Please note that floating point is not allowed as a basis of
+ the distribution column.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>MODULO ( <replaceable class="PARAMETER">column_name</> )</literal></term>
+ <listitem>
+ <para>
+ Each row of the table will be placed based on the modulo
+ of the specified column. Following type is allowed as
+ distribution column: INT8, INT2, INT4, BOOL, ABSTIME, RELTIME,
+ DATE.
+ </para>
+ <para>
+ Please note that floating point is not allowed as a basis of
+ the distribution column.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ <term><literal>ATTACH PARTITION <replaceable class="PARAMETER">partition_name</replaceable> FOR VALUES <replaceable class="PARAMETER">partition_bound_spec</replaceable></literal></term>
+ <listitem>
+ <para>
+ This form attaches an existing table (which might itself be partitioned)
+ as a partition of the target table using the same syntax for
+ <replaceable class="PARAMETER">partition_bound_spec</replaceable> as
+ <xref linkend="sql-createtable">. The partition bound specification
+ must correspond to the partitioning strategy and partition key of the
+ target table. The table to be attached must have all the same columns
+ as the target table and no more; moreover, the column types must also
+ match. Also, it must have all the <literal>NOT NULL</literal> and
+ <literal>CHECK</literal> constraints of the target table. Currently
+ <literal>UNIQUE</literal>, <literal>PRIMARY KEY</literal>, and
+ <literal>FOREIGN KEY</literal> constraints are not considered.
+ If any of the <literal>CHECK</literal> constraints of the table being
+ attached is marked <literal>NO INHERIT</literal>, the command will fail;
+ such a constraint must be recreated without the <literal>NO INHERIT</literal>
+ clause.
+ </para>
+
+ <para>
+ If the new partition is a regular table, a full table scan is performed
+ to check that no existing row in the table violates the partition
+ constraint. It is possible to avoid this scan by adding a valid
+ <literal>CHECK</literal> constraint to the table that would allow only
+ the rows satisfying the desired partition constraint before running this
+ command. It will be determined using such a constraint that the table
+ need not be scanned to validate the partition constraint. This does not
+ work, however, if any of the partition keys is an expression and the
+ partition does not accept <literal>NULL</literal> values. If attaching
+ a list partition that will not accept <literal>NULL</literal> values,
+ also add <literal>NOT NULL</literal> constraint to the partition key
+ column, unless it's an expression.
+ </para>
+
+ <para>
+ If the new partition is a foreign table, nothing is done to verify
+ that all the rows in the foreign table obey the partition constraint.
+ (See the discussion in <xref linkend="SQL-CREATEFOREIGNTABLE"> about
+ constraints on the foreign table.)
+ </para>
</listitem>
</varlistentry>
-
+ <varlistentry>
+ <term><literal>TO GROUP</literal></term>
+ <term><literal>TO NODE</literal></term>
+ <listitem>
+ <para>
+ This defines the list of nodes on which table data exists.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>ADD NODE</literal></term>
+ <listitem>
+ <para>
+ This adds a list of nodes where data of table is distributed
+ to the existing list. If the list of nodes added contains nodes
+ already used by table, an error is returned.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>DELETE NODE</literal></term>
+ <listitem>
+ <para>
+ This deletes a list of nodes where the data of a table is distributed
+ to the existing list. If the list of nodes deleted contains nodes not
+ used by table, an error is returned.
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><literal>DETACH PARTITION</literal> <replaceable class="PARAMETER">partition_name</replaceable></term>
+ <listitem>
+ <para>
+ This form detaches specified partition of the target table. The detached
+ partition continues to exist as a standalone table, but no longer has any
+ ties to the table from which it was detached.
+ </para>
+ </listitem>
+ </varlistentry>
</variablelist>
</para>
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><replaceable class="PARAMETER">nodename</replaceable></term>
+ <listitem>
+ <para>
+ It defines a <productname>Postgres-XL</productname> node of catalog pgxc_node.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><replaceable class="PARAMETER">groupname</replaceable></term>
+ <listitem>
+ <para>
+ It defines a <productname>Postgres-XL</productname> node group in catalog pgxc_group.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><replaceable class="PARAMETER">partition_name</replaceable></term>
+ <listitem>
+ <para>
+ The name of the table to attach as a new partition or to detach from this table.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><replaceable class="PARAMETER">partition_bound_spec</replaceable></term>
+ <listitem>
+ <para>
+ The partition bound specification for a new partition. Refer to
+ <xref linkend="sql-createtable"> for more details on the syntax of the same.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</refsect1>
ADD CONSTRAINT distributors_pkey PRIMARY KEY USING INDEX dist_id_temp_idx;
</programlisting></para>
+ <para>
+ To change the distribution type and the list of nodes where table data
+ is located:
+<programlisting>
+ALTER TABLE distributors TO NODE (dn1, dn7), DISTRIBUTE BY HASH(dist_id);
+</programlisting>
+ </para>
+
+ <para>
+ To add a node where data of table is distributed:
+<programlisting>
+ALTER TABLE distributors ADD NODE (dn9, dn14);
+</programlisting>
+ </para>
+
+ <para>
+ To remove a node where data of table is distributed:
+<programlisting>
+ALTER TABLE distributors DELETE NODE (dn4, dn0);
+</programlisting>
+ </para>
+
+ <para>
+ Attach a partition to range partitioned table:
+ <programlisting>
+ ALTER TABLE measurement
+ ATTACH PARTITION measurement_y2016m07 FOR VALUES FROM ('2016-07-01') TO ('2016-08-01');
+ </programlisting></para>
+
+ <para>
+ Attach a partition to list partitioned table:
+ <programlisting>
+ ALTER TABLE cities
+ ATTACH PARTITION cities_ab FOR VALUES IN ('a', 'b');
+ </programlisting></para>
+
+ <para>
+ Detach a partition from partitioned table:
+ <programlisting>
+ ALTER TABLE cities
+ DETACH PARTITION measurement_y2015m12;
+ </programlisting></para>
+
</refsect1>
<refsect1>
effect can be had using the OID feature.
</para>
</refsect2>
+
+ <refsect2>
+ <title><literal>PARTITION BY</> Clause</title>
+
+ <para>
+ The <literal>PARTITION BY</> clause is a
+ <productname>PostgreSQL</productname> extension.
+ </para>
+ </refsect2>
+
+ <refsect2>
+ <title><literal>PARTITION OF</> Clause</title>
+
+ <para>
+ The <literal>PARTITION OF</> clause is a
+ <productname>PostgreSQL</productname> extension.
+ </para>
+ </refsect2>
+
+ <refsect2>
+ <title><productname>Postgres-XL</> Specifics</title>
+
+ <para>
+ Currently, immutable, stable, volatile functions and nextval are allowed in DEFAULT clause.
+ as <literal>DEFAULT</> values.
+ </para>
+ <para>
+ <literal>PRIMARY KEY</> and foreign key must include the
+ distribution column.
+ </para>
+ <para>
+ <literal>TEMP</> tables and exclusion constraint are not supported
+ yet.
+ </para>
+ <para>
+ </para>
+ <para>
+ In <productname>Postgres-XL</>, OID is maintained locally in each
+ Datanode and Coordinator. The OID value may be inconsistent for rows
+ stored in different Datanodes.
+ </para>
+
+ </refsect2>
</refsect1>
<arg choice="plain"><option>i[mmediate]</option></arg>
</group>
</arg>
+ <arg choice="opt"><option>-W</option></arg>
+ <arg choice="opt"><option>-t</option> <replaceable>seconds</replaceable></arg>
+ <arg choice="opt"><option>-s</option></arg>
<arg choice="opt"><option>-o</option> <replaceable>options</replaceable></arg>
+ <arg>-Z <replaceable>nodeopt</replaceable></arg>
+ <arg choice="opt"><option>-c</option></arg>
</cmdsynopsis>
<cmdsynopsis>
utilities,
also uses the environment variables supported by <application>libpq</>
(see <xref linkend="libpq-envars">).
- For additional server variables, see <xref linkend="app-postgres">.
+ </para>
+
+ <para>
+ For additional variables that affect the server,
+ see <xref linkend="app-postgres">.
</para>
+
+ <para>
+ In <productname>Postgres-XL</>, this command controls individual Coordinator or Datanode.
+ </para>
</refsect1>
</para>
<para>
- In <productname>Postgres-XL</>, <command>pg_resetxlog</command>
+ <command>pg_resetwal</command> works only with servers of the same
+ major version.
+ </para>
++
++ <para>
++ In <productname>Postgres-XL</>, <command>pg_resetwal</command>
+ will only run locally for Coordinators and Datanodes. You should run it
+ for each Coordinator or Datanode manually.
+ </para>
</refsect1>
<refsect1>
The reason for splitting the release notes this way is so that appropriate
subsets can easily be copied into back branches.
-->
+ &release-10;
&release-9.6;
+&release-xl-9.5r1;
&release-9.5;
&release-9.4;
&release-9.3;
<productname>PostgreSQL</> release to a newer one.
</para>
+ <para>
+ Because <productname>Postgres-XL</>'s Coordinators and Datanodes
+ are essentially <productname>PostgreSQL</> servers, you can follw
+ the steps described below to upgrade each of them. Please note
+ that you should do this manually.
+ </para>
+
<para>
- <productname>PostgreSQL</> major versions are represented by the
- first two digit groups of the version number, e.g., 8.4.
- <productname>PostgreSQL</> minor versions are represented by the
- third group of version digits, e.g., 8.4.2 is the second minor
- release of 8.4. Minor releases never change the internal storage
- format and are always compatible with earlier and later minor
- releases of the same major version number, e.g., 8.4.2 is compatible
- with 8.4, 8.4.1 and 8.4.6. To update between compatible versions,
- you simply replace the executables while the server is down and
- restart the server. The data directory remains unchanged —
- minor upgrades are that simple.
+ Current <productname>PostgreSQL</productname> version numbers consist of a
+ major and a minor version number. For example, in the version number 10.1,
+ the 10 is the major version number and the 1 is the minor version number,
+ meaning this would be the first minor release of the major release 10. For
+ releases before <productname>PostgreSQL</productname> version 10.0, version
+ numbers consist of three numbers, for example, 9.5.3. In those cases, the
+ major version consists of the first two digit groups of the version number,
+ e.g., 9.5, and the minor version is the third number, e.g., 3, meaning this
+ would be the third minor release of the major release 9.5.
+ </para>
+
+ <para>
+ Minor releases never change the internal storage format and are always
+ compatible with earlier and later minor releases of the same major version
+ number. For example, version 10.1 is compatible with version 10.0 and
+ version 10.6. Similarly, for example, 9.5.3 is compatible with 9.5.0,
+ 9.5.1, and 9.5.6. To update between compatible versions, you simply
+ replace the executables while the server is down and restart the server.
+ The data directory remains unchanged — minor upgrades are that
+ simple.
</para>
<para>
<para>
The Git mirror can also be reached via the HTTP protocol, if for example
a firewall is blocking access to the Git protocol. Just change the URL
- prefix to <literal>http</>, as in:
+ prefix to <literal>https</>, as in:
<programlisting>
- git clone http://git.postgresql.org/git/postgres-xl.git
-git clone https://git.postgresql.org/git/postgresql.git
++git clone https://git.postgresql.org/git/postgres-xl.git
</programlisting>
The HTTP protocol is less efficient than the Git protocol, so it will be
backend/utils/mb/conversion_procs \
backend/snowball \
include \
- interfaces \
backend/replication/libpqwalreceiver \
+ backend/replication/pgoutput \
fe_utils \
bin \
pl \
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
+ifneq ($(PORTNAME), win32)
+override CFLAGS += $(PTHREAD_CFLAGS)
+endif
+
SUBDIRS = access bootstrap catalog parser commands executor foreign lib libpq \
- main nodes optimizer port postmaster regex replication rewrite \
- statistics storage tcop tsearch utils $(top_builddir)/src/timezone
+ pgxc main nodes optimizer port postmaster regex replication rewrite \
- storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq
++ statistics storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq
include $(srcdir)/common.mk
endif
endif
-OBJS = $(SUBDIROBJS) $(LOCALOBJS) $(top_builddir)/src/port/libpgport_srv.a \
- $(top_builddir)/src/common/libpgcommon_srv.a
+OBJS = $(SUBDIROBJS) $(LOCALOBJS) \
+ $(top_builddir)/src/port/libpgport_srv.a \
+ $(top_builddir)/src/common/libpgcommon_srv.a \
+ $(top_builddir)/src/interfaces/libpq/fe-connect.o \
+ $(top_builddir)/src/interfaces/libpq/fe-secure.o \
+ $(top_builddir)/src/interfaces/libpq/fe-misc.o \
+ $(top_builddir)/src/interfaces/libpq/fe-protocol3.o \
+ $(top_builddir)/src/interfaces/libpq/fe-protocol2.o \
+ $(top_builddir)/src/interfaces/libpq/fe-exec.o \
+ $(top_builddir)/src/interfaces/libpq/fe-auth.o \
+ $(top_builddir)/src/interfaces/libpq/pqexpbuffer.o \
++ $(top_builddir)/src/interfaces/libpq/fe-auth-scram.o \
+ $(top_builddir)/src/gtm/client/libgtmclient.a \
+ $(top_builddir)/src/gtm/common/libgtm.a \
+ $(top_builddir)/src/gtm/libpq/libpqcomm.a
+
+ifeq ($(with_openssl), yes)
+OBJS += $(top_builddir)/src/interfaces/libpq/fe-secure-openssl.o
+endif
# We put libpgport and libpgcommon into OBJS, so remove it from LIBS; also add
# libldap
ifneq ($(PORTNAME), aix)
postgres: $(OBJS)
- $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -L$(top_builddir)/src/gtm/libpg $(export_dynamic) $(call expand_subsys,$^) $(LIBS) -o $@
- $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) $(export_dynamic) $(call expand_subsys,$^) $(LIBS) $(ICU_LIBS) -o $@
++ $(CC) $(CFLAGS) $(LDFLAGS) $(LDFLAGS_EX) -L$(top_builddir)/src/gtm/libpg $(export_dynamic) $(call expand_subsys,$^) $(LIBS) $(ICU_LIBS) -o $@
endif
endif
* and we'd like to still refer to them via C struct offsets.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* clients and standalone backends are supported here).
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
#include "postgres.h"
#include "access/hash.h"
+ #include "utils/builtins.h"
+#ifdef PGXC
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/timestamp.h"
+#include "utils/date.h"
+#include "utils/nabstime.h"
+#endif
+
+ /*
+ * Datatype-specific hash functions.
+ *
+ * These support both hash indexes and hash joins.
+ *
+ * NOTE: some of these are also used by catcache operations, without
+ * any direct connection to hash indexes. Also, the common hash_any
+ * routine is also used by dynahash tables.
+ */
+
/* Note: this is used for both "char" and boolean datatypes */
Datum
hashchar(PG_FUNCTION_ARGS)
/* report the result */
return UInt32GetDatum(c);
}
- case INT2VECTOROID:
- return DirectFunctionCall1(hashint2vector, value);
+
+
+/*
+ * compute_hash()
+ * Generic hash function for all datatypes
+ *
+ * For the types that check the locator explicitly below, the type's hash
+ * function is applied only for LOCATOR_TYPE_HASH; otherwise (e.g. modulo
+ * distribution) the raw value itself is returned as the Datum.  For the
+ * remaining types the hash function is always applied.
+ */
+Datum
+compute_hash(Oid type, Datum value, char locator)
+{
+ int16 tmp16;
+ int32 tmp32;
+ int64 tmp64;
+ Oid tmpoid;
+ char tmpch;
+
+ switch (type)
+ {
+ case INT8OID:
+ /* Fetch the value as int64 first; this gives the added
+ * advantage that a = 8446744073709551359 and
+ * a = 8446744073709551359::int8 both work. */
+ tmp64 = DatumGetInt64(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashint8, value);
+ /* NOTE(review): returning an int64 as Datum may truncate on
+ * 32-bit platforms -- confirm intended for modulo locators. */
+ return tmp64;
+ case INT2OID:
+ tmp16 = DatumGetInt16(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashint2, tmp16);
+ return tmp16;
+ case OIDOID:
+ tmpoid = DatumGetObjectId(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashoid, tmpoid);
+ return tmpoid;
+ case INT4OID:
+ tmp32 = DatumGetInt32(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashint4, tmp32);
+ return tmp32;
+ case BOOLOID:
+ tmpch = DatumGetBool(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashchar, tmpch);
+ return tmpch;
+
+ /* Types below always go through the hash function, whatever the
+ * locator type. */
+ case CHAROID:
+ return DirectFunctionCall1(hashchar, value);
+ case NAMEOID:
+ return DirectFunctionCall1(hashname, value);
- case INT2VECTOROID:
- return "hashint2vector";
+
+ case VARCHAROID:
+ case TEXTOID:
+ return DirectFunctionCall1(hashtext, value);
+
+ case OIDVECTOROID:
+ return DirectFunctionCall1(hashoidvector, value);
+ case FLOAT4OID:
+ return DirectFunctionCall1(hashfloat4, value);
+ case FLOAT8OID:
+ return DirectFunctionCall1(hashfloat8, value);
+
+ /* abstime/reltime/date reduce to int32 and share hashint4. */
+ case ABSTIMEOID:
+ tmp32 = DatumGetAbsoluteTime(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashint4, tmp32);
+ return tmp32;
+ case RELTIMEOID:
+ tmp32 = DatumGetRelativeTime(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashint4, tmp32);
+ return tmp32;
+ case CASHOID:
+ return DirectFunctionCall1(hashint8, value);
+
+ case BPCHAROID:
+ return DirectFunctionCall1(hashbpchar, value);
+ case BYTEAOID:
+ return DirectFunctionCall1(hashvarlena, value);
+
+ case DATEOID:
+ tmp32 = DatumGetDateADT(value);
+ if (locator == LOCATOR_TYPE_HASH)
+ return DirectFunctionCall1(hashint4, tmp32);
+ return tmp32;
+ case TIMEOID:
+ return DirectFunctionCall1(time_hash, value);
+ case TIMESTAMPOID:
+ return DirectFunctionCall1(timestamp_hash, value);
+ case TIMESTAMPTZOID:
+ return DirectFunctionCall1(timestamp_hash, value);
+ case INTERVALOID:
+ return DirectFunctionCall1(interval_hash, value);
+ case TIMETZOID:
+ return DirectFunctionCall1(timetz_hash, value);
+
+ case NUMERICOID:
+ return DirectFunctionCall1(hash_numeric, value);
+ default:
+ ereport(ERROR,(errmsg("Unhandled datatype for modulo or hash distribution\n")));
+ }
+ /* Control should not come here. */
+ ereport(ERROR,(errmsg("Unhandled datatype for modulo or hash distribution\n")));
+ /* Keep compiler silent */
+ return (Datum)0;
+}
+
+
+/*
+ * get_compute_hash_function
+ * Return the name of the hash function used for the given type and
+ * locator, or NULL when no function is needed and the raw value is used
+ * directly (the non-LOCATOR_TYPE_HASH cases for integer-like types).
+ *
+ * This mapping must stay in sync with compute_hash() above.
+ */
+char *
+get_compute_hash_function(Oid type, char locator)
+{
+ switch (type)
+ {
+ case INT8OID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashint8";
+ return NULL;
+ case INT2OID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashint2";
+ return NULL;
+ case OIDOID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashoid";
+ return NULL;
+ case DATEOID:
+ case INT4OID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashint4";
+ return NULL;
+ case BOOLOID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashchar";
+ return NULL;
+ /* Types below always need their hash function, whatever the
+ * locator type (matches compute_hash). */
+ case CHAROID:
+ return "hashchar";
+ case NAMEOID:
+ return "hashname";
+ case VARCHAROID:
+ case TEXTOID:
+ return "hashtext";
+ case OIDVECTOROID:
+ return "hashoidvector";
+ case FLOAT4OID:
+ return "hashfloat4";
+ case FLOAT8OID:
+ return "hashfloat8";
+ case RELTIMEOID:
+ case ABSTIMEOID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashint4";
+ return NULL;
+ case CASHOID:
+ return "hashint8";
+ case BPCHAROID:
+ return "hashbpchar";
+ case BYTEAOID:
+ return "hashvarlena";
+ case TIMEOID:
+ return "time_hash";
+ case TIMESTAMPOID:
+ case TIMESTAMPTZOID:
+ return "timestamp_hash";
+ case INTERVALOID:
+ return "interval_hash";
+ case TIMETZOID:
+ return "timetz_hash";
+ case NUMERICOID:
+ return "hash_numeric";
+ default:
+ ereport(ERROR,(errmsg("Unhandled datatype for modulo or hash distribution\n")));
+ }
+
+ /* Keep compiler quiet */
+ return NULL;
+}
+#endif
* for aborts (whether sync or async), since the post-crash assumption would
* be that such transactions failed anyway.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/backend/access/transam/clog.c
*
#include "libpq/pqmq.h"
#include "miscadmin.h"
#include "optimizer/planmain.h"
+ #include "pgstat.h"
+#include "pgxc/pgxcnode.h"
#include "storage/ipc.h"
#include "storage/sinval.h"
#include "storage/spin.h"
# must set a recovery target.
#
# You may set a recovery target either by transactionId, by name,
- # or by timestamp or by barrier. Recovery may either include or exclude the
- # transaction(s) with the recovery target value (ie, stop either
- # just after or just before the given target, respectively). In case of
- # barrier, the recovery stops exactly at that point.
-# by timestamp or by WAL location (LSN). Recovery may either include or
-# exclude the transaction(s) with the recovery target value (ie, stop either
-# just after or just before the given target, respectively).
++# or by timestamp or by WAL location (LSN) or by barrier. Recovery may either
++# include or exclude the transaction(s) with the recovery target value (ie,
++# stop either just after or just before the given target, respectively). In
++# case of barrier, the recovery stops exactly at that point.
#
#
#recovery_target_name = '' # e.g. 'daily backup 2011-01-26'
#
#recovery_target_xid = ''
#
+#recovery_target_barrier = ''
+#
+ #recovery_target_lsn = '' # e.g. '0/70006B8'
+ #
#recovery_target_inclusive = true
#
#
* data across crashes. During database startup, we simply force the
* currently-active page of SUBTRANS to zeroes.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/backend/access/transam/subtrans.c
*
* twophase.c
* Two-phase commit support functions.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* IDENTIFICATION
* src/backend/access/transam/twophase.c
* varsup.c
* postgres OID & XID variables support routines
*
- * Copyright (c) 2000-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ * Copyright (c) 2000-2017, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/access/transam/varsup.c
*
* See src/backend/access/transam/README for more information.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
static void AtSubStart_Memory(void);
static void AtSubStart_ResourceOwner(void);
+#ifdef XCP
+static void AtSubCommit_WaitedXids(void);
+static void AtSubAbort_WaitedXids(void);
+static void AtEOXact_WaitedXids(void);
+static void TransactionRecordXidWait_Internal(TransactionState s,
+ TransactionId xid);
+#endif
+
static void ShowTransactionState(const char *str);
- static void ShowTransactionStateRec(TransactionState state);
+ static void ShowTransactionStateRec(const char *str, TransactionState state);
static const char *BlockStateAsString(TBlockState blockState);
static const char *TransStateAsString(TransState state);
+static void PrepareTransaction(void);
+static void AtEOXact_GlobalTxn(bool commit);
/* ----------------------------------------------------------------
{
s->startedInRecovery = false;
XactReadOnly = DefaultXactReadOnly;
+#ifdef PGXC
+ /* Save Postgres-XC session as read-only if necessary */
+ XactReadOnly |= IsPGXCNodeXactReadOnly();
+#endif
}
XactDeferrable = DefaultXactDeferrable;
+#ifdef PGXC
+ /* PGXCTODO - PGXC doesn't support 9.1 serializable transactions. They
+ * are silently turned into repeatable-read, which is the same as the
+ * pre-9.1 serializable isolation level.
+ */
+ if (DefaultXactIsoLevel == XACT_SERIALIZABLE)
+ DefaultXactIsoLevel = XACT_REPEATABLE_READ;
+#endif
XactIsoLevel = DefaultXactIsoLevel;
forceSyncCommit = false;
- MyXactAccessedTempRel = false;
+ XactLocalNodePrepared = false;
+ MyXactFlags = 0;
/*
* reinitialize within-transaction counters
if (!is_parallel_worker)
{
/*
- * We need to mark our XIDs as committed in pg_clog. This is where we
+ * We need to mark our XIDs as committed in pg_xact. This is where we
* durably commit.
*/
- latestXid = RecordTransactionCommit();
+#ifdef XCP
+ latestXid = InvalidTransactionId;
+ if (!IsConnFromDatanode())
+#endif
+ latestXid = RecordTransactionCommit();
}
else
{
#include "catalog/pg_database.h"
#include "commands/tablespace.h"
#include "miscadmin.h"
+#ifdef PGXC
+#include "pgxc/barrier.h"
+#endif
#include "pgstat.h"
+ #include "port/atomics.h"
#include "postmaster/bgwriter.h"
#include "postmaster/walwriter.h"
#include "postmaster/startup.h"
static RecoveryTargetAction recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
static TransactionId recoveryTargetXid;
static TimestampTz recoveryTargetTime;
+static char *recoveryTargetBarrierId;
static char *recoveryTargetName;
+ static XLogRecPtr recoveryTargetLSN;
static int recovery_min_apply_delay = 0;
static TimestampTz recoveryDelayUntilTime;
recoveryStopAfter = false;
recoveryStopXid = InvalidTransactionId;
+ recoveryStopLSN = InvalidXLogRecPtr;
+ recoveryStopTime = 0;
+ recoveryStopName[0] = '\0';
+ return true;
+ }
+
+ /* Check if target LSN has been reached */
+ if (recoveryTarget == RECOVERY_TARGET_LSN &&
+ !recoveryTargetInclusive &&
+ record->ReadRecPtr >= recoveryTargetLSN)
+ {
+ recoveryStopAfter = false;
+ recoveryStopXid = InvalidTransactionId;
+ recoveryStopLSN = record->ReadRecPtr;
recoveryStopTime = 0;
recoveryStopName[0] = '\0';
+ ereport(LOG,
+ (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
+ (uint32) (recoveryStopLSN >> 32),
+ (uint32) recoveryStopLSN)));
return true;
}
-
+#ifdef PGXC
+ /* Otherwise we only consider stopping before COMMIT, ABORT or BARRIER records. */
+ if ((XLogRecGetRmid(record) != RM_XACT_ID) && (XLogRecGetRmid(record) != RM_BARRIER_ID))
+#else
/* Otherwise we only consider stopping before COMMIT or ABORT records. */
if (XLogRecGetRmid(record) != RM_XACT_ID)
+#endif
return false;
xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
#include <unistd.h>
+#include "miscadmin.h"
+ #include "access/timeline.h"
#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "access/xlogutils.h"
* routines to support running postgres in 'bootstrap' mode
* bootstrap mode is used to create the initial template database
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* IDENTIFICATION
* src/backend/bootstrap/bootstrap.c
include $(top_builddir)/src/Makefile.global
OBJS = catalog.o dependency.o heap.o index.o indexing.o namespace.o aclchk.o \
- objectaccess.o objectaddress.o pg_aggregate.o pg_collation.o \
+ objectaccess.o objectaddress.o partition.o pg_aggregate.o pg_collation.o \
pg_constraint.o pg_conversion.o \
pg_depend.o pg_enum.o pg_inherits.o pg_largeobject.o pg_namespace.o \
- pg_operator.o pg_proc.o pg_range.o pg_db_role_setting.o pg_shdepend.o \
- pg_type.o pgxc_class.o storage.o toasting.o
+ pg_operator.o pg_proc.o pg_publication.o pg_range.o \
+ pg_db_role_setting.o pg_shdepend.o pg_subscription.o pg_type.o \
- storage.o toasting.o
++ pgxc_class.o storage.o toasting.o
BKIFILES = postgres.bki postgres.description postgres.shdescription
pg_ts_config.h pg_ts_config_map.h pg_ts_dict.h \
pg_ts_parser.h pg_ts_template.h pg_extension.h \
pg_foreign_data_wrapper.h pg_foreign_server.h pg_user_mapping.h \
+ pgxc_class.h pgxc_node.h pgxc_group.h \
pg_foreign_table.h pg_policy.h pg_replication_origin.h \
pg_default_acl.h pg_init_privs.h pg_seclabel.h pg_shseclabel.h \
- pg_collation.h pg_range.h pg_transform.h \
+ pg_collation.h pg_partitioned_table.h pg_range.h pg_transform.h \
+ pg_sequence.h pg_publication.h pg_publication_rel.h pg_subscription.h \
+ pg_subscription_rel.h toasting.h indexing.h \
toasting.h indexing.h \
)
* bits of hard-wired knowledge
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "catalog/pg_shdepend.h"
#include "catalog/pg_shdescription.h"
#include "catalog/pg_shseclabel.h"
+ #include "catalog/pg_subscription.h"
#include "catalog/pg_tablespace.h"
#include "catalog/toasting.h"
+#include "catalog/pgxc_node.h"
+#include "catalog/pgxc_group.h"
#include "miscadmin.h"
#include "storage/fd.h"
#include "utils/fmgroids.h"
relationId == SharedDependRelationId ||
relationId == SharedSecLabelRelationId ||
relationId == TableSpaceRelationId ||
+#ifdef PGXC
+ relationId == PgxcGroupRelationId ||
+ relationId == PgxcNodeRelationId ||
+#endif
relationId == DbRoleSettingRelationId ||
- relationId == ReplicationOriginRelationId)
+ relationId == ReplicationOriginRelationId ||
+ relationId == SubscriptionRelationId)
return true;
/* These are their indexes (see indexing.h) */
if (relationId == AuthIdRolnameIndexId ||
relationId == SharedSecLabelObjectIndexId ||
relationId == TablespaceOidIndexId ||
relationId == TablespaceNameIndexId ||
+#ifdef PGXC
+ relationId == PgxcNodeNodeNameIndexId ||
+ relationId == PgxcNodeNodeIdIndexId ||
+ relationId == PgxcNodeOidIndexId ||
+ relationId == PgxcGroupGroupNameIndexId ||
+ relationId == PgxcGroupOidIndexId ||
+#endif
relationId == DbRoleSettingDatidRolidIndexId ||
relationId == ReplicationOriginIdentIndex ||
- relationId == ReplicationOriginNameIndex)
+ relationId == ReplicationOriginNameIndex ||
+ relationId == SubscriptionObjectIndexId ||
+ relationId == SubscriptionNameIndexId)
return true;
/* These are their toast tables and toast indexes (see toasting.h) */
if (relationId == PgShdescriptionToastTable ||
* Routines to support inter-object dependencies.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* IDENTIFICATION
* src/backend/catalog/dependency.c
UserMappingRelationId, /* OCLASS_USER_MAPPING */
DefaultAclRelationId, /* OCLASS_DEFACL */
ExtensionRelationId, /* OCLASS_EXTENSION */
+#ifdef PGXC
+ PgxcClassRelationId, /* OCLASS_PGXCCLASS */
+ PgxcNodeRelationId, /* OCLASS_PGXC_NODE */
+ PgxcGroupRelationId, /* OCLASS_PGXC_GROUP */
+#endif
EventTriggerRelationId, /* OCLASS_EVENT_TRIGGER */
PolicyRelationId, /* OCLASS_POLICY */
+ PublicationRelationId, /* OCLASS_PUBLICATION */
+ PublicationRelRelationId, /* OCLASS_PUBLICATION_REL */
+ SubscriptionRelationId, /* OCLASS_SUBSCRIPTION */
TransformRelationId /* OCLASS_TRANSFORM */
};
-
static void findDependentObjects(const ObjectAddress *object,
+ int objflags,
int flags,
ObjectAddressStack *stack,
ObjectAddresses *targetObjects,
heap_close(depRel, RowExclusiveLock);
}
- /*
- * deleteWhatDependsOn: attempt to drop everything that depends on the
- * specified object, though not the object itself. Behavior is always
- * CASCADE.
- *
- * This is currently used only to clean out the contents of a schema
- * (namespace): the passed object is a namespace. We normally want this
- * to be done silently, so there's an option to suppress NOTICE messages.
- *
- * Note we don't fire object drop event triggers here; it would be wrong to do
- * so for the current only use of this function, but if more callers are added
- * this might need to be reconsidered.
- */
- void
- deleteWhatDependsOn(const ObjectAddress *object,
- bool showNotices)
- {
- Relation depRel;
- ObjectAddresses *targetObjects;
- int i;
-
- /*
- * We save some cycles by opening pg_depend just once and passing the
- * Relation pointer down to all the recursive deletion steps.
- */
- depRel = heap_open(DependRelationId, RowExclusiveLock);
-
- /*
- * Acquire deletion lock on the target object. (Ideally the caller has
- * done this already, but many places are sloppy about it.)
- */
- AcquireDeletionLock(object, 0);
-
- /*
- * Construct a list of objects to delete (ie, the given object plus
- * everything directly or indirectly dependent on it).
- */
- targetObjects = new_object_addresses();
-
- findDependentObjects(object,
- DEPFLAG_ORIGINAL,
- NULL, /* empty stack */
- targetObjects,
- NULL, /* no pendingObjects */
- &depRel);
-
- /*
- * Check if deletion is allowed, and report about cascaded deletes.
- */
- reportDependentObjects(targetObjects,
- DROP_CASCADE,
- showNotices ? NOTICE : DEBUG2,
- object);
-
- /*
- * Delete all the objects in the proper order, except we skip the original
- * object.
- */
- for (i = 0; i < targetObjects->numrefs; i++)
- {
- ObjectAddress *thisobj = targetObjects->refs + i;
- ObjectAddressExtra *thisextra = targetObjects->extras + i;
-
- if (thisextra->flags & DEPFLAG_ORIGINAL)
- continue;
-
- /*
- * Since this function is currently only used to clean out temporary
- * schemas, we pass PERFORM_DELETION_INTERNAL here, indicating that
- * the operation is an automatic system operation rather than a user
- * action. If, in the future, this function is used for other
- * purposes, we might need to revisit this.
- */
- deleteOneObject(thisobj, &depRel, PERFORM_DELETION_INTERNAL);
- }
-
- /* And clean up */
- free_object_addresses(targetObjects);
-
- heap_close(depRel, RowExclusiveLock);
- }
-
+#ifdef PGXC
+/*
+ * doRename
+ * Check the type and class of the given dependent object and, when it is
+ * a sequence affected by the rename, rename it on GTM as well.
+ */
+static void
+doRename(const ObjectAddress *object, const char *oldname, const char *newname)
+{
+ switch (getObjectClass(object))
+ {
+ case OCLASS_CLASS:
+ {
+ char relKind = get_rel_relkind(object->objectId);
+
+ /*
+ * If we are here, a schema is being renamed and a sequence
+ * depends on it.  As sequences' global names use the schema
+ * name, this sequence also has to be renamed on GTM.
+ * GTM interaction is done only from the local Coordinator.
+ */
+ if (relKind == RELKIND_SEQUENCE &&
+ IS_PGXC_LOCAL_COORDINATOR)
+ {
+ Relation relseq = relation_open(object->objectId, AccessShareLock);
+ char *seqname = GetGlobalSeqName(relseq, NULL, oldname);
+ char *newseqname = GetGlobalSeqName(relseq, NULL, newname);
+
+ /* We also need to rename this sequence on GTM, it has a global name ! */
+ if (RenameSequenceGTM(seqname, newseqname) < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("GTM error, could not rename sequence")));
+
+
+ pfree(seqname);
+ pfree(newseqname);
+
+ relation_close(relseq, AccessShareLock);
+ }
+ /* Explicit break: do not fall through into default */
+ break;
+ }
+ default:
+ /* Nothing to do, this object does not have to be renamed. */
+ break;
+ }
+}
+
+/*
+ * performRename: used to rename objects
+ * on GTM depending on another object(s)
+ */
+void
+performRename(const ObjectAddress *object, const char *oldname, const char *newname)
+{
+ Relation depRel;
+ ObjectAddresses *targetObjects;
+ int i;
+
+ /*
+ * Check the dependencies on this object
+ * And rename object dependent if necessary
+ */
+
+ depRel = heap_open(DependRelationId, RowExclusiveLock);
+
+ targetObjects = new_object_addresses();
+
+ findDependentObjects(object,
+ DEPFLAG_ORIGINAL,
++ 0, /* XXX seems like flags are only used while
++ dropping objects */
+ NULL, /* empty stack */
+ targetObjects,
+ NULL,
+ &depRel);
+
+ /* Check Objects one by one to see if some of them have to be renamed on GTM */
+ for (i = 0; i < targetObjects->numrefs; i++)
+ {
+ ObjectAddress *thisobj = targetObjects->refs + i;
+ doRename(thisobj, oldname, newname);
+ }
+
+ /* And clean up */
+ free_object_addresses(targetObjects);
+
+ heap_close(depRel, RowExclusiveLock);
+}
+#endif
+
/*
* findDependentObjects - find all objects that depend on 'object'
*
heap_drop_with_catalog(object->objectId);
}
+ /*
+ * for a sequence, in addition to dropping the heap, also
+ * delete pg_sequence tuple
+ */
+ if (relKind == RELKIND_SEQUENCE)
+ DeleteSequenceTuple(object->objectId);
+#ifdef PGXC
+ /*
+ * Do not do extra process if this session is connected to a remote
+ * Coordinator.
+ */
+ if (IsConnFromCoord())
+ break;
+
+ /*
+ * This session is connected directly to application, so extra
+ * process related to remote nodes and GTM is needed.
+ */
+ switch (relKind)
+ {
+ case RELKIND_SEQUENCE:
+ /*
+ * Drop the sequence on GTM.
+ * Sequence is dropped on GTM by a remote Coordinator only
+ * for a non temporary sequence.
+ */
+ {
+ /*
+ * The sequence has already been removed from Coordinator,
+ * finish the stuff on GTM too
+ */
+
+ Relation relseq;
+ char *seqname;
+ /*
+ * Open the relation to get the schema and database names, as
+ * that information is not otherwise available at this point.
+ */
+ relseq = relation_open(object->objectId, AccessShareLock);
+ seqname = GetGlobalSeqName(relseq, NULL, NULL);
+ DropSequenceGTM(seqname, GTM_SEQ_FULL_NAME);
+ pfree(seqname);
+
+ /* Then close the relation opened previously */
+ relation_close(relseq, AccessShareLock);
+ }
+ break;
+ case RELKIND_RELATION:
+ case RELKIND_VIEW:
+ break;
+ default:
+ break;
+ }
+#endif /* PGXC */
break;
}
DropTransformById(object->objectId);
break;
- default:
- elog(ERROR, "unrecognized object class: %u",
- object->classId);
+ /*
+ * These global object types are not supported here.
+ */
+ case OCLASS_ROLE:
+ case OCLASS_DATABASE:
+ case OCLASS_TBLSPACE:
+ case OCLASS_SUBSCRIPTION:
++ case OCLASS_PGXC_NODE:
++ case OCLASS_PGXC_GROUP:
+ elog(ERROR, "global objects cannot be deleted by doDeletion");
+ break;
+
+ /*
+ * There's intentionally no default: case here; we want the
+ * compiler to warn if a new OCLASS hasn't been handled above.
+ */
++
}
}
* heap.c
* code to create and destroy POSTGRES heap relations
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
static FormData_pg_attribute a7 = {
0, {"tableoid"}, OIDOID, 0, sizeof(Oid),
TableOidAttributeNumber, 0, -1, -1,
- true, 'p', 'i', true, false, false, true, 0
+ true, 'p', 'i', true, false, '\0', false, true, 0
};
+#ifdef PGXC
+/*
+ * In XC we need some sort of node identification for each tuple.
+ * We are adding another system column that serves as the node identifier.
+ * This is not only required by WHERE CURRENT OF, but can be used anywhere
+ * we want to know the originating Datanode of a tuple received at the
+ * Coordinator.
+ */
+static FormData_pg_attribute a8 = {
+ 0, {"xc_node_id"}, INT4OID, 0, sizeof(int32),
+ XC_NodeIdAttributeNumber, 0, -1, -1,
+ true, 'p', 'i', true, false, false, true, 0
+};
+
+static const Form_pg_attribute SysAtt[] = {&a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8};
+#else
static const Form_pg_attribute SysAtt[] = {&a1, &a2, &a3, &a4, &a5, &a6, &a7};
+#endif
/*
* This function returns a Form_pg_attribute pointer for a system attribute.
* and implementing search-path-controlled searches.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
static void NamespaceCallback(Datum arg, int cacheid, uint32 hashvalue);
static bool MatchNamedCall(HeapTuple proctup, int nargs, List *argnames,
int **argnumbers);
+#ifdef XCP
+static void FindTemporaryNamespace(void);
+#endif
- /* These don't really need to appear in any header file */
- Datum pg_table_is_visible(PG_FUNCTION_ARGS);
- Datum pg_type_is_visible(PG_FUNCTION_ARGS);
- Datum pg_function_is_visible(PG_FUNCTION_ARGS);
- Datum pg_operator_is_visible(PG_FUNCTION_ARGS);
- Datum pg_opclass_is_visible(PG_FUNCTION_ARGS);
- Datum pg_opfamily_is_visible(PG_FUNCTION_ARGS);
- Datum pg_collation_is_visible(PG_FUNCTION_ARGS);
- Datum pg_conversion_is_visible(PG_FUNCTION_ARGS);
- Datum pg_ts_parser_is_visible(PG_FUNCTION_ARGS);
- Datum pg_ts_dict_is_visible(PG_FUNCTION_ARGS);
- Datum pg_ts_template_is_visible(PG_FUNCTION_ARGS);
- Datum pg_ts_config_is_visible(PG_FUNCTION_ARGS);
- Datum pg_my_temp_schema(PG_FUNCTION_ARGS);
- Datum pg_is_other_temp_schema(PG_FUNCTION_ARGS);
-
/*
* RangeVarGetRelid
break;
}
- default:
- appendStringInfo(&buffer, "unrecognized object %u %u %d",
- object->classId,
- object->objectId,
- object->objectSubId);
- break;
+ case OCLASS_SUBSCRIPTION:
+ {
+ appendStringInfo(&buffer, _("subscription %s"),
+ get_subscription_name(object->objectId));
+ break;
+ }
+
+ case OCLASS_TRANSFORM:
+ {
+ HeapTuple trfTup;
+ Form_pg_transform trfForm;
+
+ trfTup = SearchSysCache1(TRFOID,
+ ObjectIdGetDatum(object->objectId));
+ if (!HeapTupleIsValid(trfTup))
+ elog(ERROR, "could not find tuple for transform %u",
+ object->objectId);
+
+ trfForm = (Form_pg_transform) GETSTRUCT(trfTup);
+
+ appendStringInfo(&buffer, _("transform for %s language %s"),
+ format_type_be(trfForm->trftype),
+ get_language_name(trfForm->trflang, false));
+
+ ReleaseSysCache(trfTup);
+ break;
+ }
+
++ case OCLASS_PGXC_NODE:
++ {
++ appendStringInfo(&buffer, _("node %s"),
++ get_pgxc_nodename(object->objectId));
++ break;
++ }
++
++ case OCLASS_PGXC_GROUP:
++ {
++ appendStringInfo(&buffer, _("node group %s"),
++ get_pgxc_groupname(object->objectId));
++ break;
++ }
++
+ /*
+ * There's intentionally no default: case here; we want the
+ * compiler to warn if a new OCLASS hasn't been handled above.
+ */
}
return buffer.data;
appendStringInfoString(&buffer, "transform");
break;
- case OCLASS_AM:
- appendStringInfoString(&buffer, "access method");
++ case OCLASS_PGXC_CLASS:
++ appendStringInfoString(&buffer, "pgxc_class");
+ break;
+
- default:
- appendStringInfo(&buffer, "unrecognized %u", object->classId);
++ case OCLASS_PGXC_NODE:
++ appendStringInfoString(&buffer, "node");
++ break;
++
++ case OCLASS_PGXC_GROUP:
++ appendStringInfoString(&buffer, "node group");
+ break;
++
+ /*
+ * There's intentionally no default: case here; we want the
+ * compiler to warn if a new OCLASS hasn't been handled above.
+ */
}
return buffer.data;
heap_close(transformDesc, AccessShareLock);
}
break;
- case OCLASS_AM:
++
++ case OCLASS_PGXC_CLASS:
++ /*
++ * XXX PG10MERGE: ISTM that we don't record dependencies on
++ * pgxc_class, pgxc_node and pgxc_group. So it's not clear if we
++ * really need corresponding OCLASS_* either. We should check this
++ * in more detail.
++ */
++ break;
+
- char *amname;
++ case OCLASS_PGXC_NODE:
+ {
- amname = get_am_name(object->objectId);
- if (!amname)
- elog(ERROR, "cache lookup failed for access method %u",
- object->objectId);
- appendStringInfoString(&buffer, quote_identifier(amname));
++ char *nodename;
+
- *objname = list_make1(amname);
++ nodename = get_pgxc_nodename(object->objectId);
+ if (objname)
- break;
++ *objname = list_make1(nodename);
++ appendStringInfoString(&buffer,
++ quote_identifier(nodename));
++ break;
+ }
- default:
- appendStringInfo(&buffer, "unrecognized object %u %u %d",
- object->classId,
- object->objectId,
- object->objectSubId);
- break;
+
++ case OCLASS_PGXC_GROUP:
++ {
++ char *groupname;
++
++ groupname = get_pgxc_groupname(object->objectId);
++ if (objname)
++ *objname = list_make1(groupname);
++ appendStringInfoString(&buffer,
++ quote_identifier(groupname));
++ break;
++ }
+
+ /*
+ * There's intentionally no default: case here; we want the
+ * compiler to warn if a new OCLASS hasn't been handled above.
+ */
}
/*
* pg_proc.c
* routines to support manipulation of the pg_proc relation
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
+ #include "utils/regproc.h"
#include "utils/rel.h"
#include "utils/syscache.h"
+#ifdef PGXC
+#include "pgxc/execRemote.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/planner.h"
+#endif
- Datum fmgr_internal_validator(PG_FUNCTION_ARGS);
- Datum fmgr_c_validator(PG_FUNCTION_ARGS);
- Datum fmgr_sql_validator(PG_FUNCTION_ARGS);
-
typedef struct
{
char *proname;
querytree_list = NIL;
foreach(lc, raw_parsetree_list)
{
- Node *parsetree = (Node *) lfirst(lc);
+ RawStmt *parsetree = lfirst_node(RawStmt, lc);
List *querytree_sublist;
+#ifdef PGXC
+ /* Block CTAS in SQL functions */
+ if (IsA(parsetree, CreateTableAsStmt))
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("In XC, SQL functions cannot contain utility statements")));
+#endif
+
querytree_sublist = pg_analyze_and_rewrite_params(parsetree,
prosrc,
(ParserSetupHook) sql_fn_parser_setup,
--- /dev/null
- (void) simple_heap_insert(pgxcclassrel, htup);
-
- CatalogUpdateIndexes(pgxcclassrel, htup);
+/*-------------------------------------------------------------------------
+ *
+ * pgxc_class.c
+ * routines to support manipulation of the pgxc_class relation
+ *
+ * Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "catalog/dependency.h"
+#include "catalog/indexing.h"
+#include "catalog/namespace.h"
+#include "catalog/pg_type.h"
+#include "catalog/pgxc_class.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+#include "pgxc/locator.h"
+#include "utils/array.h"
+
+/*
+ * PgxcClassCreate
+ * Create a pgxc_class entry
+ *
+ * pcrelid - OID of the relation the entry describes; must be valid.
+ * pclocatortype - distribution (locator) type stored for the relation.
+ * pcattnum, pchashalgorithm, pchashbuckets - only stored when the
+ * locator type is HASH or MODULO; left as zero datums otherwise.
+ * nodes / numnodes - OIDs of the nodes the relation is distributed on,
+ * stored in the catalog as an oidvector.
+ */
+void
+PgxcClassCreate(Oid pcrelid,
+ char pclocatortype,
+ int pcattnum,
+ int pchashalgorithm,
+ int pchashbuckets,
+ int numnodes,
+ Oid *nodes)
+{
+ Relation pgxcclassrel;
+ HeapTuple htup;
+ bool nulls[Natts_pgxc_class];
+ Datum values[Natts_pgxc_class];
+ int i;
+ oidvector *nodes_array;
+
+ /* Build array of Oids to be inserted */
+ nodes_array = buildoidvector(nodes, numnodes);
+
+ /* Iterate through attributes initializing nulls and values */
+ for (i = 0; i < Natts_pgxc_class; i++)
+ {
+ nulls[i] = false;
+ values[i] = (Datum) 0;
+ }
+
+ /* should not happen */
+ if (pcrelid == InvalidOid)
+ {
+ elog(ERROR,"pgxc class relid invalid.");
+ /* NOTE(review): elog(ERROR) does not return, so this return is unreachable */
+ return;
+ }
+
+ values[Anum_pgxc_class_pcrelid - 1] = ObjectIdGetDatum(pcrelid);
+ values[Anum_pgxc_class_pclocatortype - 1] = CharGetDatum(pclocatortype);
+
+ /* Distribution-column info is only meaningful for HASH/MODULO locators */
+ if (pclocatortype == LOCATOR_TYPE_HASH || pclocatortype == LOCATOR_TYPE_MODULO)
+ {
+ values[Anum_pgxc_class_pcattnum - 1] = UInt16GetDatum(pcattnum);
+ values[Anum_pgxc_class_pchashalgorithm - 1] = UInt16GetDatum(pchashalgorithm);
+ values[Anum_pgxc_class_pchashbuckets - 1] = UInt16GetDatum(pchashbuckets);
+ }
+
+ /* Node information */
+ values[Anum_pgxc_class_nodes - 1] = PointerGetDatum(nodes_array);
+
+ /* Open the relation for insertion */
+ pgxcclassrel = heap_open(PgxcClassRelationId, RowExclusiveLock);
+
+ htup = heap_form_tuple(pgxcclassrel->rd_att, values, nulls);
+
+ /*
+ * NOTE(review): the two removed lines below reference rel/oldtup/newtup,
+ * which belong to PgxcClassAlter's hunk — presumably merge-conflict
+ * spillover; verify the resolved patch applies cleanly.
+ */
- simple_heap_update(rel, &oldtup->t_self, newtup);
- CatalogUpdateIndexes(rel, newtup);
++ CatalogTupleInsert(pgxcclassrel, htup);
+
+ heap_close(pgxcclassrel, RowExclusiveLock);
+}
+
+
+/*
+ * PgxcClassAlter
+ * Modify a pgxc_class entry with given data
+ *
+ * Which catalog fields get replaced is driven by 'type':
+ * PGXC_CLASS_ALTER_DISTRIBUTION - locator type and distribution-column
+ * fields only;
+ * PGXC_CLASS_ALTER_NODES - node list only;
+ * PGXC_CLASS_ALTER_ALL (and any unrecognized value, via the fall-through
+ * default) - every field.
+ * Remaining parameters supply the new values for whichever fields are
+ * selected for replacement.
+ */
+void
+PgxcClassAlter(Oid pcrelid,
+ char pclocatortype,
+ int pcattnum,
+ int pchashalgorithm,
+ int pchashbuckets,
+ int numnodes,
+ Oid *nodes,
+ PgxcClassAlterType type)
+{
+ Relation rel;
+ HeapTuple oldtup, newtup;
+ oidvector *nodes_array;
+ Datum new_record[Natts_pgxc_class];
+ bool new_record_nulls[Natts_pgxc_class];
+ bool new_record_repl[Natts_pgxc_class];
+
+ Assert(OidIsValid(pcrelid));
+
+ /* oldtup is a palloc'd copy, safe to scribble on / pass to heap_modify_tuple */
+ rel = heap_open(PgxcClassRelationId, RowExclusiveLock);
+ oldtup = SearchSysCacheCopy1(PGXCCLASSRELID,
+ ObjectIdGetDatum(pcrelid));
+
+ if (!HeapTupleIsValid(oldtup)) /* should not happen */
+ elog(ERROR, "cache lookup failed for pgxc_class %u", pcrelid);
+
+ /*
+ * Build array of Oids to be inserted.
+ * NOTE(review): built unconditionally, even for operation types that do
+ * not replace the node list; the unused array is just leaked into the
+ * current memory context.
+ */
+ nodes_array = buildoidvector(nodes, numnodes);
+
+ /* Initialize fields */
+ MemSet(new_record, 0, sizeof(new_record));
+ MemSet(new_record_nulls, false, sizeof(new_record_nulls));
+ MemSet(new_record_repl, false, sizeof(new_record_repl));
+
+ /* Fields are updated depending on operation type */
+ switch (type)
+ {
+ case PGXC_CLASS_ALTER_DISTRIBUTION:
+ new_record_repl[Anum_pgxc_class_pclocatortype - 1] = true;
+ new_record_repl[Anum_pgxc_class_pcattnum - 1] = true;
+ new_record_repl[Anum_pgxc_class_pchashalgorithm - 1] = true;
+ new_record_repl[Anum_pgxc_class_pchashbuckets - 1] = true;
+ break;
+ case PGXC_CLASS_ALTER_NODES:
+ new_record_repl[Anum_pgxc_class_nodes - 1] = true;
+ break;
+ case PGXC_CLASS_ALTER_ALL:
+ default:
+ new_record_repl[Anum_pgxc_class_pcrelid - 1] = true;
+ new_record_repl[Anum_pgxc_class_pclocatortype - 1] = true;
+ new_record_repl[Anum_pgxc_class_pcattnum - 1] = true;
+ new_record_repl[Anum_pgxc_class_pchashalgorithm - 1] = true;
+ new_record_repl[Anum_pgxc_class_pchashbuckets - 1] = true;
+ new_record_repl[Anum_pgxc_class_nodes - 1] = true;
+ }
+
+ /* Set up new fields */
+ /* Relation Oid */
+ if (new_record_repl[Anum_pgxc_class_pcrelid - 1])
+ new_record[Anum_pgxc_class_pcrelid - 1] = ObjectIdGetDatum(pcrelid);
+
+ /* Locator type */
+ if (new_record_repl[Anum_pgxc_class_pclocatortype - 1])
+ new_record[Anum_pgxc_class_pclocatortype - 1] = CharGetDatum(pclocatortype);
+
+ /* Attribute number of distribution column */
+ if (new_record_repl[Anum_pgxc_class_pcattnum - 1])
+ new_record[Anum_pgxc_class_pcattnum - 1] = UInt16GetDatum(pcattnum);
+
+ /* Hash algorithm type */
+ if (new_record_repl[Anum_pgxc_class_pchashalgorithm - 1])
+ new_record[Anum_pgxc_class_pchashalgorithm - 1] = UInt16GetDatum(pchashalgorithm);
+
+ /* Hash buckets */
+ if (new_record_repl[Anum_pgxc_class_pchashbuckets - 1])
+ new_record[Anum_pgxc_class_pchashbuckets - 1] = UInt16GetDatum(pchashbuckets);
+
+ /* Node information */
+ if (new_record_repl[Anum_pgxc_class_nodes - 1])
+ new_record[Anum_pgxc_class_nodes - 1] = PointerGetDatum(nodes_array);
+
+ /* Update relation */
+ newtup = heap_modify_tuple(oldtup, RelationGetDescr(rel),
+ new_record,
+ new_record_nulls, new_record_repl);
++ CatalogTupleUpdate(rel, &oldtup->t_self, newtup);
+
+ heap_close(rel, RowExclusiveLock);
+}
+
+/*
+ * RemovePgxcClass()
+ * Remove extended PGXC information for the relation with OID 'pcrelid'.
+ *
+ * Deletes the relation's pgxc_class tuple; errors out if no such
+ * tuple exists (callers are expected to pass a distributed relation).
+ */
+void
+RemovePgxcClass(Oid pcrelid)
+{
+ Relation relation;
+ HeapTuple tup;
+
+ /*
+ * Delete the pgxc_class tuple.
+ */
+ relation = heap_open(PgxcClassRelationId, RowExclusiveLock);
+ /* NOTE(review): old-style call; SearchSysCache1() is the modern idiom */
+ tup = SearchSysCache(PGXCCLASSRELID,
+ ObjectIdGetDatum(pcrelid),
+ 0, 0, 0);
+
+ if (!HeapTupleIsValid(tup)) /* should not happen */
+ elog(ERROR, "cache lookup failed for pgxc_class %u", pcrelid);
+
+ simple_heap_delete(relation, &tup->t_self);
+
+ ReleaseSysCache(tup);
+
+ heap_close(relation, RowExclusiveLock);
+}
* storage.c
* code to create and destroy physical storage for relations
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* analyze.c
* the Postgres statistics generator
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
*
* PostgreSQL object comments utility code.
*
- * Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ * Copyright (c) 1996-2017, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/commands/comment.c
* copy.c
* Implements the COPY utility command
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "miscadmin.h"
#include "optimizer/clauses.h"
#include "optimizer/planner.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "pgxc/execRemote.h"
+#include "pgxc/locator.h"
+#include "pgxc/remotecopy.h"
+#include "nodes/nodes.h"
+#include "pgxc/poolmgr.h"
+#include "catalog/pgxc_node.h"
+#endif
#include "nodes/makefuncs.h"
+#include "optimizer/pgxcship.h"
+ #include "parser/parse_relation.h"
#include "rewrite/rewriteHandler.h"
#include "storage/fd.h"
#include "tcop/tcopprot.h"
{
COPY_FILE, /* to/from file (or a piped program) */
COPY_OLD_FE, /* to/from frontend (2.0 protocol) */
- COPY_NEW_FE /* to/from frontend (3.0 protocol) */
- COPY_NEW_FE, /* to/from frontend (3.0 protocol) */
++ COPY_NEW_FE, /* to/from frontend (3.0 protocol) */
+#ifdef PGXC
- ,COPY_BUFFER /* Do not send, just prepare */
++ COPY_BUFFER, /* Do not send, just prepare */
+#endif
+ COPY_CALLBACK /* to/from callback function */
} CopyDest;
/*
/* Dump the accumulated row as one CopyData message */
(void) pq_putmessage('d', fe_msgbuf->data, fe_msgbuf->len);
break;
+#ifdef PGXC
+ case COPY_BUFFER:
+ /* Do not send anywhere yet, just return */
+ return;
+#endif
+ case COPY_CALLBACK:
+ Assert(false); /* Not yet supported. */
+ break;
}
resetStringInfo(fe_msgbuf);
bytesread += avail;
}
break;
+#ifdef PGXC
+ case COPY_BUFFER:
+ elog(ERROR, "COPY_BUFFER not allowed in this context");
+ break;
+#endif
+ case COPY_CALLBACK:
+ bytesread = cstate->data_source_cb(databuf, minread, maxread);
+ break;
}
return bytesread;
PreventCommandIfReadOnly("COPY FROM");
PreventCommandIfParallelMode("COPY FROM");
- cstate = BeginCopyFrom(rel, stmt->filename, stmt->is_program,
- stmt->attlist, stmt->options);
- cstate->range_table = range_table;
+ cstate = BeginCopyFrom(pstate, rel, stmt->filename, stmt->is_program,
+ NULL, stmt->attlist, stmt->options);
*processed = CopyFrom(cstate); /* copy from file to database */
+#ifdef XCP
+ /*
+ * We should record insert to distributed table.
+ * Bulk inserts into local tables are recorded when heap tuples are
+ * written.
+ */
+ if (IS_PGXC_COORDINATOR && rel->rd_locator_info)
+ pgstat_count_remote_insert(rel, (int) *processed);
+#endif
EndCopyFrom(cstate);
}
else
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("table \"%s\" does not have OIDs",
RelationGetRelationName(cstate->rel))));
+
+ /* Initialize state for CopyFrom tuple routing. */
+ if (is_from && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+ {
+ PartitionDispatch *partition_dispatch_info;
+ ResultRelInfo *partitions;
+ TupleConversionMap **partition_tupconv_maps;
+ TupleTableSlot *partition_tuple_slot;
+ int num_parted,
+ num_partitions;
+
+ ExecSetupPartitionTupleRouting(rel,
+ &partition_dispatch_info,
+ &partitions,
+ &partition_tupconv_maps,
+ &partition_tuple_slot,
+ &num_parted, &num_partitions);
+ cstate->partition_dispatch_info = partition_dispatch_info;
+ cstate->num_dispatch = num_parted;
+ cstate->partitions = partitions;
+ cstate->num_partitions = num_partitions;
+ cstate->partition_tupconv_maps = partition_tupconv_maps;
+ cstate->partition_tuple_slot = partition_tuple_slot;
+ }
+#ifdef PGXC
+ /* Get copy statement and execution node information */
+ if (IS_PGXC_COORDINATOR)
+ {
+ RemoteCopyData *remoteCopyState = (RemoteCopyData *) palloc0(sizeof(RemoteCopyData));
+ List *attnums = CopyGetAttnums(tupDesc, cstate->rel, attnamelist);
+
+ /* Setup correct COPY FROM/TO flag */
+ remoteCopyState->is_from = is_from;
+
+ /* Get execution node list */
+ RemoteCopy_GetRelationLoc(remoteCopyState,
+ cstate->rel,
+ attnums);
+ /* Build remote query */
+ RemoteCopy_BuildStatement(remoteCopyState,
+ cstate->rel,
+ GetRemoteCopyOptions(cstate),
+ attnamelist,
+ attnums);
+
+ /* Then assign built structure */
+ cstate->remoteCopyState = remoteCopyState;
+ }
+#endif
}
else
{
errmsg("multi-statement DO INSTEAD rules are not supported for COPY")));
}
- query = (Query *) linitial(rewritten);
+ query = linitial_node(Query, rewritten);
- /* The grammar allows SELECT INTO, but we don't support that */
- if (query->utilityStmt != NULL &&
- IsA(query->utilityStmt, CreateTableAsStmt))
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("COPY (SELECT INTO) is not supported")));
-
Assert(query->utilityStmt == NULL);
/*
* tuples inserted by an INSERT command.
*/
processed++;
+
+ if (saved_resultRelInfo)
+ {
+ resultRelInfo = saved_resultRelInfo;
+ estate->es_result_relation_info = resultRelInfo;
+ }
}
+#ifdef PGXC
+ }
+#endif
}
/* Flush any remaining buffered tuples */
Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
values[defmap[i]] = ExecEvalExpr(defexprs[i], econtext,
- &nulls[defmap[i]], NULL);
+ &nulls[defmap[i]]);
}
+#ifdef PGXC
+ if (IS_PGXC_COORDINATOR)
+ {
+ /* Append default values to the data-row in output format. */
+ append_defvals(values, cstate);
+ }
+#endif
+
return true;
}
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("option \"%s\" cannot be specified with other options",
- dtablespace->defname)));
+ dtablespace->defname),
+ parser_errposition(pstate, dtablespace->location)));
/* this case isn't allowed within a transaction block */
- PreventTransactionChain(isTopLevel, "ALTER DATABASE SET TABLESPACE");
+#ifdef PGXC
+ /* ... but we allow it on remote nodes */
+ if (IS_PGXC_LOCAL_COORDINATOR)
+#endif
+ PreventTransactionChain(isTopLevel, "ALTER DATABASE SET TABLESPACE");
+
movedb(stmt->dbname, defGetString(dtablespace));
return InvalidOid;
}
case OCLASS_USER_MAPPING:
case OCLASS_DEFACL:
case OCLASS_EXTENSION:
+#ifdef PGXC
+ case OCLASS_PGXC_CLASS:
+ case OCLASS_PGXC_NODE:
+ case OCLASS_PGXC_GROUP:
+#endif
case OCLASS_POLICY:
- case OCLASS_AM:
+ case OCLASS_PUBLICATION:
+ case OCLASS_PUBLICATION_REL:
+ case OCLASS_SUBSCRIPTION:
+ case OCLASS_TRANSFORM:
return true;
+
+ /*
+ * There's intentionally no default: case here; we want the
+ * compiler to warn if a new OCLASS hasn't been handled above.
+ */
}
- return true;
+ /* Shouldn't get here, but if we do, say "no support" */
+ return false;
}
bool
* explain.c
* Explain query execution plans
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994-5, Regents of the University of California
*
* IDENTIFICATION
/* planner will not cope with utility statements */
if (query->commandType == CMD_UTILITY)
{
- ExplainOneUtility(query->utilityStmt, into, es, queryString, params,
- queryEnv);
+ /*
+ * If we are running EXPLAIN ANALYZE, transform the CTAS such that the
+ * target table is created first and select result is inserted into the
+ * table. The EXPLAIN ANALYZE would really just show the plan for the
+ * INSERT INTO generated by QueryRewriteCTAS, but that's OK.
+ */
+ if (es->analyze && IsA(query->utilityStmt, CreateTableAsStmt))
+ {
+ List *rewritten = QueryRewriteCTAS(query);
+ Assert(list_length(rewritten) == 1);
- ExplainOneQuery((Query *) linitial(rewritten), into, es,
- queryString, params);
++ ExplainOneQuery((Query *) linitial(rewritten), cursorOptions,
++ into, es, queryString, params, queryEnv);
+ }
+ else
+ ExplainOneUtility(query->utilityStmt, into, es,
- queryString, params);
++ queryString, params, queryEnv);
return;
}
* parsetree. We must fully execute each query before beginning parse
* analysis on the next one, since there may be interdependencies.
*/
- foreach(lc1, raw_parsetree_list)
+ forboth(lc1, raw_parsetree_list, lc3, querysource_list)
{
- Node *parsetree = (Node *) lfirst(lc1);
+ RawStmt *parsetree = lfirst_node(RawStmt, lc1);
+ char *querysource = (char *) lfirst(lc3);
List *stmt_list;
ListCell *lc2;
+ /* Be sure parser can see any DDL done so far */
+ CommandCounterIncrement();
+
stmt_list = pg_analyze_and_rewrite(parsetree,
- sql,
+ querysource,
NULL,
- 0);
+ 0,
+ NULL);
stmt_list = pg_plan_queries(stmt_list, CURSOR_OPT_PARALLEL_OK, NULL);
foreach(lc2, stmt_list)
{
QueryDesc *qdesc;
- qdesc = CreateQueryDesc((PlannedStmt *) stmt,
+ qdesc = CreateQueryDesc(stmt,
- sql,
+ querysource,
GetActiveSnapshot(), NULL,
- dest, NULL, 0);
+ dest, NULL, NULL, 0);
ExecutorStart(qdesc, 0);
- ExecutorRun(qdesc, ForwardScanDirection, 0);
+ ExecutorRun(qdesc, ForwardScanDirection, 0, true);
ExecutorFinish(qdesc);
ExecutorEnd(qdesc);
}
else
{
+ if (IsA(stmt->utilityStmt, TransactionStmt))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("transaction control statements are not allowed within an extension script")));
+
ProcessUtility(stmt,
- sql,
+ querysource,
PROCESS_UTILITY_QUERY,
NULL,
+ NULL,
dest,
+#ifdef PGXC
+ true, /* this is created at remote node level */
+#endif /* PGXC */
NULL);
}
csstmt->authrole = NULL; /* will be created by current user */
csstmt->schemaElts = NIL;
csstmt->if_not_exists = false;
- #ifdef PGXC
- CreateSchemaCommand(csstmt, NULL, true);
- #else
- CreateSchemaCommand(csstmt, NULL);
- #endif
+ CreateSchemaCommand(csstmt, "(generated CREATE SCHEMA command)",
- -1, -1);
++ true, -1, -1);
/*
* CreateSchemaCommand includes CommandCounterIncrement, so new
/* Ensure creation schema is the one given in IMPORT statement */
cstmt->base.relation->schemaname = pstrdup(stmt->local_schema);
+ /* No planning needed, just make a wrapper PlannedStmt */
+ pstmt = makeNode(PlannedStmt);
+ pstmt->commandType = CMD_UTILITY;
+ pstmt->canSetTag = false;
+ pstmt->utilityStmt = (Node *) cstmt;
+ pstmt->stmt_location = rs->stmt_location;
+ pstmt->stmt_len = rs->stmt_len;
+
/* Execute statement */
- ProcessUtility((Node *) cstmt,
+ ProcessUtility(pstmt,
cmd,
- PROCESS_UTILITY_SUBCOMMAND, NULL, NULL,
- None_Receiver, NULL);
+ PROCESS_UTILITY_SUBCOMMAND, NULL,
++ NULL,
+ None_Receiver,
- #ifdef XCP
+ false,
- #endif
+ NULL);
/* Be sure to advance the command counter between subcommands */
CommandCounterIncrement();
* indexcmds.c
* POSTGRES define and remove index code.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
#include "executor/executor.h"
#include "executor/spi.h"
#include "miscadmin.h"
+#ifdef PGXC
+#include "nodes/makefuncs.h"
+#endif
#include "parser/parse_relation.h"
+ #include "pgstat.h"
#include "rewrite/rewriteHandler.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
* storage management for portals (but doesn't run any queries in them).
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
*/
portal = CreatePortal(cstmt->portalname, false, false);
+#ifdef PGXC
+ /*
+ * Consume the command id of the command creating the cursor
+ */
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ GetCurrentCommandId(true);
+#endif
+
oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(portal));
- stmt = copyObject(stmt);
- stmt->utilityStmt = NULL; /* make it look like plain SELECT */
+ plan = copyObject(plan);
queryString = pstrdup(queryString);
* Create the CachedPlanSource before we do parse analysis, since it needs
* to see the unmodified raw parse tree.
*/
- plansource = CreateCachedPlan(stmt->query, queryString,
+ plansource = CreateCachedPlan(rawstmt, queryString,
+#ifdef PGXC
+ stmt->name,
+#endif
CreateCommandTag(stmt->query));
/* Transform list of TypeNames to array of type OIDs */
* schemacmds.c
* schema creation/manipulation commands
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
/*
* CREATE SCHEMA
+ *
+ * Note: caller should pass in location information for the whole
+ * CREATE SCHEMA statement, which in turn we pass down as the location
+ * of the component commands. This comports with our general plan of
+ * reporting location/len for the whole command even when executing
+ * a subquery.
*/
Oid
- #ifdef PGXC
- CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString, bool sentToRemote)
- #else
- CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString)
- #endif
+ CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString,
++ bool sentToRemote,
+ int stmt_location, int stmt_len)
{
const char *schemaName = stmt->schemaname;
Oid namespaceId;
queryString,
PROCESS_UTILITY_SUBCOMMAND,
NULL,
+ NULL,
None_Receiver,
+#ifdef PGXC
+ true,
+#endif /* PGXC */
NULL);
+
/* make sure later steps can see the object created here */
CommandCounterIncrement();
}
* sequence.c
* PostgreSQL sequences support code.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
*-------------------------------------------------------------------------
*/
#include "postgres.h"
++#include <math.h>
+ #include "access/bufmask.h"
#include "access/htup_details.h"
#include "access/multixact.h"
#include "access/transam.h"
#include "utils/lsyscache.h"
#include "utils/resowner.h"
#include "utils/syscache.h"
+#include "commands/dbcommands.h"
+
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+/* PGXC_COORD */
+#include "access/gtm.h"
+#include "utils/memutils.h"
+#ifdef XCP
+#include "utils/timestamp.h"
+#endif
+#endif
+ #include "utils/varlena.h"
-
/*
* We don't want to log each fetching of a value from a sequence,
* so we pre-log a few fetches in advance. In the event of
int64 cached; /* last value already cached for nextval */
/* if last != cached, we have not used up all the cached values */
int64 increment; /* copy of sequence's increment field */
- /* note that increment is zero until we first do read_seq_tuple() */
+ /* note that increment is zero until we first do nextval_internal() */
+#ifdef XCP
+ TimestampTz last_call_time; /* the time when the last call as made */
+ int64 range_multiplier; /* multiply this value with 2 next time */
+#endif
} SeqTableData;
typedef SeqTableData *SeqTable;
static SeqTableData *last_used_seq = NULL;
static void fill_seq_with_data(Relation rel, HeapTuple tuple);
- static int64 nextval_internal(Oid relid);
- static Relation open_share_lock(SeqTable seq);
+ static Relation lock_and_open_sequence(SeqTable seq);
static void create_seq_hashtable(void);
static void init_sequence(Oid relid, SeqTable *p_elm, Relation *p_rel);
- static Form_pg_sequence read_seq_tuple(SeqTable elm, Relation rel,
- Buffer *buf, HeapTuple seqtuple);
- #ifdef PGXC
- static void init_params(List *options, bool isInit,
- Form_pg_sequence new, List **owned_by, bool *is_restart);
- #else
- static void init_params(List *options, bool isInit,
- Form_pg_sequence new, List **owned_by);
- #endif
+ static Form_pg_sequence_data read_seq_tuple(Relation rel,
+ Buffer *buf, HeapTuple seqdatatuple);
+ static LOCKMODE alter_sequence_get_lock_level(List *options);
+ static void init_params(ParseState *pstate, List *options, bool for_identity,
+ bool isInit,
+ Form_pg_sequence seqform,
+ bool *changed_seqform,
- Form_pg_sequence_data seqdataform, List **owned_by);
++ Form_pg_sequence_data seqdataform, List **owned_by,
++ bool *is_restart);
static void do_setval(Oid relid, int64 next, bool iscalled);
- static void process_owned_by(Relation seqrel, List *owned_by);
+ static void process_owned_by(Relation seqrel, List *owned_by, bool for_identity);
/*
TupleDesc tupDesc;
Datum value[SEQ_COL_LASTCOL];
bool null[SEQ_COL_LASTCOL];
+ Datum pgs_values[Natts_pg_sequence];
+ bool pgs_nulls[Natts_pg_sequence];
int i;
- NameData name;
+#ifdef PGXC /* PGXC_COORD */
+ GTM_Sequence start_value = 1;
+ GTM_Sequence min_value = 1;
+ GTM_Sequence max_value = InvalidSequenceValue;
+ GTM_Sequence increment = 1;
+ bool cycle = false;
+ bool is_restart;
+#endif
/* Unlogged sequences are not implemented -- not clear if useful. */
if (seq->sequence->relpersistence == RELPERSISTENCE_UNLOGGED)
}
/* Check and set all option values */
- #ifdef PGXC
- init_params(seq->options, true, &new, &owned_by, &is_restart);
- #else
- init_params(seq->options, true, &new, &owned_by);
- #endif
- init_params(pstate, seq->options, seq->for_identity, true, &seqform, &changed_seqform, &seqdataform, &owned_by);
++ init_params(pstate, seq->options, seq->for_identity, true, &seqform,
++ &changed_seqform, &seqdataform, &owned_by, &is_restart);
/*
* Create relation (and fill value[] and null[] for the tuple)
heap_close(rel, NoLock);
+ /* fill in pg_sequence */
+ rel = heap_open(SequenceRelationId, RowExclusiveLock);
+ tupDesc = RelationGetDescr(rel);
+
+ memset(pgs_nulls, 0, sizeof(pgs_nulls));
+
+ pgs_values[Anum_pg_sequence_seqrelid - 1] = ObjectIdGetDatum(seqoid);
+ pgs_values[Anum_pg_sequence_seqtypid - 1] = ObjectIdGetDatum(seqform.seqtypid);
+ pgs_values[Anum_pg_sequence_seqstart - 1] = Int64GetDatumFast(seqform.seqstart);
+ pgs_values[Anum_pg_sequence_seqincrement - 1] = Int64GetDatumFast(seqform.seqincrement);
+ pgs_values[Anum_pg_sequence_seqmax - 1] = Int64GetDatumFast(seqform.seqmax);
+ pgs_values[Anum_pg_sequence_seqmin - 1] = Int64GetDatumFast(seqform.seqmin);
+ pgs_values[Anum_pg_sequence_seqcache - 1] = Int64GetDatumFast(seqform.seqcache);
+ pgs_values[Anum_pg_sequence_seqcycle - 1] = BoolGetDatum(seqform.seqcycle);
+
+ tuple = heap_form_tuple(tupDesc, pgs_values, pgs_nulls);
+ CatalogTupleInsert(rel, tuple);
+
+ heap_freetuple(tuple);
+ heap_close(rel, RowExclusiveLock);
+
+#ifdef PGXC /* PGXC_COORD */
+ /*
+ * Remote Coordinator is in charge of creating sequence in GTM.
+ * If sequence is temporary, it is not necessary to create it on GTM.
+ */
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ char *seqname = GetGlobalSeqName(rel, NULL, NULL);
+
+ /* We also need to create it on the GTM */
+ if (CreateSequenceGTM(seqname,
+ increment,
+ min_value,
+ max_value,
+ start_value, cycle) < 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("GTM error, could not create sequence")));
+ }
+
+
+ pfree(seqname);
+ }
+#endif
return address;
}
SeqTable elm;
Relation seqrel;
Buffer buf;
- HeapTupleData seqtuple;
- Form_pg_sequence seq;
- FormData_pg_sequence new;
+ HeapTupleData seqdatatuple;
+ Form_pg_sequence seqform;
+ Form_pg_sequence_data seqdata;
+ FormData_pg_sequence_data newseqdata;
+ bool changed_seqform = false;
List *owned_by;
+#ifdef PGXC
+ GTM_Sequence start_value;
+ GTM_Sequence last_value;
+ GTM_Sequence min_value;
+ GTM_Sequence max_value;
+ GTM_Sequence increment;
+ bool cycle;
+ bool is_restart;
+#endif
ObjectAddress address;
+ Relation rel;
+ HeapTuple tuple;
/* Open and lock sequence. */
- relid = RangeVarGetRelid(stmt->sequence, AccessShareLock, stmt->missing_ok);
+ relid = RangeVarGetRelid(stmt->sequence,
+ alter_sequence_get_lock_level(stmt->options),
+ stmt->missing_ok);
if (relid == InvalidOid)
{
ereport(NOTICE,
aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS,
stmt->sequence->relname);
- /* lock page' buffer and read tuple into new sequence structure */
- seq = read_seq_tuple(elm, seqrel, &buf, &seqtuple);
+ rel = heap_open(SequenceRelationId, RowExclusiveLock);
+ tuple = SearchSysCacheCopy1(SEQRELID,
+ ObjectIdGetDatum(relid));
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for sequence %u",
+ relid);
+
+ seqform = (Form_pg_sequence) GETSTRUCT(tuple);
- /* Copy old values of options into workspace */
- memcpy(&new, seq, sizeof(FormData_pg_sequence));
+ /* lock page's buffer and read tuple into new sequence structure */
+ seqdata = read_seq_tuple(seqrel, &buf, &seqdatatuple);
+
+ /* Copy old sequence data into workspace */
+ memcpy(&newseqdata, seqdata, sizeof(FormData_pg_sequence_data));
/* Check and set new values */
- #ifdef PGXC
- init_params(stmt->options, false, &new, &owned_by, &is_restart);
- #else
- init_params(stmt->options, false, &new, &owned_by);
- #endif
- init_params(pstate, stmt->options, stmt->for_identity, false, seqform, &changed_seqform, &newseqdata, &owned_by);
++ init_params(pstate, stmt->options, stmt->for_identity, false, seqform,
++ &changed_seqform, &newseqdata, &owned_by, &is_restart);
/* Clear local cache so that we don't think we have cached numbers */
/* Note that we do not change the currval() state */
GetTopTransactionId();
/* Now okay to update the on-disk tuple */
- increment = new.increment_by;
- min_value = new.min_value;
- max_value = new.max_value;
- start_value = new.start_value;
- last_value = new.last_value;
- cycle = new.is_cycled;
+#ifdef PGXC
++ increment = seqform->seqincrement;
++ min_value = seqform->seqmin;
++ max_value = seqform->seqmax;
++ start_value = seqform->seqstart;
++ last_value = elm->last;
++ cycle = seqform->seqcycle;
+#endif
+
START_CRIT_SECTION();
- memcpy(seq, &new, sizeof(FormData_pg_sequence));
+ memcpy(seqdata, &newseqdata, sizeof(FormData_pg_sequence_data));
MarkBufferDirty(buf);
ObjectAddressSet(address, RelationRelationId, relid);
+ if (changed_seqform)
+ CatalogTupleUpdate(rel, &tuple->t_self, tuple);
+ heap_close(rel, RowExclusiveLock);
+
relation_close(seqrel, NoLock);
+#ifdef PGXC
+ /*
+ * Remote Coordinator is in charge of creating the sequence in GTM.
+ * If sequence is temporary, no need to go through GTM.
+ */
+ if (IS_PGXC_LOCAL_COORDINATOR && seqrel->rd_backend != MyBackendId)
+ {
+ char *seqname = GetGlobalSeqName(seqrel, NULL, NULL);
+
+ /* We also need to create it on the GTM */
+ if (AlterSequenceGTM(seqname,
+ increment,
+ min_value,
+ max_value,
+ start_value,
+ last_value,
+ cycle,
+ is_restart) < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("GTM error, could not alter sequence")));
+ pfree(seqname);
+ }
+#endif
return address;
}
return elm->last;
}
+ pgstuple = SearchSysCache1(SEQRELID, ObjectIdGetDatum(relid));
+ if (!HeapTupleIsValid(pgstuple))
+ elog(ERROR, "cache lookup failed for sequence %u", relid);
+ pgsform = (Form_pg_sequence) GETSTRUCT(pgstuple);
+ incby = pgsform->seqincrement;
+ maxv = pgsform->seqmax;
+ minv = pgsform->seqmin;
+ cache = pgsform->seqcache;
+ cycle = pgsform->seqcycle;
+ ReleaseSysCache(pgstuple);
+
/* lock page' buffer and read tuple */
- seq = read_seq_tuple(elm, seqrel, &buf, &seqtuple);
+ seq = read_seq_tuple(seqrel, &buf, &seqdatatuple);
page = BufferGetPage(buf);
- int64 range = seq->cache_value; /* how many values to ask from GTM? */
+ {
- fetch = cache = seq->cache_value;
++ int64 range = cache; /* how many values to ask from GTM? */
+ int64 rangemax; /* the max value returned from the GTM for our request */
+ char *seqname = GetGlobalSeqName(seqrel, NULL, NULL);
+
+ /*
+ * Above, we still use the page as a locking mechanism to handle
+ * concurrency
+ *
+ * If the user has set a CACHE parameter, we use that. Else we pass in
+ * the SequenceRangeVal value
+ */
+ if (range == DEFAULT_CACHEVAL && SequenceRangeVal > range)
+ {
+ TimestampTz curtime = GetCurrentTimestamp();
+
+ if (!TimestampDifferenceExceeds(elm->last_call_time,
+ curtime, 1000))
+ {
+ /*
+ * The previous GetNextValGTM call was made just a while back.
+ * Request double the range of what was requested in the
+ * earlier call. Honor the SequenceRangeVal boundary
+ * value to limit very large range requests!
+ */
+ elm->range_multiplier *= 2;
+ if (elm->range_multiplier < SequenceRangeVal)
+ range = elm->range_multiplier;
+ else
+ elm->range_multiplier = range = SequenceRangeVal;
+
+ elog(DEBUG1, "increase sequence range %ld", range);
+ }
+ else if (TimestampDifferenceExceeds(elm->last_call_time,
+ curtime, 5000))
+ {
+ /* The previous GetNextValGTM call was pretty old */
+ range = elm->range_multiplier = DEFAULT_CACHEVAL;
+ elog(DEBUG1, "reset sequence range %ld", range);
+ }
+ else if (TimestampDifferenceExceeds(elm->last_call_time,
+ curtime, 3000))
+ {
+ /*
+ * The previous GetNextValGTM call was made quite some time
+ * ago. Try to reduce the range request to reduce the gap
+ */
+ if (elm->range_multiplier != DEFAULT_CACHEVAL)
+ {
+ range = elm->range_multiplier =
+ rint(elm->range_multiplier/2);
+ elog(DEBUG1, "decrease sequence range %ld", range);
+ }
+ }
+ else
+ {
+ /*
+ * Current range_multiplier allows caching sequence values
+ * for 1-3 seconds of work. Keep that rate.
+ */
+ range = elm->range_multiplier;
+ }
+ elm->last_call_time = curtime;
+ }
+
+ result = (int64) GetNextValGTM(seqname, range, &rangemax);
+ pfree(seqname);
+
+ /* Update the on-disk data */
+ seq->last_value = result; /* last fetched number */
+ seq->is_called = true;
+
+ /* save info in local cache */
+ elm->last = result; /* last returned number */
+ elm->cached = rangemax; /* last fetched range max limit */
+ elm->last_valid = true;
+
+ last_used_seq = elm;
+ }
+
+ elm->increment = incby;
+ last = next = result = seq->last_value;
+ fetch = cache;
log = seq->log_cnt;
if (!seq->is_called)
bufm, bufx)));
}
- /* Set the currval() state only if iscalled = true */
- if (iscalled)
{
- elm->last = next; /* last returned number */
- elm->last_valid = true;
- }
+ char *seqname = GetGlobalSeqName(seqrel, NULL, NULL);
- Page page = BufferGetPage(buf);
+ if (SetValGTM(seqname, next, iscalled) < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("GTM error, could not obtain sequence value")));
+ pfree(seqname);
+ /* Update the on-disk data */
+ seq->last_value = next; /* last fetched number */
+ seq->is_called = iscalled;
+ seq->log_cnt = (iscalled) ? 0 : 1;
+
+ if (iscalled)
+ {
+ elm->last = next; /* last returned number */
+ elm->last_valid = true;
+ }
++ }
+ /* In any case, forget any future cached numbers */
+ elm->cached = elm->last;
+
+ /* check the comment above nextval_internal()'s equivalent call. */
+ if (RelationNeedsWAL(seqrel))
+ GetTopTransactionId();
+
+ /* ready to change the on-disk (or really, in-buffer) tuple */
+ START_CRIT_SECTION();
+
+ seq->last_value = next; /* last fetched number */
+ seq->is_called = iscalled;
+ seq->log_cnt = 0;
+
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(seqrel))
+ {
+ xl_seq_rec xlrec;
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
+
+ xlrec.node = seqrel->rd_node;
+ XLogRegisterData((char *) &xlrec, sizeof(xl_seq_rec));
+ XLogRegisterData((char *) seqdatatuple.t_data, seqdatatuple.t_len);
+
+ recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG);
- PageSetLSN(page, recptr);
+ elm->cached = elm->last;
}
- END_CRIT_SECTION();
-
UnlockReleaseBuffer(buf);
relation_close(seqrel, NoLock);
elm->filenode = InvalidOid;
elm->lxid = InvalidLocalTransactionId;
elm->last_valid = false;
- elm->last = elm->cached = elm->increment = 0;
+#ifdef XCP
+ elm->last_call_time = 0;
+ elm->range_multiplier = DEFAULT_CACHEVAL;
+#endif
+ elm->last = elm->cached = 0;
}
/*
* otherwise, do not change existing options that aren't explicitly overridden.
*/
static void
- #ifdef PGXC
- init_params(List *options, bool isInit,
- Form_pg_sequence new, List **owned_by, bool *is_restart)
- #else
- init_params(List *options, bool isInit,
- Form_pg_sequence new, List **owned_by)
- #endif
+ init_params(ParseState *pstate, List *options, bool for_identity,
+ bool isInit,
+ Form_pg_sequence seqform,
+ bool *changed_seqform,
+ Form_pg_sequence_data seqdataform,
- List **owned_by)
++ List **owned_by,
++ bool *is_restart)
{
+ DefElem *as_type = NULL;
DefElem *start_value = NULL;
DefElem *restart_value = NULL;
DefElem *increment_by = NULL;
DefElem *cache_value = NULL;
DefElem *is_cycled = NULL;
ListCell *option;
+ bool reset_max_value = false;
+ bool reset_min_value = false;
+#ifdef PGXC
+ *is_restart = false;
+#endif
+
*owned_by = NIL;
foreach(option, options)
if (restart_value != NULL)
{
if (restart_value->arg != NULL)
- new->last_value = defGetInt64(restart_value);
+ seqdataform->last_value = defGetInt64(restart_value);
else
- new->last_value = new->start_value;
+ seqdataform->last_value = seqform->seqstart;
+#ifdef PGXC
+ *is_restart = true;
+#endif
- new->is_called = false;
- new->log_cnt = 0;
++ seqdataform->last_value = seqform->seqstart;
+ seqdataform->is_called = false;
+ seqdataform->log_cnt = 0;
}
else if (isInit)
{
errmsg("CACHE (%s) must be greater than zero",
buf)));
}
- new->log_cnt = 0;
+ seqdataform->log_cnt = 0;
}
else if (isInit)
- new->cache_value = 1;
+ {
+ seqform->seqcache = 1;
+ *changed_seqform = true;
+ }
}
+#ifdef PGXC
+/*
+ * GetGlobalSeqName
+ *
+ * Returns a global sequence name adapted to GTM
+ * Name format is dbname.schemaname.seqname
+ * so as to identify in a unique way in the whole cluster each sequence
+ *
+ * seqrel         - opened sequence relation whose global name is wanted
+ * new_seqname    - if non-NULL, used instead of seqrel's own relation name
+ *                  (for RENAME-style operations)
+ * new_schemaname - if non-NULL, used instead of seqrel's own schema name
+ *
+ * Returns a palloc'd string; caller is responsible for pfree'ing it.
+ */
+char *
+GetGlobalSeqName(Relation seqrel, const char *new_seqname, const char *new_schemaname)
+{
+ char *seqname, *dbname, *relname;
+ char namespace[NAMEDATALEN * 2];
+ int charlen;
+ /* rd_backend == MyBackendId marks a backend-local (temporary) sequence */
+ bool is_temp = seqrel->rd_backend == MyBackendId;
+ /* Get all the necessary relation names */
+ dbname = get_database_name(seqrel->rd_node.dbNode);
+
+ /*
+ * NOTE(review): get_database_name() can return NULL for an invalid OID;
+ * strlen(dbname) below would then crash, yet the pfree() at the end does
+ * check for NULL — the two should be made consistent. Confirm dbNode is
+ * always valid on this path.
+ */
+ if (new_seqname)
+ relname = (char *) new_seqname;
+ else
+ relname = RelationGetRelationName(seqrel);
+
+ if (!is_temp)
+ {
+ /*
+ * For a permanent sequence, use schema qualified name. That can
+ * uniquely identify the sequences.
+ */
+ char *schema = get_namespace_name(RelationGetNamespace(seqrel));
+ /*
+ * NOTE(review): plain sprintf into the fixed NAMEDATALEN*2 buffer;
+ * single schema names fit, but snprintf would be the defensive choice.
+ */
+ sprintf(namespace, "%s", new_schemaname ? new_schemaname : schema);
+ pfree(schema);
+ }
+ else
+ {
+ /*
+ * For temporary sequences, we use originating coordinator name and
+ * originating coordinator PID to qualify the sequence name. If we are
+ * running on the local coordinator, we can readily fetch that
+ * information from PGXCNodeName and MyProcPid, but when running on
+ * remote datanode, we must consult MyCoordName and MyProcPid to get
+ * the correct information.
+ */
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ sprintf(namespace, "%s.%d", PGXCNodeName, MyProcPid);
+ else
+ sprintf(namespace, "%s.%d", MyCoordName, MyCoordPid);
+ }
+
+ /* Calculate the global name size including the dots and \0 */
+ charlen = strlen(dbname) + strlen(namespace) + strlen(relname) + 3;
+ seqname = (char *) palloc(charlen);
+
+ /* Form a unique sequence name with schema and database name for GTM */
+ snprintf(seqname,
+ charlen,
+ "%s.%s.%s",
+ dbname,
+ namespace,
+ relname);
+
+ if (dbname)
+ pfree(dbname);
+
+ return seqname;
+}
+
+/*
+ * IsTempSequence
+ *
+ * Determine if given sequence is temporary or not.
+ *
+ * relid - OID of the sequence relation to test.
+ * Returns true when the sequence is backend-local (temporary).
+ */
+bool
+IsTempSequence(Oid relid)
+{
+ Relation seqrel;
+ bool res;
+ SeqTable elm;
+
+ /* open and AccessShareLock sequence */
+ init_sequence(relid, &elm, &seqrel);
+
+ /* Same backend-locality test used by GetGlobalSeqName() */
+ res = seqrel->rd_backend == MyBackendId;
+ /* Close but keep the lock until transaction end */
+ relation_close(seqrel, NoLock);
+ return res;
+}
+#endif
+
/*
* Process an OWNED BY option for CREATE/ALTER SEQUENCE
*
* tablecmds.c
* Commands for creating and altering table structures and settings
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
/* No command-specific prep needed */
pass = AT_PASS_MISC;
break;
+#ifdef PGXC
+ case AT_DistributeBy:
+ case AT_SubCluster:
+ case AT_AddNodeList:
+ case AT_DeleteNodeList:
+ ATSimplePermissions(rel, ATT_TABLE);
+ /* No command-specific prep needed */
+ pass = AT_PASS_DISTRIB;
+ break;
+#endif
+ case AT_AttachPartition:
+ case AT_DetachPartition:
+ ATSimplePermissions(rel, ATT_TABLE);
+ /* No command-specific prep needed */
+ pass = AT_PASS_MISC;
+ break;
default: /* oops */
elog(ERROR, "unrecognized alter table type: %d",
(int) cmd->subtype);
case AT_GenericOptions:
ATExecGenericOptions(rel, (List *) cmd->def);
break;
- #ifdef PGXC
+ case AT_DistributeBy:
+ AtExecDistributeBy(rel, (DistributeBy *) cmd->def);
+ break;
+ case AT_SubCluster:
+ AtExecSubCluster(rel, (PGXCSubCluster *) cmd->def);
+ break;
+ case AT_AddNodeList:
+ AtExecAddNode(rel, (List *) cmd->def);
+ break;
+ case AT_DeleteNodeList:
+ AtExecDeleteNode(rel, (List *) cmd->def);
+ break;
- #endif
+ case AT_AttachPartition:
+ ATExecAttachPartition(wqueue, rel, (PartitionCmd *) cmd->def);
+ break;
+ case AT_DetachPartition:
+ ATExecDetachPartition(rel, ((PartitionCmd *) cmd->def)->name);
+ break;
default: /* oops */
elog(ERROR, "unrecognized alter table type: %d",
(int) cmd->subtype);
{
AlteredTableInfo *tab = (AlteredTableInfo *) lfirst(ltab);
- /* Foreign tables have no storage. */
- if (tab->relkind == RELKIND_FOREIGN_TABLE)
+#ifdef PGXC
+ /* Forbid table rewrite operations with online data redistribution */
+ if (tab->rewrite &&
+ list_length(tab->subcmds[AT_PASS_DISTRIB]) > 0 &&
+ IS_PGXC_LOCAL_COORDINATOR)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ errmsg("Incompatible operation with data redistribution")));
+#endif
+
+ /* Foreign tables have no storage, nor do partitioned tables. */
+ if (tab->relkind == RELKIND_FOREIGN_TABLE ||
+ tab->relkind == RELKIND_PARTITIONED_TABLE)
continue;
/*
* relations, we can skip truncating ON COMMIT DELETE ROWS
* tables, as they must still be empty.
*/
- if (MyXactAccessedTempRel)
+#ifndef XCP
+ /*
+ * This optimization does not work in XL since temporary tables
+ * are handled differently in XL.
+ */
+ if ((MyXactFlags & XACT_FLAGS_ACCESSEDTEMPREL))
+#endif
oids_to_truncate = lappend_oid(oids_to_truncate, oc->relid);
break;
case ONCOMMIT_DROP:
ReleaseSysCache(tuple);
}
+#ifdef PGXC
+/*
+ * IsTempTable
+ *
+ * Check if given table Oid is temporary.
+ *
+ * Returns true when the relation's persistence is RELPERSISTENCE_TEMP.
+ */
+bool
+IsTempTable(Oid relid)
+{
+ Relation rel;
+ bool res;
+ /*
+ * PGXCTODO: Is it correct to open without locks?
+ * we just check if this table is temporary though...
+ */
+ rel = relation_open(relid, NoLock);
+ res = rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP;
+ relation_close(rel, NoLock);
+ return res;
+}
+
+/*
+ * IsLocalTempTable
+ *
+ * Like IsTempTable(), but additionally requires that the relation has no
+ * distribution (locator) info, i.e. it exists only on this node.
+ * NOTE(review): opens the relation with NoLock, same caveat as IsTempTable.
+ */
+bool
+IsLocalTempTable(Oid relid)
+{
+ Relation rel;
+ bool res;
+ rel = relation_open(relid, NoLock);
+ res = (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP &&
+ rel->rd_locator_info == NULL);
+ relation_close(rel, NoLock);
+ return res;
+}
+
+/*
+ * IsIndexUsingTempTable
+ *
+ * Check if given index relation uses temporary tables.
+ *
+ * relid - OID of the index relation.
+ * Returns true when the index's parent table is temporary; false when the
+ * OID is not an index (or the parent is permanent).
+ */
+bool
+IsIndexUsingTempTable(Oid relid)
+{
+ bool res = false;
+ HeapTuple tuple;
+ Oid parent_id = InvalidOid;
+
+ tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(relid));
+ if (HeapTupleIsValid(tuple))
+ {
+ Form_pg_index index = (Form_pg_index) GETSTRUCT(tuple);
+ parent_id = index->indrelid;
+
+ /* Release system cache BEFORE looking at the parent table */
+ ReleaseSysCache(tuple);
+
+ res = IsTempTable(parent_id);
+ }
+ else
+ res = false; /* Default case */
+
+ return res;
+}
+
+/*
+ * IsOnCommitActions
+ *
+ * Check if there are any on-commit actions activated.
+ * Reads the file-static on_commits list maintained by this module.
+ */
+bool
+IsOnCommitActions(void)
+{
+ return list_length(on_commits) > 0;
+}
+
+/*
+ * DropTableThrowErrorExternal
+ *
+ * Error interface for DROP when looking for execution node type.
+ *
+ * relation   - range variable naming the object that could not be found
+ * removeType - kind of object the DROP statement targeted
+ * missing_ok - when true, the underlying reporter emits a NOTICE rather
+ *              than an ERROR
+ *
+ * Maps the ObjectType to the corresponding relkind and delegates the
+ * actual message to DropErrorMsgNonExistent().
+ */
+void
+DropTableThrowErrorExternal(RangeVar *relation, ObjectType removeType, bool missing_ok)
+{
+ char relkind;
+
+ /* Determine required relkind */
+ switch (removeType)
+ {
+ case OBJECT_TABLE:
+ relkind = RELKIND_RELATION;
+ break;
+
+ case OBJECT_INDEX:
+ relkind = RELKIND_INDEX;
+ break;
+
+ case OBJECT_SEQUENCE:
+ relkind = RELKIND_SEQUENCE;
+ break;
+
+ case OBJECT_VIEW:
+ relkind = RELKIND_VIEW;
+ break;
+
+ case OBJECT_FOREIGN_TABLE:
+ relkind = RELKIND_FOREIGN_TABLE;
+ break;
+
+ default:
+ /* elog(ERROR) does not return; the assignment below is unreachable */
+ elog(ERROR, "unrecognized drop object type: %d",
+ (int) removeType);
+ relkind = 0; /* keep compiler quiet */
+ break;
+ }
+
+ DropErrorMsgNonExistent(relation, relkind, missing_ok);
+}
+#endif
++
+ /*
+ * Transform any expressions present in the partition key
+ *
+ * Returns a transformed PartitionSpec, as well as the strategy code
+ * (PARTITION_STRATEGY_LIST or PARTITION_STRATEGY_RANGE) through *strategy.
+ * Raises an error for an unknown strategy name, for a multi-column LIST
+ * key, or for a duplicated column name in the key.
+ */
+ static PartitionSpec *
+ transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy)
+ {
+ PartitionSpec *newspec;
+ ParseState *pstate;
+ RangeTblEntry *rte;
+ ListCell *l;
+
+ newspec = makeNode(PartitionSpec);
+
+ newspec->strategy = partspec->strategy;
+ newspec->partParams = NIL;
+ newspec->location = partspec->location;
+
+ /* Parse partitioning strategy name */
+ if (pg_strcasecmp(partspec->strategy, "list") == 0)
+ *strategy = PARTITION_STRATEGY_LIST;
+ else if (pg_strcasecmp(partspec->strategy, "range") == 0)
+ *strategy = PARTITION_STRATEGY_RANGE;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized partitioning strategy \"%s\"",
+ partspec->strategy)));
+
+ /* Check valid number of columns for strategy */
+ if (*strategy == PARTITION_STRATEGY_LIST &&
+ list_length(partspec->partParams) != 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("cannot use \"list\" partition strategy with more than one column")));
+
+ /*
+ * Create a dummy ParseState and insert the target relation as its sole
+ * rangetable entry. We need a ParseState for transformExpr.
+ */
+ pstate = make_parsestate(NULL);
+ rte = addRangeTableEntryForRelation(pstate, rel, NULL, false, true);
+ addRTEtoQuery(pstate, rte, true, true, true);
+
+ /* take care of any partition expressions */
+ foreach(l, partspec->partParams)
+ {
+ PartitionElem *pelem = castNode(PartitionElem, lfirst(l));
+ ListCell *lc;
+
+ /* Check for PARTITION BY ... (foo, foo) */
+ foreach(lc, newspec->partParams)
+ {
+ PartitionElem *pparam = castNode(PartitionElem, lfirst(lc));
+
+ /* only name-vs-name duplicates are detectable here; expression
+ * duplicates are not compared */
+ if (pelem->name && pparam->name &&
+ strcmp(pelem->name, pparam->name) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_DUPLICATE_COLUMN),
+ errmsg("column \"%s\" appears more than once in partition key",
+ pelem->name),
+ parser_errposition(pstate, pelem->location)));
+ }
+
+ if (pelem->expr)
+ {
+ /* Copy, to avoid scribbling on the input */
+ pelem = copyObject(pelem);
+
+ /* Now do parse transformation of the expression */
+ pelem->expr = transformExpr(pstate, pelem->expr,
+ EXPR_KIND_PARTITION_EXPRESSION);
+
+ /* we have to fix its collations too */
+ assign_expr_collations(pstate, pelem->expr);
+ }
+
+ newspec->partParams = lappend(newspec->partParams, pelem);
+ }
+
+ return newspec;
+ }
+
+ /*
+ * Compute per-partition-column information from a list of PartitionElems.
+ * Expressions in the PartitionElems must be parse-analyzed already.
+ *
+ * Output arrays/lists (caller-allocated, one slot per key column):
+ *   partattrs     - attribute number of each key column, 0 for expressions
+ *   partexprs     - list of expression trees for expression key columns
+ *   partopclass   - resolved btree operator class OID per column
+ *   partcollation - collation OID per column (InvalidOid if none)
+ */
+ static void
+ ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs,
+ List **partexprs, Oid *partopclass, Oid *partcollation)
+ {
+ int attn;
+ ListCell *lc;
+
+ attn = 0;
+ foreach(lc, partParams)
+ {
+ PartitionElem *pelem = castNode(PartitionElem, lfirst(lc));
+ Oid atttype;
+ Oid attcollation;
+
+ if (pelem->name != NULL)
+ {
+ /* Simple attribute reference */
+ HeapTuple atttuple;
+ Form_pg_attribute attform;
+
+ atttuple = SearchSysCacheAttName(RelationGetRelid(rel),
+ pelem->name);
+ if (!HeapTupleIsValid(atttuple))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_COLUMN),
+ errmsg("column \"%s\" named in partition key does not exist",
+ pelem->name)));
+ attform = (Form_pg_attribute) GETSTRUCT(atttuple);
+
+ if (attform->attnum <= 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("cannot use system column \"%s\" in partition key",
+ pelem->name)));
+
+ partattrs[attn] = attform->attnum;
+ atttype = attform->atttypid;
+ attcollation = attform->attcollation;
+ ReleaseSysCache(atttuple);
+ }
+ else
+ {
+ /* Expression */
+ Node *expr = pelem->expr;
+
+ Assert(expr != NULL);
+ atttype = exprType(expr);
+ attcollation = exprCollation(expr);
+
+ /*
+ * Strip any top-level COLLATE clause. This ensures that we treat
+ * "x COLLATE y" and "(x COLLATE y)" alike.
+ */
+ while (IsA(expr, CollateExpr))
+ expr = (Node *) ((CollateExpr *) expr)->arg;
+
+ if (IsA(expr, Var) &&
+ ((Var *) expr)->varattno > 0)
+ {
+ /*
+ * User wrote "(column)" or "(column COLLATE something)".
+ * Treat it like simple attribute anyway.
+ */
+ partattrs[attn] = ((Var *) expr)->varattno;
+ }
+ else
+ {
+ Bitmapset *expr_attrs = NULL;
+ int i;
+
+ partattrs[attn] = 0; /* marks the column as expression */
+ *partexprs = lappend(*partexprs, expr);
+
+ /*
+ * Try to simplify the expression before checking for
+ * mutability. The main practical value of doing it in this
+ * order is that an inline-able SQL-language function will be
+ * accepted if its expansion is immutable, whether or not the
+ * function itself is marked immutable.
+ *
+ * Note that expression_planner does not change the passed in
+ * expression destructively and we have already saved the
+ * expression to be stored into the catalog above.
+ */
+ expr = (Node *) expression_planner((Expr *) expr);
+
+ /*
+ * Partition expression cannot contain mutable functions,
+ * because a given row must always map to the same partition
+ * as long as there is no change in the partition boundary
+ * structure.
+ */
+ if (contain_mutable_functions(expr))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("functions in partition key expression must be marked IMMUTABLE")));
+
+ /*
+ * transformPartitionSpec() should have already rejected
+ * subqueries, aggregates, window functions, and SRFs, based
+ * on the EXPR_KIND_ for partition expressions.
+ */
+
+ /*
+ * Cannot have expressions containing whole-row references or
+ * system column references.
+ */
+ pull_varattnos(expr, 1, &expr_attrs);
+ if (bms_is_member(0 - FirstLowInvalidHeapAttributeNumber,
+ expr_attrs))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("partition key expressions cannot contain whole-row references")));
+ for (i = FirstLowInvalidHeapAttributeNumber; i < 0; i++)
+ {
+ if (bms_is_member(i - FirstLowInvalidHeapAttributeNumber,
+ expr_attrs))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("partition key expressions cannot contain system column references")));
+ }
+
+ /*
+ * While it is not exactly *wrong* for a partition expression
+ * to be a constant, it seems better to reject such keys.
+ */
+ if (IsA(expr, Const))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("cannot use constant expression as partition key")));
+ }
+ }
+
+ /*
+ * Apply collation override if any
+ */
+ if (pelem->collation)
+ attcollation = get_collation_oid(pelem->collation, false);
+
+ /*
+ * Check we have a collation iff it's a collatable type. The only
+ * expected failures here are (1) COLLATE applied to a noncollatable
+ * type, or (2) partition expression had an unresolved collation. But
+ * we might as well code this to be a complete consistency check.
+ */
+ if (type_is_collatable(atttype))
+ {
+ if (!OidIsValid(attcollation))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDETERMINATE_COLLATION),
+ errmsg("could not determine which collation to use for partition expression"),
+ errhint("Use the COLLATE clause to set the collation explicitly.")));
+ }
+ else
+ {
+ if (OidIsValid(attcollation))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("collations are not supported by type %s",
+ format_type_be(atttype))));
+ }
+
+ partcollation[attn] = attcollation;
+
+ /*
+ * Identify a btree opclass to use. Currently, we use only btree
+ * operators, which seems enough for list and range partitioning.
+ */
+ if (!pelem->opclass)
+ {
+ partopclass[attn] = GetDefaultOpClass(atttype, BTREE_AM_OID);
+
+ if (!OidIsValid(partopclass[attn]))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("data type %s has no default btree operator class",
+ format_type_be(atttype)),
+ errhint("You must specify a btree operator class or define a default btree operator class for the data type.")));
+ }
+ else
+ partopclass[attn] = ResolveOpClass(pelem->opclass,
+ atttype,
+ "btree",
+ BTREE_AM_OID);
+
+ attn++;
+ }
+ }
+
+ /*
+ * ALTER TABLE <name> ATTACH PARTITION <partition-name> FOR VALUES
+ *
+ * Return the address of the newly attached partition.
+ *
+ * wqueue - ALTER TABLE work queue; validation scans for the attached
+ *          table (or its leaf partitions) are appended here unless the
+ *          existing constraints prove the partition predicate.
+ * rel    - the partitioned parent, already locked by the caller.
+ * cmd    - carries the partition's RangeVar (cmd->name) and bound spec
+ *          (cmd->bound).
+ */
+ static ObjectAddress
+ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd)
+ {
+ PartitionKey key = RelationGetPartitionKey(rel);
+ Relation attachRel,
+ catalog;
+ List *childrels;
+ TupleConstr *attachRel_constr;
+ List *partConstraint,
+ *existConstraint;
+ SysScanDesc scan;
+ ScanKeyData skey;
+ AttrNumber attno;
+ int natts;
+ TupleDesc tupleDesc;
+ bool skip_validate = false;
+ ObjectAddress address;
+
+ attachRel = heap_openrv(cmd->name, AccessExclusiveLock);
+
+ /*
+ * Must be owner of both parent and source table -- parent was checked by
+ * ATSimplePermissions call in ATPrepCmd
+ */
+ ATSimplePermissions(attachRel, ATT_TABLE | ATT_FOREIGN_TABLE);
+
+ /* A partition can only have one parent */
+ if (attachRel->rd_rel->relispartition)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is already a partition",
+ RelationGetRelationName(attachRel))));
+
+ if (OidIsValid(attachRel->rd_rel->reloftype))
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach a typed table as partition")));
+
+ /*
+ * Table being attached should not already be part of inheritance; either
+ * as a child table...
+ */
+ catalog = heap_open(InheritsRelationId, AccessShareLock);
+ ScanKeyInit(&skey,
+ Anum_pg_inherits_inhrelid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(RelationGetRelid(attachRel)));
+ scan = systable_beginscan(catalog, InheritsRelidSeqnoIndexId, true,
+ NULL, 1, &skey);
+ if (HeapTupleIsValid(systable_getnext(scan)))
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach inheritance child as partition")));
+ systable_endscan(scan);
+
+ /* ...or as a parent table (except the case when it is partitioned) */
+ ScanKeyInit(&skey,
+ Anum_pg_inherits_inhparent,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(RelationGetRelid(attachRel)));
+ scan = systable_beginscan(catalog, InheritsParentIndexId, true, NULL,
+ 1, &skey);
+ if (HeapTupleIsValid(systable_getnext(scan)) &&
+ attachRel->rd_rel->relkind == RELKIND_RELATION)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach inheritance parent as partition")));
+ systable_endscan(scan);
+ heap_close(catalog, AccessShareLock);
+
+ /*
+ * Prevent circularity by seeing if rel is a partition of attachRel. (In
+ * particular, this disallows making a rel a partition of itself.)
+ */
+ childrels = find_all_inheritors(RelationGetRelid(attachRel),
+ AccessShareLock, NULL);
+ if (list_member_oid(childrels, RelationGetRelid(rel)))
+ ereport(ERROR,
+ (errcode(ERRCODE_DUPLICATE_TABLE),
+ errmsg("circular inheritance not allowed"),
+ errdetail("\"%s\" is already a child of \"%s\".",
+ RelationGetRelationName(rel),
+ RelationGetRelationName(attachRel))));
+
+ /* Temp parent cannot have a partition that is itself not a temp */
+ if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP &&
+ attachRel->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach a permanent relation as partition of temporary relation \"%s\"",
+ RelationGetRelationName(rel))));
+
+ /* If the parent is temp, it must belong to this session */
+ if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP &&
+ !rel->rd_islocaltemp)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach as partition of temporary relation of another session")));
+
+ /* Ditto for the partition */
+ if (attachRel->rd_rel->relpersistence == RELPERSISTENCE_TEMP &&
+ !attachRel->rd_islocaltemp)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach temporary relation of another session as partition")));
+
+ /* If parent has OIDs then child must have OIDs */
+ if (rel->rd_rel->relhasoids && !attachRel->rd_rel->relhasoids)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach table \"%s\" without OIDs as partition of"
+ " table \"%s\" with OIDs", RelationGetRelationName(attachRel),
+ RelationGetRelationName(rel))));
+
+ /* OTOH, if parent doesn't have them, do not allow in attachRel either */
+ if (attachRel->rd_rel->relhasoids && !rel->rd_rel->relhasoids)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("cannot attach table \"%s\" with OIDs as partition of table"
+ " \"%s\" without OIDs", RelationGetRelationName(attachRel),
+ RelationGetRelationName(rel))));
+
+ /* Check if there are any columns in attachRel that aren't in the parent */
+ tupleDesc = RelationGetDescr(attachRel);
+ natts = tupleDesc->natts;
+ for (attno = 1; attno <= natts; attno++)
+ {
+ Form_pg_attribute attribute = tupleDesc->attrs[attno - 1];
+ char *attributeName = NameStr(attribute->attname);
+
+ /* Ignore dropped */
+ if (attribute->attisdropped)
+ continue;
+
+ /* Try to find the column in parent (matching on column name) */
+ if (!SearchSysCacheExists2(ATTNAME,
+ ObjectIdGetDatum(RelationGetRelid(rel)),
+ CStringGetDatum(attributeName)))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("table \"%s\" contains column \"%s\" not found in parent \"%s\"",
+ RelationGetRelationName(attachRel), attributeName,
+ RelationGetRelationName(rel)),
+ errdetail("New partition should contain only the columns present in parent.")));
+ }
+
+ /* OK to create inheritance. Rest of the checks performed there */
+ CreateInheritance(attachRel, rel);
+
+ /*
+ * Check that the new partition's bound is valid and does not overlap any
+ * of existing partitions of the parent - note that it does not return on
+ * error.
+ */
+ check_new_partition_bound(RelationGetRelationName(attachRel), rel,
+ cmd->bound);
+
+ /* Update the pg_class entry. */
+ StorePartitionBound(attachRel, rel, cmd->bound);
+
+ /*
+ * Generate partition constraint from the partition bound specification.
+ * If the parent itself is a partition, make sure to include its
+ * constraint as well.
+ */
+ partConstraint = list_concat(get_qual_from_partbound(attachRel, rel,
+ cmd->bound),
+ RelationGetPartitionQual(rel));
+ partConstraint = (List *) eval_const_expressions(NULL,
+ (Node *) partConstraint);
+ partConstraint = (List *) canonicalize_qual((Expr *) partConstraint);
+ partConstraint = list_make1(make_ands_explicit(partConstraint));
+
+ /*
+ * Check if we can do away with having to scan the table being attached to
+ * validate the partition constraint, by *proving* that the existing
+ * constraints of the table *imply* the partition predicate. We include
+ * the table's check constraints and NOT NULL constraints in the list of
+ * clauses passed to predicate_implied_by().
+ *
+ * There is a case in which we cannot rely on just the result of the
+ * proof.
+ */
+ attachRel_constr = tupleDesc->constr;
+ existConstraint = NIL;
+ if (attachRel_constr != NULL)
+ {
+ int num_check = attachRel_constr->num_check;
+ int i;
+ Bitmapset *not_null_attrs = NULL;
+ List *part_constr;
+ ListCell *lc;
+ bool partition_accepts_null = true;
+ int partnatts;
+
+ if (attachRel_constr->has_not_null)
+ {
+ int natts = attachRel->rd_att->natts;
+
+ /* Translate each attnotnull column into an explicit IS NOT NULL
+ * clause for the implication proof below */
+ for (i = 1; i <= natts; i++)
+ {
+ Form_pg_attribute att = attachRel->rd_att->attrs[i - 1];
+
+ if (att->attnotnull && !att->attisdropped)
+ {
+ NullTest *ntest = makeNode(NullTest);
+
+ ntest->arg = (Expr *) makeVar(1,
+ i,
+ att->atttypid,
+ att->atttypmod,
+ att->attcollation,
+ 0);
+ ntest->nulltesttype = IS_NOT_NULL;
+
+ /*
+ * argisrow=false is correct even for a composite column,
+ * because attnotnull does not represent a SQL-spec IS NOT
+ * NULL test in such a case, just IS DISTINCT FROM NULL.
+ */
+ ntest->argisrow = false;
+ ntest->location = -1;
+ existConstraint = lappend(existConstraint, ntest);
+ not_null_attrs = bms_add_member(not_null_attrs, i);
+ }
+ }
+ }
+
+ for (i = 0; i < num_check; i++)
+ {
+ Node *cexpr;
+
+ /*
+ * If this constraint hasn't been fully validated yet, we must
+ * ignore it here.
+ */
+ if (!attachRel_constr->check[i].ccvalid)
+ continue;
+
+ cexpr = stringToNode(attachRel_constr->check[i].ccbin);
+
+ /*
+ * Run each expression through const-simplification and
+ * canonicalization. It is necessary, because we will be
+ * comparing it to similarly-processed qual clauses, and may fail
+ * to detect valid matches without this.
+ */
+ cexpr = eval_const_expressions(NULL, cexpr);
+ cexpr = (Node *) canonicalize_qual((Expr *) cexpr);
+
+ existConstraint = list_concat(existConstraint,
+ make_ands_implicit((Expr *) cexpr));
+ }
+
+ existConstraint = list_make1(make_ands_explicit(existConstraint));
+
+ /* And away we go ... */
+ if (predicate_implied_by(partConstraint, existConstraint))
+ skip_validate = true;
+
+ /*
+ * We choose to err on the safer side, i.e., give up on skipping the
+ * validation scan, if the partition key column doesn't have the NOT
+ * NULL constraint and the table is to become a list partition that
+ * does not accept nulls. In this case, the partition predicate
+ * (partConstraint) does include an 'key IS NOT NULL' expression,
+ * however, because of the way predicate_implied_by_simple_clause() is
+ * designed to handle IS NOT NULL predicates in the absence of a IS
+ * NOT NULL clause, we cannot rely on just the above proof.
+ *
+ * That is not an issue in case of a range partition, because if there
+ * were no NOT NULL constraint defined on the key columns, an error
+ * would be thrown before we get here anyway. That is not true,
+ * however, if any of the partition keys is an expression, which is
+ * handled below.
+ */
+ part_constr = linitial(partConstraint);
+ part_constr = make_ands_implicit((Expr *) part_constr);
+
+ /*
+ * part_constr contains an IS NOT NULL expression, if this is a list
+ * partition that does not accept nulls (in fact, also if this is a
+ * range partition and some partition key is an expression, but we
+ * never skip validation in that case anyway; see below)
+ */
+ foreach(lc, part_constr)
+ {
+ Node *expr = lfirst(lc);
+
+ if (IsA(expr, NullTest) &&
+ ((NullTest *) expr)->nulltesttype == IS_NOT_NULL)
+ {
+ partition_accepts_null = false;
+ break;
+ }
+ }
+
+ partnatts = get_partition_natts(key);
+ for (i = 0; i < partnatts; i++)
+ {
+ AttrNumber partattno;
+
+ partattno = get_partition_col_attnum(key, i);
+
+ /* If partition key is an expression, must not skip validation */
+ if (!partition_accepts_null &&
+ (partattno == 0 ||
+ !bms_is_member(partattno, not_null_attrs)))
+ skip_validate = false;
+ }
+ }
+
+ /* It's safe to skip the validation scan after all */
+ if (skip_validate)
+ ereport(INFO,
+ (errmsg("partition constraint for table \"%s\" is implied by existing constraints",
+ RelationGetRelationName(attachRel))));
+
+ /*
+ * Set up to have the table be scanned to validate the partition
+ * constraint (see partConstraint above). If it's a partitioned table, we
+ * instead schedule its leaf partitions to be scanned.
+ */
+ if (!skip_validate)
+ {
+ List *all_parts;
+ ListCell *lc;
+
+ /* Take an exclusive lock on the partitions to be checked */
+ if (attachRel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+ all_parts = find_all_inheritors(RelationGetRelid(attachRel),
+ AccessExclusiveLock, NULL);
+ else
+ all_parts = list_make1_oid(RelationGetRelid(attachRel));
+
+ foreach(lc, all_parts)
+ {
+ AlteredTableInfo *tab;
+ Oid part_relid = lfirst_oid(lc);
+ Relation part_rel;
+ Expr *constr;
+
+ /* Lock already taken */
+ if (part_relid != RelationGetRelid(attachRel))
+ part_rel = heap_open(part_relid, NoLock);
+ else
+ part_rel = attachRel;
+
+ /*
+ * Skip if it's a partitioned table. Only RELKIND_RELATION
+ * relations (ie, leaf partitions) need to be scanned.
+ */
+ if (part_rel != attachRel &&
+ part_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+ {
+ heap_close(part_rel, NoLock);
+ continue;
+ }
+
+ /* Grab a work queue entry */
+ tab = ATGetQueueEntry(wqueue, part_rel);
+
+ /* Adjust constraint to match this partition */
+ constr = linitial(partConstraint);
+ tab->partition_constraint = (Expr *)
+ map_partition_varattnos((List *) constr, 1,
+ part_rel, rel);
+ /* keep our lock until commit */
+ if (part_rel != attachRel)
+ heap_close(part_rel, NoLock);
+ }
+ }
+
+ ObjectAddressSet(address, RelationRelationId, RelationGetRelid(attachRel));
+
+ /* keep our lock until commit */
+ heap_close(attachRel, NoLock);
+
+ return address;
+ }
+
+ /*
+ * ALTER TABLE DETACH PARTITION
+ *
+ * Return the address of the relation that is no longer a partition of rel.
+ *
+ * rel  - the partitioned parent (locked by the caller)
+ * name - range variable naming the partition to detach
+ *
+ * Removes the inheritance link, clears pg_class.relpartbound and resets
+ * relispartition for the detached relation, then invalidates the parent's
+ * relcache so its partition descriptor is rebuilt.
+ */
+ static ObjectAddress
+ ATExecDetachPartition(Relation rel, RangeVar *name)
+ {
+ Relation partRel,
+ classRel;
+ HeapTuple tuple,
+ newtuple;
+ Datum new_val[Natts_pg_class];
+ bool isnull,
+ new_null[Natts_pg_class],
+ new_repl[Natts_pg_class];
+ ObjectAddress address;
+
+ /*
+ * NOTE(review): only AccessShareLock is taken on the partition while its
+ * pg_class row is rewritten below — confirm this matches upstream's
+ * locking expectations for DETACH PARTITION.
+ */
+ partRel = heap_openrv(name, AccessShareLock);
+
+ /* All inheritance related checks are performed within the function */
+ RemoveInheritance(partRel, rel);
+
+ /* Update pg_class tuple */
+ classRel = heap_open(RelationRelationId, RowExclusiveLock);
+ tuple = SearchSysCacheCopy1(RELOID,
+ ObjectIdGetDatum(RelationGetRelid(partRel)));
+ Assert(((Form_pg_class) GETSTRUCT(tuple))->relispartition);
+
+ /* Fetch relpartbound only to assert it is currently set */
+ (void) SysCacheGetAttr(RELOID, tuple, Anum_pg_class_relpartbound,
+ &isnull);
+ Assert(!isnull);
+
+ /* Clear relpartbound and reset relispartition */
+ memset(new_val, 0, sizeof(new_val));
+ memset(new_null, false, sizeof(new_null));
+ memset(new_repl, false, sizeof(new_repl));
+ new_val[Anum_pg_class_relpartbound - 1] = (Datum) 0;
+ new_null[Anum_pg_class_relpartbound - 1] = true;
+ new_repl[Anum_pg_class_relpartbound - 1] = true;
+ newtuple = heap_modify_tuple(tuple, RelationGetDescr(classRel),
+ new_val, new_null, new_repl);
+
+ ((Form_pg_class) GETSTRUCT(newtuple))->relispartition = false;
+ CatalogTupleUpdate(classRel, &newtuple->t_self, newtuple);
+ heap_freetuple(newtuple);
+ heap_close(classRel, RowExclusiveLock);
+
+ /*
+ * Invalidate the parent's relcache so that the partition is no longer
+ * included in its partition descriptor.
+ */
+ CacheInvalidateRelcache(rel);
+
+ ObjectAddressSet(address, RelationRelationId, RelationGetRelid(partRel));
+
+ /* keep our lock until commit */
+ heap_close(partRel, NoLock);
+
+ return address;
+ }
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/tqual.h"
+#ifdef PGXC
+#include "pgxc/execRemote.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/pgxc.h"
+#endif
+ #include "utils/varlena.h"
/* GUC variables */
* trigger.c
* PostgreSQL TRIGGERs support code.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
fkcon->skip_validation = false;
fkcon->initially_valid = true;
+ /* finally, wrap it in a dummy PlannedStmt */
+ wrapper->commandType = CMD_UTILITY;
+ wrapper->canSetTag = false;
+ wrapper->utilityStmt = (Node *) atstmt;
+ wrapper->stmt_location = -1;
+ wrapper->stmt_len = -1;
+
/* ... and execute it */
- ProcessUtility((Node *) atstmt,
+ ProcessUtility(wrapper,
"(generated ALTER TABLE ADD FOREIGN KEY command)",
- PROCESS_UTILITY_SUBCOMMAND, NULL,
+ PROCESS_UTILITY_SUBCOMMAND, NULL, NULL,
- None_Receiver, NULL);
+ None_Receiver,
- #ifdef PGXC
+ false,
- #endif /* PGXC */
+ NULL);
/* Remove the matched item from the list */
info_list = list_delete_ptr(info_list, info);
* in cluster.c.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
* Routines for handling specialized SET variables.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* view.c
* use rewrite rules to construct views
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
- OBJS = execAmi.o execCurrent.o execGrouping.o execIndexing.o execJunk.o \
- execMain.o execParallel.o execProcnode.o execQual.o \
- execScan.o execTuples.o \
+ OBJS = execAmi.o execCurrent.o execExpr.o execExprInterp.o \
+ execGrouping.o execIndexing.o execJunk.o \
+ execMain.o execParallel.o execProcnode.o \
+ execReplication.o execScan.o execSRF.o execTuples.o \
execUtils.o functions.o instrument.o nodeAppend.o nodeAgg.o \
nodeBitmapAnd.o nodeBitmapOr.o \
- nodeBitmapHeapscan.o nodeBitmapIndexscan.o nodeCustom.o nodeGather.o \
+ nodeBitmapHeapscan.o nodeBitmapIndexscan.o \
+ nodeCustom.o nodeFunctionscan.o nodeGather.o \
nodeHash.o nodeHashjoin.o nodeIndexscan.o nodeIndexonlyscan.o \
- nodeLimit.o nodeLockRows.o \
+ nodeLimit.o nodeLockRows.o nodeGatherMerge.o \
nodeMaterial.o nodeMergeAppend.o nodeMergejoin.o nodeModifyTable.o \
- nodeNestloop.o nodeFunctionscan.o nodeRecursiveunion.o nodeResult.o \
+ nodeNestloop.o nodeProjectSet.o nodeRecursiveunion.o nodeResult.o \
nodeSamplescan.o nodeSeqscan.o nodeSetOp.o nodeSort.o nodeUnique.o \
- nodeValuesscan.o nodeCtescan.o nodeWorktablescan.o \
+ nodeValuesscan.o \
+ nodeCtescan.o nodeNamedtuplestorescan.o nodeWorktablescan.o \
nodeGroup.o nodeSubplan.o nodeSubqueryscan.o nodeTidscan.o \
- nodeForeignscan.o nodeWindowAgg.o producerReceiver.o tstoreReceiver.o tqueue.o spi.o
- nodeForeignscan.o nodeWindowAgg.o tstoreReceiver.o tqueue.o spi.o \
++ nodeForeignscan.o nodeWindowAgg.o producerReceiver.o tstoreReceiver.o tqueue.o spi.o \
+ nodeTableFuncscan.o
include $(top_srcdir)/src/backend/common.mk
* execAmi.c
* miscellaneous executor access method routines
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/backend/executor/execAmi.c
#include "nodes/relation.h"
#include "utils/rel.h"
#include "utils/syscache.h"
+#ifdef PGXC
+#include "pgxc/execRemote.h"
+#endif
- static bool TargetListSupportsBackwardScan(List *targetlist);
static bool IndexSupportsBackwardScan(Oid indexid);
* execCurrent.c
* executor support for WHERE CURRENT OF cursor
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/backend/executor/execCurrent.c
* before ExecutorEnd. This can be omitted only in case of EXPLAIN,
* which should also omit ExecutorRun.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rls.h"
+ #include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"
-
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "commands/copy.h"
+#endif
+#ifdef XCP
+#include "access/gtm.h"
+#include "pgxc/execRemote.h"
+#include "pgxc/poolmgr.h"
+#endif
/* Hooks for plugins to get control in ExecutorStart/Run/Finish/End */
ExecutorStart_hook_type ExecutorStart_hook = NULL;
estate->es_param_list_info = queryDesc->params;
if (queryDesc->plannedstmt->nParamExec > 0)
+#ifdef XCP
+ {
+ estate->es_param_exec_vals = (ParamExecData *)
+ palloc0(queryDesc->plannedstmt->nParamExec * sizeof(ParamExecData));
+ if (queryDesc->plannedstmt->nParamRemote > 0)
+ {
+ ParamListInfo extparams = estate->es_param_list_info;
+ int i = queryDesc->plannedstmt->nParamRemote;
+ while (--i >= 0 &&
+ queryDesc->plannedstmt->remoteparams[i].paramkind == PARAM_EXEC)
+ {
+ int paramno = queryDesc->plannedstmt->remoteparams[i].paramid;
+ ParamExecData *prmdata;
+
+ Assert(paramno >= 0 &&
+ paramno < queryDesc->plannedstmt->nParamExec);
+ prmdata = &(estate->es_param_exec_vals[paramno]);
+ prmdata->value = extparams->params[i].value;
+ prmdata->isnull = extparams->params[i].isnull;
+ prmdata->ptype = extparams->params[i].ptype;
+ prmdata->done = true;
+ }
+ /*
+ * Truncate exec parameters from the list of received parameters
+ * to avoid sending down duplicates if there are multiple levels
+ * of RemoteSubplan statements
+ */
+ extparams->numParams = i + 1;
+ }
+ }
+#else
estate->es_param_exec_vals = (ParamExecData *)
palloc0(queryDesc->plannedstmt->nParamExec * sizeof(ParamExecData));
+#endif
+ estate->es_sourceText = queryDesc->sourceText;
+
+ /*
+ * Fill in the query environment, if any, from queryDesc.
+ */
+ estate->es_queryEnv = queryDesc->queryEnv;
+
/*
* If non-read-only query, set the command ID to mark output tuples with
*/
* ExecProcNode, or ExecEndNode on its subnodes and do the appropriate
* processing.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* This information is needed by routines manipulating tuples
* (getattribute, formtuple, etc.).
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* execUtils.c
* miscellaneous executor utility routines
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
ExecStatus status;
bool setsResult; /* true if this query produces func's result */
bool lazyEval; /* true if should fetch one row at a time */
- Node *stmt; /* PlannedStmt or utility statement */
+ PlannedStmt *stmt; /* plan for this query */
QueryDesc *qd; /* null unless status == RUN */
+ char *src; /* source query resulting in this state */
} execution_state;
{
List *eslist = NIL;
execution_state *lasttages = NULL;
- ListCell *lc1;
+ ListCell *lc1, *lc3;
- foreach(lc1, queryTree_list)
+ forboth(lc1, queryTree_list, lc3, querySource_list)
{
- List *qtlist = (List *) lfirst(lc1);
+ List *qtlist = lfirst_node(List, lc1);
+ char *querysource = (char *) lfirst(lc3);
execution_state *firstes = NULL;
execution_state *preves = NULL;
- ListCell *lc2;
+ ListCell *lc2, *lc4;
foreach(lc2, qtlist)
{
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
/* translator: %s is a SQL statement name */
errmsg("%s is not allowed in a non-volatile function",
- CreateCommandTag(stmt))));
+ CreateCommandTag((Node *) stmt))));
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ if (queryTree->commandType != CMD_UTILITY)
+ {
+ /*
+ * The parameterised queries in RemoteQuery nodes will be prepared
+ * on the Datanode, and need parameter types for the same. Set the
+ * parameter types and their number in all RemoteQuery nodes in the
+ * plan
+ */
+ SetRemoteStatementName(((PlannedStmt *)stmt)->planTree, NULL,
+ fcache->pinfo->nargs,
+ fcache->pinfo->argtypes, 0);
+ }
+ }
+#endif /* PGXC */
+
if (IsInParallelMode() && !CommandIsReadOnly(stmt))
- PreventCommandIfParallelMode(CreateCommandTag(stmt));
+ PreventCommandIfParallelMode(CreateCommandTag((Node *) stmt));
/* OK, build the execution_state for this query */
newes = (execution_state *) palloc(sizeof(execution_state));
queryTree_list = NIL;
flat_query_list = NIL;
- foreach(lc, raw_parsetree_list)
+ forboth(lc, raw_parsetree_list, lc2, querysource_list)
{
- Node *parsetree = (Node *) lfirst(lc);
+ RawStmt *parsetree = lfirst_node(RawStmt, lc);
+ char *querysource = (char *) lfirst(lc2);
List *queryTree_sublist;
queryTree_sublist = pg_analyze_and_rewrite_params(parsetree,
- fcache->src,
+ querysource,
(ParserSetupHook) sql_fn_parser_setup,
- fcache->pinfo);
+ fcache->pinfo,
+ NULL);
queryTree_list = lappend(queryTree_list, queryTree_sublist);
flat_query_list = list_concat(flat_query_list,
list_copy(queryTree_sublist));
else
dest = None_Receiver;
- if (IsA(es->stmt, PlannedStmt))
- es->qd = CreateQueryDesc((PlannedStmt *) es->stmt,
- es->src,
- GetActiveSnapshot(),
- InvalidSnapshot,
- dest,
- fcache->paramLI, 0);
- else
- es->qd = CreateUtilityQueryDesc(es->stmt,
- es->src,
- GetActiveSnapshot(),
- dest,
- fcache->paramLI);
+ es->qd = CreateQueryDesc(es->stmt,
- fcache->src,
++ es->src,
+ GetActiveSnapshot(),
+ InvalidSnapshot,
+ dest,
+ fcache->paramLI,
+ es->qd ? es->qd->queryEnv : NULL,
+ 0);
/* Utility commands don't need Executor. */
- if (es->qd->utilitystmt == NULL)
+ if (es->qd->operation != CMD_UTILITY)
{
/*
* In lazyEval mode, do not let the executor set up an AfterTrigger
{
bool result;
- if (es->qd->utilitystmt)
+ if (es->qd->operation == CMD_UTILITY)
{
- /* ProcessUtility needs the PlannedStmt for DECLARE CURSOR */
- ProcessUtility((es->qd->plannedstmt ?
- (Node *) es->qd->plannedstmt :
- es->qd->utilitystmt),
+ ProcessUtility(es->qd->plannedstmt,
- fcache->src,
+ es->src,
PROCESS_UTILITY_QUERY,
es->qd->params,
+ es->qd->queryEnv,
es->qd->dest,
+#ifdef PGXC
+ false,
+#endif /* PGXC */
NULL);
result = true; /* never stops early */
}
* sensitive to the grouping set for which the aggregate function is
* currently being called.
*
- * TODO: AGG_HASHED doesn't support multiple grouping sets yet.
+ * Plan structure:
+ *
+ * What we get from the planner is actually one "real" Agg node which is
+ * part of the plan tree proper, but which optionally has an additional list
+ * of Agg nodes hung off the side via the "chain" field. This is because an
+ * Agg node happens to be a convenient representation of all the data we
+ * need for grouping sets.
+ *
+ * For many purposes, we treat the "real" node as if it were just the first
+ * node in the chain. The chain must be ordered such that hashed entries
+ * come before sorted/plain entries; the real node is marked AGG_MIXED if
+ * there are both types present (in which case the real node describes one
+ * of the hashed groupings, other AGG_HASHED nodes may optionally follow in
+ * the chain, followed in turn by AGG_SORTED or (one) AGG_PLAIN node). If
+ * the real node is marked AGG_HASHED or AGG_SORTED, then all the chained
+ * nodes must be of the same type; if it is AGG_PLAIN, there can be no
+ * chained nodes.
+ *
+ * We collect all hashed nodes into a single "phase", numbered 0, and create
+ * a sorted phase (numbered 1..n) for each AGG_SORTED or AGG_PLAIN node.
+ * Phase 0 is allocated even if there are no hashes, but remains unused in
+ * that case.
+ *
+ * AGG_HASHED nodes actually refer to only a single grouping set each,
+ * because for each hashed grouping we need a separate grpColIdx and
+ * numGroups estimate. AGG_SORTED nodes represent a "rollup", a list of
+ * grouping sets that share a sort order. Each AGG_SORTED node other than
+ * the first one has an associated Sort node which describes the sort order
+ * to be used; the first sorted node takes its input from the outer subtree,
+ * which the planner has already arranged to provide ordered data.
+ *
+ * Memory and ExprContext usage:
+ *
+ * Because we're accumulating aggregate values across input rows, we need to
+ * use more memory contexts than just simple input/output tuple contexts.
+ * In fact, for a rollup, we need a separate context for each grouping set
+ * so that we can reset the inner (finer-grained) aggregates on their group
+ * boundaries while continuing to accumulate values for outer
+ * (coarser-grained) groupings. On top of this, we might be simultaneously
+ * populating hashtables; however, we only need one context for all the
+ * hashtables.
+ *
+ * So we create an array, aggcontexts, with an ExprContext for each grouping
+ * set in the largest rollup that we're going to process, and use the
+ * per-tuple memory context of those ExprContexts to store the aggregate
+ * transition values. hashcontext is the single context created to support
+ * all hash tables.
+ *
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* nodeModifyTable.c
* routines to handle ModifyTable nodes.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* get information on the (current) result relation
*/
resultRelInfo = estate->es_result_relation_info;
+
+ /* Determine the partition to heap_insert the tuple into */
+ if (mtstate->mt_partition_dispatch_info)
+ {
+ int leaf_part_index;
+ TupleConversionMap *map;
+
+ /*
+ * Away we go ... If we end up not finding a partition after all,
+ * ExecFindPartition() does not return and errors out instead.
+ * Otherwise, the returned value is to be used as an index into arrays
+ * mt_partitions[] and mt_partition_tupconv_maps[] that will get us
+ * the ResultRelInfo and TupleConversionMap for the partition,
+ * respectively.
+ */
+ leaf_part_index = ExecFindPartition(resultRelInfo,
+ mtstate->mt_partition_dispatch_info,
+ slot,
+ estate);
+ Assert(leaf_part_index >= 0 &&
+ leaf_part_index < mtstate->mt_num_partitions);
+
+ /*
+ * Save the old ResultRelInfo and switch to the one corresponding to
+ * the selected partition.
+ */
+ saved_resultRelInfo = resultRelInfo;
+ resultRelInfo = mtstate->mt_partitions + leaf_part_index;
+
+ /* We do not yet have a way to insert into a foreign partition */
+ if (resultRelInfo->ri_FdwRoutine)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot route inserted tuples to a foreign table")));
+
+ /* For ExecInsertIndexTuples() to work on the partition's indexes */
+ estate->es_result_relation_info = resultRelInfo;
+
+ /*
+ * We might need to convert from the parent rowtype to the partition
+ * rowtype.
+ */
+ map = mtstate->mt_partition_tupconv_maps[leaf_part_index];
+ if (map)
+ {
+ Relation partrel = resultRelInfo->ri_RelationDesc;
+
+ tuple = do_convert_tuple(tuple, map);
+
+ /*
+ * We must use the partition's tuple descriptor from this point
+ * on, until we're finished dealing with the partition. Use the
+ * dedicated slot for that.
+ */
+ slot = mtstate->mt_partition_tuple_slot;
+ Assert(slot != NULL);
+ ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
+ ExecStoreTuple(tuple, slot, InvalidBuffer, true);
+ }
+ }
+
resultRelationDesc = resultRelInfo->ri_RelationDesc;
-
/*
* If the result relation has OIDs, force the tuple's OID to zero so that
* heap_insert will assign a fresh OID. Usually the OID already will be
prm->value = ExecEvalExprSwitchContext((ExprState *) lfirst(pvar),
econtext,
- &(prm->isnull),
- NULL);
+ &(prm->isnull));
+ prm->done = true;
planstate->chgParam = bms_add_member(planstate->chgParam, paramid);
}
/* ... and to its parent's state */
sstate->parent = parent;
+#ifdef XCP
+ /* subplan is referenced on local node, finish initialization */
+ ExecFinishInitProcNode(sstate->planstate);
+#endif
+
/* Initialize subexpressions */
sstate->testexpr = ExecInitExpr((Expr *) subplan->testexpr, parent);
- sstate->args = (List *) ExecInitExpr((Expr *) subplan->args, parent);
+ sstate->args = ExecInitExprList(subplan->args, parent);
/*
* initialize my state
prm->value = ExecEvalExprSwitchContext((ExprState *) lfirst(pvar),
econtext,
- &(prm->isnull),
- NULL);
+ &(prm->isnull));
+ prm->done = true;
planstate->chgParam = bms_add_member(planstate->chgParam, paramid);
}
* aggregate function over all rows in the current row's window frame.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
}
- /* Pushes SPI stack to allow recursive SPI calls */
- void
- SPI_push(void)
- {
- _SPI_curid++;
- }
-
- /* Pops SPI stack to allow recursive SPI calls */
- void
- SPI_pop(void)
- {
- _SPI_curid--;
- }
-
- /* Conditional push: push only if we're inside a SPI procedure */
- bool
- SPI_push_conditional(void)
- {
- bool pushed = (_SPI_curid != _SPI_connected);
-
- if (pushed)
- {
- _SPI_curid++;
- /* We should now be in a state where SPI_connect would succeed */
- Assert(_SPI_curid == _SPI_connected);
- }
- return pushed;
- }
-
- /* Conditional pop: pop only if SPI_push_conditional pushed */
- void
- SPI_pop_conditional(bool pushed)
- {
- /* We should be in a state where SPI_connect would succeed */
- Assert(_SPI_curid == _SPI_connected);
- if (pushed)
- _SPI_curid--;
- }
-
- /* Restore state of SPI stack after aborting a subtransaction */
- void
- SPI_restore_connection(void)
- {
- Assert(_SPI_connected >= 0);
- _SPI_curid = _SPI_connected - 1;
- }
-
+#ifdef PGXC
+/* SPI_execute_direct:
+ * Runs the 'remote_sql' query string on the node 'nodename'.
+ * Builds an ExecDirectStmt parse tree node from remote_sql, then prepares
+ * and executes it using the SPI interface.
+ * This function is meant for internal exec-direct operations and should not
+ * require superuser privileges. We cannot simply run an EXEC DIRECT query
+ * string because that command is restricted to superusers, so this function
+ * bypasses the parse stage by calling _SPI_pgxc_prepare_plan, which accepts
+ * a ready-made parse tree.
+ *
+ * Returns an SPI result code (SPI_OK_* on success, negative on error).
+ */
+int
+SPI_execute_direct(const char *remote_sql, char *nodename)
+{
+ _SPI_plan plan;
+ int res;
+ ExecDirectStmt *stmt = makeNode(ExecDirectStmt);
+ StringInfoData execdirect;
+
+ initStringInfo(&execdirect);
+
+ /* This string is never used. It is just passed to fill up spierrcontext.arg */
+ appendStringInfo(&execdirect, "EXECUTE DIRECT ON (%s) '%s'",
+ nodename, remote_sql);
+
+ stmt->node_names = list_make1(makeString(nodename));
+
+ /*
+ * Copy the query with pstrdup() (palloc) rather than malloc-based
+ * strdup(): the rest of the parse tree is palloc'd in the current memory
+ * context, and a strdup'd string would be leaked permanently because
+ * nothing ever free()s it.
+ */
+ stmt->query = pstrdup(remote_sql);
+
+ res = _SPI_begin_call(true);
+ if (res < 0)
+ return res;
+
+ memset(&plan, 0, sizeof(_SPI_plan));
+ plan.magic = _SPI_PLAN_MAGIC;
+ plan.cursor_options = 0;
+
+ /* Now pass the ExecDirectStmt parsetree node */
+ _SPI_pgxc_prepare_plan(execdirect.data, list_make1(stmt),
+ list_make1(execdirect.data), &plan);
+
+ res = _SPI_execute_plan(&plan, NULL,
+ InvalidSnapshot, InvalidSnapshot, false, true, 0);
+
+ _SPI_end_call(true);
+ return res;
+}
+#endif
+
/* Parse, plan, and execute a query string */
int
SPI_execute(const char *src, bool read_only, long tcount)
*/
plancache_list = NIL;
- foreach(list_item, raw_parsetree_list)
+ forboth(list_item, raw_parsetree_list, list_item2, querysource_list)
{
- Node *parsetree = (Node *) lfirst(list_item);
+ RawStmt *parsetree = lfirst_node(RawStmt, list_item);
+ char *querysource = (char *) lfirst (list_item2);
List *stmt_list;
CachedPlanSource *plansource;
* needs to see the unmodified raw parse tree.
*/
plansource = CreateCachedPlan(parsetree,
- src,
+ querysource,
+#ifdef PGXC
+ NULL,
+#endif
- CreateCommandTag(parsetree));
+ CreateCommandTag(parsetree->stmt));
/*
* Parameter datatypes are driven by parserSetup hook if provided,
{
Assert(plan->nargs == 0);
stmt_list = pg_analyze_and_rewrite_params(parsetree,
- src,
+ querysource,
plan->parserSetup,
- plan->parserSetupArg);
+ plan->parserSetupArg,
+ _SPI_current->queryEnv);
}
else
{
stmt_list = pg_analyze_and_rewrite(parsetree,
- src,
+ querysource,
plan->argtypes,
- plan->nargs);
+ plan->nargs,
+ _SPI_current->queryEnv);
}
/* Finish filling in the CachedPlanSource */
*/
plancache_list = NIL;
- foreach(list_item, raw_parsetree_list)
+ forboth(list_item, raw_parsetree_list, list_item2, querysource_list)
{
- Node *parsetree = (Node *) lfirst(list_item);
+ RawStmt *parsetree = lfirst_node(RawStmt, list_item);
+ char *querysource = (char *) lfirst (list_item2);
CachedPlanSource *plansource;
++
plansource = CreateOneShotCachedPlan(parsetree,
- src,
+ querysource,
- CreateCommandTag(parsetree));
+ CreateCommandTag(parsetree->stmt));
plancache_list = lappend(plancache_list, plansource);
}
plansource->query_string,
PROCESS_UTILITY_QUERY,
paramLI,
+ _SPI_current->queryEnv,
dest,
+#ifdef PGXC
+ false,
+#endif /* PGXC */
completionTag);
/* Update "processed" if stmt returned tuples */
int bytestowrite;
int totalwritten;
-
- bytestowrite = VARSIZE(wbuf) - VARHDRSZ;
- totalwritten = lo_write(fd, VARDATA(wbuf), bytestowrite);
+#ifdef PGXC
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#endif
+ bytestowrite = VARSIZE_ANY_EXHDR(wbuf);
+ totalwritten = lo_write(fd, VARDATA_ANY(wbuf), bytestowrite);
PG_RETURN_INT32(totalwritten);
}
* be handled easily in a simple depth-first traversal.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* IDENTIFICATION
* src/backend/nodes/copyfuncs.c
COPY_NODE_FIELD(relationOids);
COPY_NODE_FIELD(invalItems);
COPY_SCALAR_FIELD(nParamExec);
+#ifdef XCP
+ COPY_SCALAR_FIELD(nParamRemote);
+ COPY_POINTER_FIELD(remoteparams,
+ newnode->nParamRemote * sizeof(RemoteParam));
+ COPY_STRING_FIELD(pname);
+ COPY_SCALAR_FIELD(distributionType);
+ COPY_SCALAR_FIELD(distributionKey);
+ COPY_NODE_FIELD(distributionNodes);
+ COPY_NODE_FIELD(distributionRestrict);
+#endif
+ COPY_NODE_FIELD(utilityStmt);
+ COPY_LOCATION_FIELD(stmt_location);
+ COPY_LOCATION_FIELD(stmt_len);
return newnode;
}
return newnode;
}
+/* ****************************************************************
+ * poolutils.h copy functions
+ * ****************************************************************
+ */
+
+/*
+ * _copyCleanConnStmt
+ * Deep-copy a CleanConnStmt node (CLEAN CONNECTION statement) using the
+ * standard COPY_* field macros; every field of the struct is copied.
+ */
+static CleanConnStmt *
+_copyCleanConnStmt(const CleanConnStmt *from)
+{
+ CleanConnStmt *newnode = makeNode(CleanConnStmt);
+
+ COPY_NODE_FIELD(nodes);
+ COPY_STRING_FIELD(dbname);
+ COPY_STRING_FIELD(username);
+ COPY_SCALAR_FIELD(is_coord);
+ COPY_SCALAR_FIELD(is_force);
+
+ return newnode;
+}
+#endif
/*
- * copyObject
+ * copyObjectImpl -- implementation of copyObject(); see nodes/nodes.h
*
* Create a copy of a Node tree or list. This is a "deep" copy: all
* substructure is copied too, recursively.
* "x" to be considered equal() to another reference to "x" in the query.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* IDENTIFICATION
* src/backend/nodes/equalfuncs.c
/* we don't bother to copy eref, aliases, etc; OK? */
break;
case RTE_CTE:
+#ifdef PGXC
+ case RTE_REMOTE_DUMMY:
+#endif /* PGXC */
+ case RTE_NAMEDTUPLESTORE:
/* nothing to do */
break;
case RTE_SUBQUERY:
* outfuncs.c
* Output functions for Postgres tree nodes.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
/* Write a bitmapset field */
#define WRITE_BITMAPSET_FIELD(fldname) \
(appendStringInfo(str, " :" CppAsString(fldname) " "), \
- _outBitmapset(str, node->fldname))
+ outBitmapset(str, node->fldname))
- (_outToken(str, OidIsValid((relid)) ? NSP_NAME(get_rel_namespace((relid))) : NULL), \
+#ifdef XCP
+#define NSP_NAME(oid) \
+ isTempNamespace(oid) ? "pg_temp" : get_namespace_name(oid)
+/*
+ * Macros to encode OIDs for sending to other nodes. Objects on other nodes
+ * may have different OIDs, so instead send a unique identifier that allows
+ * the OID to be looked up on the target node. The identifier depends on the
+ */
+
+#define WRITE_RELID_INTERNAL(relid) \
- _outToken(str, OidIsValid((relid)) ? get_rel_name((relid)) : NULL))
++ (outToken(str, OidIsValid((relid)) ? NSP_NAME(get_rel_namespace((relid))) : NULL), \
+ appendStringInfoChar(str, ' '), \
- _outToken(str, OidIsValid(node->fldname) ? NSP_NAME(get_typ_namespace(node->fldname)) : NULL), \
++ outToken(str, OidIsValid((relid)) ? get_rel_name((relid)) : NULL))
+
+/* write an OID which is a relation OID */
+#define WRITE_RELID_FIELD(fldname) \
+ (appendStringInfo(str, " :" CppAsString(fldname) " "), \
+ WRITE_RELID_INTERNAL(node->fldname))
+
+#define WRITE_RELID_LIST_FIELD(fldname) \
+ do { \
+ ListCell *lc; \
+ char *sep = ""; \
+ appendStringInfo(str, " :" CppAsString(fldname) " "); \
+ if (node->fldname == NIL || list_length(node->fldname) == 0) \
+ appendStringInfoString(str, "<>"); \
+ else \
+ { \
+ appendStringInfoChar(str, '('); \
+ foreach (lc, node->fldname) \
+ { \
+ Oid relid = lfirst_oid(lc); \
+ appendStringInfoString(str, sep); \
+ WRITE_RELID_INTERNAL(relid); \
+ sep = ","; \
+ } \
+ appendStringInfoChar(str, ')'); \
+ } \
+ } while (0)
+
+/* write an OID which is a data type OID */
+#define WRITE_TYPID_FIELD(fldname) \
+ (appendStringInfo(str, " :" CppAsString(fldname) " "), \
- _outToken(str, OidIsValid(node->fldname) ? get_typ_name(node->fldname) : NULL))
++ outToken(str, OidIsValid(node->fldname) ? NSP_NAME(get_typ_namespace(node->fldname)) : NULL), \
+ appendStringInfoChar(str, ' '), \
- _outToken(str, NSP_NAME(get_func_namespace(node->fldname))); \
++ outToken(str, OidIsValid(node->fldname) ? get_typ_name(node->fldname) : NULL))
+
+/* write an OID which is a function OID */
+#define WRITE_FUNCID_FIELD(fldname) \
+ do { \
+ appendStringInfo(str, " :" CppAsString(fldname) " "); \
+ if (OidIsValid(node->fldname)) \
+ { \
+ Oid *argtypes; \
+ int i, nargs; \
- _outToken(str, get_func_name(node->fldname)); \
++ outToken(str, NSP_NAME(get_func_namespace(node->fldname))); \
+ appendStringInfoChar(str, ' '); \
- _outToken(str, NSP_NAME(get_typ_namespace(argtypes[i]))); \
++ outToken(str, get_func_name(node->fldname)); \
+ appendStringInfoChar(str, ' '); \
+ get_func_signature(node->fldname, &argtypes, &nargs); \
+ appendStringInfo(str, "%d", nargs); \
+ for (i = 0; i < nargs; i++) \
+ { \
+ appendStringInfoChar(str, ' '); \
- _outToken(str, get_typ_name(argtypes[i])); \
++ outToken(str, NSP_NAME(get_typ_namespace(argtypes[i]))); \
+ appendStringInfoChar(str, ' '); \
- _outToken(str, NSP_NAME(get_opnamespace(node->fldname))); \
++ outToken(str, get_typ_name(argtypes[i])); \
+ } \
+ } \
+ else \
+ appendStringInfo(str, "<> <> 0"); \
+ } while (0)
+
+/* write an OID which is an operator OID */
+#define WRITE_OPERID_FIELD(fldname) \
+ do { \
+ appendStringInfo(str, " :" CppAsString(fldname) " "); \
+ if (OidIsValid(node->fldname)) \
+ { \
+ Oid oprleft, oprright; \
- _outToken(str, get_opname(node->fldname)); \
++ outToken(str, NSP_NAME(get_opnamespace(node->fldname))); \
+ appendStringInfoChar(str, ' '); \
- _outToken(str, OidIsValid(oprleft) ? \
++ outToken(str, get_opname(node->fldname)); \
+ appendStringInfoChar(str, ' '); \
+ op_input_types(node->fldname, &oprleft, &oprright); \
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); \
++ outToken(str, OidIsValid(oprleft) ? \
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL); \
+ appendStringInfoChar(str, ' '); \
- _outToken(str, OidIsValid(oprright) ? \
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); \
+ appendStringInfoChar(str, ' '); \
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL); \
++ outToken(str, OidIsValid(oprright) ? \
+ NSP_NAME(get_typ_namespace(oprright)) : NULL); \
+ appendStringInfoChar(str, ' '); \
- _outToken(str, NSP_NAME(get_collation_namespace(node->fldname))); \
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL); \
+ appendStringInfoChar(str, ' '); \
+ } \
+ else \
+ appendStringInfo(str, "<> <> <> <> <> <>"); \
+ } while (0)
+
+/* write an OID which is a collation OID */
+#define WRITE_COLLID_FIELD(fldname) \
+ do { \
+ appendStringInfo(str, " :" CppAsString(fldname) " "); \
+ if (OidIsValid(node->fldname)) \
+ { \
- _outToken(str, get_collation_name(node->fldname)); \
++ outToken(str, NSP_NAME(get_collation_namespace(node->fldname))); \
+ appendStringInfoChar(str, ' '); \
++ outToken(str, get_collation_name(node->fldname)); \
+ appendStringInfo(str, " %d", get_collation_encoding(node->fldname)); \
+ } \
+ else \
+ appendStringInfo(str, "<> <> -1"); \
+ } while (0)
+
+
+#endif
#define booltostr(x) ((x) ? "true" : "false")
}
- _outToken(str, textvalue);
+#ifdef XCP
+/*
+ * Output value in text format
+ */
+static void
+_printDatum(StringInfo str, Datum value, Oid typid)
+{
+ Oid typOutput;
+ bool typIsVarlena;
+ FmgrInfo finfo;
+ Datum tmpval;
+ char *textvalue;
+ int saveDateStyle;
+
+ /* Get output function for the type */
+ getTypeOutputInfo(typid, &typOutput, &typIsVarlena);
+ fmgr_info(typOutput, &finfo);
+
+ /* Detoast value if needed */
+ if (typIsVarlena)
+ tmpval = PointerGetDatum(PG_DETOAST_DATUM(value));
+ else
+ tmpval = value;
+
+ /*
+ * It was found that if configuration setting for date style is
+ * "postgres,ymd", the output dates have format DD-MM-YYYY and they cannot
+ * be parsed correctly by the receiving party. So force ISO format YYYY-MM-DD
+ * in internal cluster communications, these values are always parsed
+ * correctly.
+ */
+ saveDateStyle = DateStyle;
+ DateStyle = USE_ISO_DATES;
+
+ textvalue = DatumGetCString(FunctionCall1(&finfo, tmpval));
++ outToken(str, textvalue);
+
+ DateStyle = saveDateStyle;
+}
+#endif
+
+
/*
* Stuff from plannodes.h
*/
appendStringInfoString(str, " :sortOperators");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->sortOperators[i];
+ Oid oprleft, oprright;
+ /* Sort operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->sortOperators[i]);
appendStringInfoString(str, " :collations");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_collation_namespace(coll)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid coll = node->collations[i];
+ if (OidIsValid(coll))
+ {
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_collation_name(coll));
++ outToken(str, NSP_NAME(get_collation_namespace(coll)));
+ appendStringInfoChar(str, ' ');
++ outToken(str, get_collation_name(coll));
+ appendStringInfo(str, " %d", get_collation_encoding(coll));
+ }
+ else
+ appendStringInfo(str, " <> <> -1");
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->collations[i]);
appendStringInfoString(str, " :nullsFirst");
appendStringInfoString(str, " :dupOperators");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->dupOperators[i];
+ Oid oprleft, oprright;
+ /* Duplicate-check operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->dupOperators[i]);
WRITE_LONG_FIELD(numGroups);
_outScanInfo(str, (const Scan *) node);
+#ifdef XCP
+ if (portable_output)
+ WRITE_RELID_FIELD(indexid);
+ else
+#endif
WRITE_OID_FIELD(indexid);
+ WRITE_BOOL_FIELD(isshared);
WRITE_NODE_FIELD(indexqual);
WRITE_NODE_FIELD(indexqualorig);
}
appendStringInfoString(str, " :mergeCollations");
for (i = 0; i < numCols; i++)
- _outToken(str, NSP_NAME(get_collation_namespace(coll)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid coll = node->mergeCollations[i];
+ if (OidIsValid(coll))
+ {
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_collation_name(coll));
++ outToken(str, NSP_NAME(get_collation_namespace(coll)));
+ appendStringInfoChar(str, ' ');
++ outToken(str, get_collation_name(coll));
+ appendStringInfo(str, " %d", get_collation_encoding(coll));
+ }
+ else
+ appendStringInfo(str, " <> <> -1");
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->mergeCollations[i]);
appendStringInfoString(str, " :mergeStrategies");
appendStringInfoString(str, " :grpOperators");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->grpOperators[i];
+ Oid oprleft, oprright;
+ /* Group operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->grpOperators[i]);
WRITE_LONG_FIELD(numGroups);
for (i = 0; i < node->partNumCols; i++)
appendStringInfo(str, " %d", node->partColIdx[i]);
- appendStringInfoString(str, " :partOperations");
+ appendStringInfoString(str, " :partOperators");
for (i = 0; i < node->partNumCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->partOperators[i];
+ Oid oprleft, oprright;
+ /* The operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->partOperators[i]);
WRITE_INT_FIELD(ordNumCols);
for (i = 0; i < node->ordNumCols; i++)
appendStringInfo(str, " %d", node->ordColIdx[i]);
- appendStringInfoString(str, " :ordOperations");
+ appendStringInfoString(str, " :ordOperators");
for (i = 0; i < node->ordNumCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->ordOperators[i];
+ Oid oprleft, oprright;
+ /* Ordering operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->ordOperators[i]);
WRITE_INT_FIELD(frameOptions);
appendStringInfoString(str, " :grpOperators");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->grpOperators[i];
+ Oid oprleft, oprright;
+ /* Group operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->grpOperators[i]);
}
appendStringInfoString(str, " :sortOperators");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->sortOperators[i];
+ Oid oprleft, oprright;
+ /* Sort operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->sortOperators[i]);
appendStringInfoString(str, " :collations");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_collation_namespace(coll)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid coll = node->collations[i];
+ if (OidIsValid(coll))
+ {
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_collation_name(coll));
++ outToken(str, NSP_NAME(get_collation_namespace(coll)));
+ appendStringInfoChar(str, ' ');
++ outToken(str, get_collation_name(coll));
+ appendStringInfo(str, " %d", get_collation_encoding(coll));
+ }
+ else
+ appendStringInfo(str, " <> <> -1");
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->collations[i]);
appendStringInfoString(str, " :nullsFirst");
appendStringInfoString(str, " :uniqOperators");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->uniqOperators[i];
+ Oid oprleft, oprright;
+ /* Unique operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->uniqOperators[i]);
}
appendStringInfoString(str, " :dupOperators");
for (i = 0; i < node->numCols; i++)
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->dupOperators[i];
+ Oid oprleft, oprright;
+ /* Duplicate-check operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->dupOperators[i]);
WRITE_INT_FIELD(flagColIdx);
WRITE_NODE_FIELD(limitCount);
}
- _outToken(str, NSP_NAME(get_typ_namespace(ptype)));
+#ifdef XCP
+static void
+_outRemoteSubplan(StringInfo str, const RemoteSubplan *node)
+{
+ WRITE_NODE_TYPE("REMOTESUBPLAN");
+
+ _outScanInfo(str, (Scan *) node);
+
+ WRITE_CHAR_FIELD(distributionType);
+ WRITE_INT_FIELD(distributionKey);
+ WRITE_NODE_FIELD(distributionNodes);
+ WRITE_NODE_FIELD(distributionRestrict);
+ WRITE_NODE_FIELD(nodeList);
+ WRITE_BOOL_FIELD(execOnAll);
+ WRITE_NODE_FIELD(sort);
+ WRITE_STRING_FIELD(cursor);
+ WRITE_INT_FIELD(unique);
+}
+
+static void
+_outRemoteStmt(StringInfo str, const RemoteStmt *node)
+{
+ int i;
+
+ WRITE_NODE_TYPE("REMOTESTMT");
+
+ WRITE_ENUM_FIELD(commandType, CmdType);
+ WRITE_BOOL_FIELD(hasReturning);
+ WRITE_NODE_FIELD(planTree);
+ WRITE_NODE_FIELD(rtable);
+ WRITE_NODE_FIELD(resultRelations);
+ WRITE_NODE_FIELD(subplans);
+ WRITE_INT_FIELD(nParamExec);
+ WRITE_INT_FIELD(nParamRemote);
+
+ for (i = 0; i < node->nParamRemote; i++)
+ {
+ RemoteParam *rparam = &(node->remoteparams[i]);
+ appendStringInfo(str, " :paramkind");
+ appendStringInfo(str, " %d", (int) rparam->paramkind);
+
+ appendStringInfo(str, " :paramid");
+ appendStringInfo(str, " %d", rparam->paramid);
+
+ appendStringInfo(str, " :paramused");
+ appendStringInfo(str, " %d", rparam->paramused);
+
+ appendStringInfo(str, " :paramtype");
+ if (portable_output)
+ {
+ Oid ptype = rparam->paramtype;
+ Assert(OidIsValid(ptype));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_typ_name(ptype));
++ outToken(str, NSP_NAME(get_typ_namespace(ptype)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, NSP_NAME(get_opnamespace(oper)));
++ outToken(str, get_typ_name(ptype));
+ }
+ else
+ appendStringInfo(str, " %u", rparam->paramtype);
+ }
+ WRITE_NODE_FIELD(rowMarks);
+ WRITE_CHAR_FIELD(distributionType);
+ WRITE_INT_FIELD(distributionKey);
+ WRITE_NODE_FIELD(distributionNodes);
+ WRITE_NODE_FIELD(distributionRestrict);
+}
+
+static void
+_outSimpleSort(StringInfo str, const SimpleSort *node)
+{
+ int i;
+
+ WRITE_NODE_TYPE("SIMPLESORT");
+
+ WRITE_INT_FIELD(numCols);
+
+ appendStringInfo(str, " :sortColIdx");
+ for (i = 0; i < node->numCols; i++)
+ appendStringInfo(str, " %d", node->sortColIdx[i]);
+
+ appendStringInfo(str, " :sortOperators");
+ for (i = 0; i < node->numCols; i++)
+ if (portable_output)
+ {
+ Oid oper = node->sortOperators[i];
+ Oid oprleft, oprright;
+ /* Sort operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_opname(oper));
++ outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprleft) ?
++ outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
- _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
++ outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ?
++ outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
++ outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
- _outToken(str, NSP_NAME(get_collation_namespace(coll)));
++ outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ }
+ else
+ appendStringInfo(str, " %u", node->sortOperators[i]);
+
+ appendStringInfo(str, " :sortCollations");
+ for (i = 0; i < node->numCols; i++)
+ if (portable_output)
+ {
+ Oid coll = node->sortCollations[i];
+ if (OidIsValid(coll))
+ {
+ appendStringInfoChar(str, ' ');
- _outToken(str, get_collation_name(coll));
++ outToken(str, NSP_NAME(get_collation_namespace(coll)));
+ appendStringInfoChar(str, ' ');
++ outToken(str, get_collation_name(coll));
+ appendStringInfo(str, " %d", get_collation_encoding(coll));
+ }
+ else
+ appendStringInfo(str, " <> <> -1");
+ }
+ else
+ appendStringInfo(str, " %u", node->sortCollations[i]);
+
+ appendStringInfo(str, " :nullsFirst");
+ for (i = 0; i < node->numCols; i++)
+ appendStringInfo(str, " %s", booltostr(node->nullsFirst[i]));
+}
+#endif
+
static void
_outNestLoopParam(StringInfo str, const NestLoopParam *node)
{
WRITE_STRING_FIELD(ctename);
WRITE_UINT_FIELD(ctelevelsup);
WRITE_BOOL_FIELD(self_reference);
- WRITE_NODE_FIELD(ctecoltypes);
- WRITE_NODE_FIELD(ctecoltypmods);
- WRITE_NODE_FIELD(ctecolcollations);
+ WRITE_NODE_FIELD(coltypes);
+ WRITE_NODE_FIELD(coltypmods);
+ WRITE_NODE_FIELD(colcollations);
+ break;
+ case RTE_NAMEDTUPLESTORE:
+ WRITE_STRING_FIELD(enrname);
+ WRITE_OID_FIELD(relid);
+ WRITE_NODE_FIELD(coltypes);
+ WRITE_NODE_FIELD(coltypmods);
+ WRITE_NODE_FIELD(colcollations);
break;
+#ifdef PGXC
+ case RTE_REMOTE_DUMMY:
+ /* Everything relevant already copied */
+ break;
+#endif /* PGXC */
default:
elog(ERROR, "unrecognized RTE kind: %d", (int) node->rtekind);
break;
case T_ForeignKeyCacheInfo:
_outForeignKeyCacheInfo(str, obj);
break;
+#ifdef PGXC
+ case T_ExecNodes:
+ _outExecNodes(str, obj);
+ break;
+#endif
+ case T_TriggerTransition:
+ _outTriggerTransition(str, obj);
+ break;
+ case T_PartitionElem:
+ _outPartitionElem(str, obj);
+ break;
+ case T_PartitionSpec:
+ _outPartitionSpec(str, obj);
+ break;
+ case T_PartitionBoundSpec:
+ _outPartitionBoundSpec(str, obj);
+ break;
+ case T_PartitionRangeDatum:
+ _outPartitionRangeDatum(str, obj);
+ break;
default:
* readfuncs.c
* Reader functions for Postgres tree nodes.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
READ_STRING_FIELD(ctename);
READ_UINT_FIELD(ctelevelsup);
READ_BOOL_FIELD(self_reference);
- READ_NODE_FIELD(ctecoltypes);
- READ_NODE_FIELD(ctecoltypmods);
- READ_NODE_FIELD(ctecolcollations);
+ READ_NODE_FIELD(coltypes);
+ READ_NODE_FIELD(coltypmods);
+ READ_NODE_FIELD(colcollations);
+ break;
+ case RTE_NAMEDTUPLESTORE:
+ READ_STRING_FIELD(enrname);
+ READ_OID_FIELD(relid);
+ READ_NODE_FIELD(coltypes);
+ READ_NODE_FIELD(coltypmods);
+ READ_NODE_FIELD(colcollations);
break;
+#ifdef PGXC
+ case RTE_REMOTE_DUMMY:
+ /* Nothing to do */
+ break;
+#endif /* PGXC */
default:
elog(ERROR, "unrecognized RTE kind: %d",
(int) local_node->rtekind);
ReadCommonScan(&local_node->scan);
- READ_OID_FIELD(indexid);
+ if (portable_input)
+ READ_RELID_FIELD(indexid);
+ else
+ READ_OID_FIELD(indexid);
+ READ_BOOL_FIELD(isshared);
READ_NODE_FIELD(indexqual);
READ_NODE_FIELD(indexqualorig);
READ_ENUM_FIELD(aggsplit, AggSplit);
READ_INT_FIELD(numCols);
READ_ATTRNUMBER_ARRAY(grpColIdx, local_node->numCols);
+
+#ifdef PGXC
+ token = pg_strtok(&length); /* skip :grpOperators */
+ local_node->grpOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->grpOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->grpOperators[i] = atooid(token);
+ }
+#else
READ_OID_ARRAY(grpOperators, local_node->numCols);
+#endif
+
READ_LONG_FIELD(numGroups);
+ READ_BITMAPSET_FIELD(aggParams);
READ_NODE_FIELD(groupingSets);
READ_NODE_FIELD(chain);
ReadCommonPlan(&local_node->plan);
- READ_OID_FIELD(skewTable);
+ if (portable_input)
+ READ_RELID_FIELD(skewTable);
+ else
+ READ_OID_FIELD(skewTable);
READ_INT_FIELD(skewColumn);
READ_BOOL_FIELD(skewInherit);
- if (portable_input)
- READ_TYPID_FIELD(skewColType);
- else
- READ_OID_FIELD(skewColType);
- READ_INT_FIELD(skewColTypmod);
READ_DONE();
}
READ_NODE_FIELD(paramIds);
READ_INT_FIELD(plan_id);
READ_STRING_FIELD(plan_name);
- READ_OID_FIELD(firstColType);
+ if (portable_input)
+ READ_TYPID_FIELD(firstColType);
+ else
+ READ_OID_FIELD(firstColType);
READ_INT_FIELD(firstColTypmod);
- READ_OID_FIELD(firstColCollation);
+ if (portable_input)
+ READ_COLLID_FIELD(firstColCollation);
+ else
+ READ_OID_FIELD(firstColCollation);
READ_BOOL_FIELD(useHashTable);
READ_BOOL_FIELD(unknownEqFalse);
+ READ_BOOL_FIELD(parallel_safe);
READ_NODE_FIELD(setParam);
READ_NODE_FIELD(parParam);
READ_NODE_FIELD(args);
READ_DONE();
}
+
+/*
+ * _readRemoteSubplan
+ */
+static RemoteSubplan *
+_readRemoteSubplan(void)
+{
+ READ_SCAN_FIELDS(RemoteSubplan);
+
+ READ_CHAR_FIELD(distributionType);
+ READ_INT_FIELD(distributionKey);
+ READ_NODE_FIELD(distributionNodes);
+ READ_NODE_FIELD(distributionRestrict);
+ READ_NODE_FIELD(nodeList);
+ READ_BOOL_FIELD(execOnAll);
+ READ_NODE_FIELD(sort);
+ READ_STRING_FIELD(cursor);
+ READ_INT_FIELD(unique);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readRemoteStmt
+ */
+static RemoteStmt *
+_readRemoteStmt(void)
+{
+ int i;
+ READ_LOCALS(RemoteStmt);
+
+ READ_ENUM_FIELD(commandType, CmdType);
+ READ_BOOL_FIELD(hasReturning);
+ READ_NODE_FIELD(planTree);
+ READ_NODE_FIELD(rtable);
+ READ_NODE_FIELD(resultRelations);
+ READ_NODE_FIELD(subplans);
+ READ_INT_FIELD(nParamExec);
+ READ_INT_FIELD(nParamRemote);
+ if (local_node->nParamRemote > 0)
+ {
+ local_node->remoteparams = (RemoteParam *) palloc(
+ local_node->nParamRemote * sizeof(RemoteParam));
+ for (i = 0; i < local_node->nParamRemote; i++)
+ {
+ RemoteParam *rparam = &(local_node->remoteparams[i]);
+ token = pg_strtok(&length); /* skip :paramkind */
+ token = pg_strtok(&length);
+ rparam->paramkind = (ParamKind) atoi(token);
+
+ token = pg_strtok(&length); /* skip :paramid */
+ token = pg_strtok(&length);
+ rparam->paramid = atoi(token);
+
+ token = pg_strtok(&length); /* skip :paramused */
+ token = pg_strtok(&length);
+ rparam->paramused = atoi(token);
+
+ token = pg_strtok(&length); /* skip :paramtype */
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *typname; /* data type name */
+ token = pg_strtok(&length); /* get nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get typname */
+ typname = nullable_string(token, length);
+ if (typname)
+ rparam->paramtype = get_typname_typid(typname,
+ NSP_OID(nspname));
+ else
+ rparam->paramtype = InvalidOid;
+ }
+ else
+ {
+ token = pg_strtok(&length);
+ rparam->paramtype = atooid(token);
+ }
+ }
+ }
+ else
+ local_node->remoteparams = NULL;
+
+ READ_NODE_FIELD(rowMarks);
+ READ_CHAR_FIELD(distributionType);
+ READ_INT_FIELD(distributionKey);
+ READ_NODE_FIELD(distributionNodes);
+ READ_NODE_FIELD(distributionRestrict);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readSimpleSort
+ */
+static SimpleSort *
+_readSimpleSort(void)
+{
+ int i;
+ READ_LOCALS(SimpleSort);
+
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :sortColIdx */
+ local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->sortColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :sortOperators */
+ local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->sortOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->sortOperators[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :sortCollations */
+ local_node->sortCollations = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *collname; /* collation name */
+ int collencoding; /* collation encoding */
+ /* the token is already read */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collname */
+ collname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collation encoding */
+ collencoding = atoi(token);
+ if (collname)
+ local_node->sortCollations[i] = get_collid(collname,
+ collencoding,
+ NSP_OID(nspname));
+ else
+ local_node->sortCollations[i] = InvalidOid;
+ }
+ else
+ local_node->sortCollations[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :nullsFirst */
+ local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->nullsFirst[i] = strtobool(token);
+ }
+
+ READ_DONE();
+}
+
+
+ /*
+ * _readPartitionBoundSpec
+ */
+ static PartitionBoundSpec *
+ _readPartitionBoundSpec(void)
+ {
+ READ_LOCALS(PartitionBoundSpec);
+
+ READ_CHAR_FIELD(strategy);
+ READ_NODE_FIELD(listdatums);
+ READ_NODE_FIELD(lowerdatums);
+ READ_NODE_FIELD(upperdatums);
+ /* XXX somebody forgot location field; too late to change for v10 */
+ local_node->location = -1;
+
+ READ_DONE();
+ }
+
+ /*
+ * _readPartitionRangeDatum
+ */
+ static PartitionRangeDatum *
+ _readPartitionRangeDatum(void)
+ {
+ READ_LOCALS(PartitionRangeDatum);
+
+ READ_BOOL_FIELD(infinite);
+ READ_NODE_FIELD(value);
+ /* XXX somebody forgot location field; too late to change for v10 */
+ local_node->location = -1;
+
+ READ_DONE();
+ }
+
/*
* parseNodeString
*
return_value = _readAlternativeSubPlan();
else if (MATCH("EXTENSIBLENODE", 14))
return_value = _readExtensibleNode();
+ else if (MATCH("REMOTESUBPLAN", 13))
+ return_value = _readRemoteSubplan();
+ else if (MATCH("REMOTESTMT", 10))
+ return_value = _readRemoteStmt();
+ else if (MATCH("SIMPLESORT", 10))
+ return_value = _readSimpleSort();
+ else if (MATCH("PARTITIONBOUND", 14))
+ return_value = _readPartitionBoundSpec();
+ else if (MATCH("PARTRANGEDATUM", 14))
+ return_value = _readPartitionRangeDatum();
else
{
elog(ERROR, "badly formatted node string \"%.32s\"...", token);
* allpaths.c
* Routines to find possible search paths for processing a query
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* executed only once.
*/
return;
+
+ case RTE_NAMEDTUPLESTORE:
+
+ /*
+ * tuplestore cannot be shared, at least without more
+ * infrastructure to support that.
+ */
+ return;
++
++ case RTE_REMOTE_DUMMY:
++ return;
}
/*
/* Generate a partial append path. */
appendpath = create_append_path(rel, partial_subpaths, NULL,
- parallel_workers);
+ parallel_workers, partitioned_rels);
- add_partial_path(rel, (Path *) appendpath);
+
+ /*
+ * XL: In case we had to re-distribute the child relations, don't
+ * do anything. Otherwise create_gather_path hits an Assert etc.
+ */
+ if (appendpath->path.parallel_safe)
+ add_partial_path(rel, (Path *) appendpath);
}
/*
* values.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
bool enable_material = true;
bool enable_mergejoin = true;
bool enable_hashjoin = true;
+bool enable_fast_query_shipping = true;
+ bool enable_gathermerge = true;
typedef struct
{
return ceil(relation_byte_size(tuples, width) / BLCKSZ);
}
-
+#ifdef XCP
+/*
+ * cost_remote_subplan
+ *	  Fill in the cost and row-count fields of a Path that wraps a subplan
+ *	  executed on remote data nodes.
+ *
+ * input_startup_cost/input_total_cost - costs of the wrapped subplan
+ * tuples, width - estimated row count and average row width of the result
+ * replication - multiplier applied to the network transfer estimate;
+ * presumably the number of nodes the result is shipped to - TODO confirm
+ * against callers
+ *
+ * remote_query_cost and network_byte_cost are NOTE(review): presumably
+ * XL-specific cost GUCs declared elsewhere - not visible in this chunk.
+ */
+void
+cost_remote_subplan(Path *path,
+ Cost input_startup_cost, Cost input_total_cost,
+ double tuples, int width, int replication)
+{
+ /* every remote subplan pays a fixed setup charge before the first row */
+ Cost startup_cost = input_startup_cost + remote_query_cost;
+ Cost run_cost = input_total_cost - input_startup_cost;
+
+ path->rows = tuples;
+
+ /*
+ * Charge 2x cpu_operator_cost per tuple to reflect bookkeeping overhead.
+ */
+ run_cost += 2 * cpu_operator_cost * tuples;
+
+ /*
+ * Estimate cost of sending data over network
+ */
+ run_cost += network_byte_cost * tuples * width * replication;
+
+ path->startup_cost = startup_cost;
+ path->total_cost = startup_cost + run_cost;
+}
+#endif
++
+ /*
+ * Estimate the fraction of the work that each worker will do given the
+ * number of workers budgeted for the path.
+ *
+ * Returns a divisor to apply to the path's total work: the budgeted
+ * worker count, plus the leader's own (diminishing) contribution.
+ */
+ static double
+ get_parallel_divisor(Path *path)
+ {
+ double parallel_divisor = path->parallel_workers;
+ double leader_contribution;
+
+ /*
+ * Early experience with parallel query suggests that when there is only
+ * one worker, the leader often makes a very substantial contribution to
+ * executing the parallel portion of the plan, but as more workers are
+ * added, it does less and less, because it's busy reading tuples from the
+ * workers and doing whatever non-parallel post-processing is needed. By
+ * the time we reach 4 workers, the leader no longer makes a meaningful
+ * contribution. Thus, for now, estimate that the leader spends 30% of
+ * its time servicing each worker, and the remainder executing the
+ * parallel plan.
+ */
+ leader_contribution = 1.0 - (0.3 * path->parallel_workers);
+ /* with 4 or more workers, 1.0 - 0.3*n goes nonpositive: leader adds nothing */
+ if (leader_contribution > 0)
+ parallel_divisor += leader_contribution;
+
+ return parallel_divisor;
+ }
+
+ /*
+ * compute_bitmap_pages
+ *
+ * compute number of pages fetched from heap in bitmap heap scan.
+ *
+ * loop_count is the number of times the scan is expected to be repeated
+ * (e.g. on the inside of a nestloop); values > 1 amortize page fetches
+ * across scans via the Mackert-Lohman formula.
+ *
+ * cost and tuple are optional output parameters (either may be NULL):
+ * *cost receives the total cost of obtaining the bitmap, *tuple the
+ * estimated number of tuples fetched.
+ */
+ double
+ compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual,
+ int loop_count, Cost *cost, double *tuple)
+ {
+ Cost indexTotalCost;
+ Selectivity indexSelectivity;
+ double T;
+ double pages_fetched;
+ double tuples_fetched;
+
+ /*
+ * Fetch total cost of obtaining the bitmap, as well as its total
+ * selectivity.
+ */
+ cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity);
+
+ /*
+ * Estimate number of main-table pages fetched.
+ */
+ tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
+
+ /* T = # pages in table, clamped to at least 1 to avoid zero division */
+ T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
+
+ if (loop_count > 1)
+ {
+ /*
+ * For repeated bitmap scans, scale up the number of tuples fetched in
+ * the Mackert and Lohman formula by the number of scans, so that we
+ * estimate the number of pages fetched by all the scans. Then
+ * pro-rate for one scan.
+ */
+ pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
+ baserel->pages,
+ get_indexpath_pages(bitmapqual),
+ root);
+ pages_fetched /= loop_count;
+ }
+ else
+ {
+ /*
+ * For a single scan, the number of heap pages that need to be fetched
+ * is the same as the Mackert and Lohman formula for the case T <= b
+ * (ie, no re-reads needed).
+ */
+ pages_fetched =
+ (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
+ }
+
+ /* can't fetch more pages than the table has; round partial pages up */
+ if (pages_fetched >= T)
+ pages_fetched = T;
+ else
+ pages_fetched = ceil(pages_fetched);
+
+ if (cost)
+ *cost = indexTotalCost;
+ if (tuple)
+ *tuple = tuples_fetched;
+
+ return pages_fetched;
+ }
* Planning is complete, we just need to convert the selected
* Path into a Plan.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
static Plan *create_append_plan(PlannerInfo *root, AppendPath *best_path);
static Plan *create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path);
static Result *create_result_plan(PlannerInfo *root, ResultPath *best_path);
+#ifdef XCP
+static void adjust_subplan_distribution(PlannerInfo *root, Distribution *pathd,
+ Distribution *subd);
+static RemoteSubplan *create_remotescan_plan(PlannerInfo *root,
+ RemoteSubPath *best_path);
+static char *get_internal_cursor(void);
+#endif
+ static ProjectSet *create_project_set_plan(PlannerInfo *root, ProjectSetPath *best_path);
static Material *create_material_plan(PlannerInfo *root, MaterialPath *best_path,
int flags);
static Plan *create_unique_plan(PlannerInfo *root, UniquePath *best_path,
Index scanrelid, List *functions, bool funcordinality);
static ValuesScan *make_valuesscan(List *qptlist, List *qpqual,
Index scanrelid, List *values_lists);
+ static TableFuncScan *make_tablefuncscan(List *qptlist, List *qpqual,
+ Index scanrelid, TableFunc *tablefunc);
static CteScan *make_ctescan(List *qptlist, List *qpqual,
Index scanrelid, int ctePlanId, int cteParam);
+ static NamedTuplestoreScan *make_namedtuplestorescan(List *qptlist, List *qpqual,
+ Index scanrelid, char *enrname);
static WorkTableScan *make_worktablescan(List *qptlist, List *qpqual,
Index scanrelid, int wtParam);
- static Append *make_append(List *appendplans, List *tlist);
+ static Append *make_append(List *appendplans, List *tlist, List *partitioned_rels);
-static RecursiveUnion *make_recursive_union(List *tlist,
+static RecursiveUnion *make_recursive_union(PlannerInfo *root,
+ List *tlist,
Plan *lefttree,
Plan *righttree,
int wtParam,
List *resultRelations, List *subplans,
List *withCheckOptionLists, List *returningLists,
List *rowMarks, OnConflictExpr *onconflict, int epqParam);
+ static GatherMerge *create_gather_merge_plan(PlannerInfo *root,
+ GatherMergePath *best_path);
+#ifdef XCP
+static int add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll,
+ bool nulls_first,int numCols, AttrNumber *sortColIdx,
+ Oid *sortOperators, Oid *collations, bool *nullsFirst);
+#endif
+
+static RemoteSubplan *find_push_down_plan(Plan *plan, bool force);
/*
* create_plan
case T_Limit:
plan = (Plan *) create_limit_plan(root,
(LimitPath *) best_path,
- flags);
+ flags, 0, 1);
break;
+ case T_GatherMerge:
+ plan = (Plan *) create_gather_merge_plan(root,
+ (GatherMergePath *) best_path);
+ break;
default:
elog(ERROR, "unrecognized node type: %d",
(int) best_path->pathtype);
*/
if (!is_projection_capable_plan(subplan) &&
!tlist_same_exprs(newtlist, subplan->targetlist))
- subplan = inject_projection_plan(subplan, newtlist);
+ subplan = inject_projection_plan(subplan, newtlist,
+ best_path->path.parallel_safe);
else
subplan->targetlist = newtlist;
+#ifdef XCP
+ /*
+ * RemoteSubplan is conditionally projection capable - it is pushing
+ * projection to the data nodes
+ */
+ if (IsA(subplan, RemoteSubplan))
+ subplan->lefttree->targetlist = newtlist;
+#endif
}
/*
plan->plan_rows = 1;
plan->plan_width = mminfo->path->pathtarget->width;
plan->parallel_aware = false;
+ plan->parallel_safe = mminfo->path->parallel_safe;
+ /*
+ * XL: Add a remote subplan, splitting the LIMIT into a remote part
+ * and a local part.
+ *
+ * XXX This should probably happen when constructing the path in
+ * create_minmaxagg_path(), not this late.
+ *
+ * XXX The costing in here is mostly bogus. Not that it'd matter
+ * this late, though.
+ */
+ if (mminfo->path->distribution)
+ {
+ plan = (Plan *) make_remotesubplan(root, plan,
+ NULL,
+ mminfo->path->distribution,
+ mminfo->path->pathkeys);
+
+ plan = (Plan *) make_limit(plan,
+ subparse->limitOffset,
+ subparse->limitCount,
+ 0, 1);
+
+ plan->startup_cost = mminfo->path->startup_cost;
+ plan->total_cost = mminfo->pathcost;
+ plan->plan_rows = 1;
+ plan->plan_width = mminfo->path->pathtarget->width;
+ plan->parallel_aware = false;
+ }
+
/* Convert the plan into an InitPlan in the outer query. */
SS_make_initplan_from_plan(root, subroot, plan, mminfo->param);
}
return node;
}
- tle = tlist_member((Node *) em->em_expr, tlist);
+#ifdef XCP
+/*
+ * make_remotesubplan
+ * Create a RemoteSubplan node to execute subplan on remote nodes.
+ * leftree - the subplan which we want to push down to remote node.
+ * resultDistribution - the distribution of the remote result. May be NULL -
+ * results are coming to the invoking node
+ * execDistribution - determines how source data of the subplan are
+ * distributed, where we should send the subplan and how combine results.
+ * pathkeys - the remote subplan is sorted according to these keys, executor
+ * should perform merge sort of incoming tuples
+ */
+RemoteSubplan *
+make_remotesubplan(PlannerInfo *root,
+ Plan *lefttree,
+ Distribution *resultDistribution,
+ Distribution *execDistribution,
+ List *pathkeys)
+{
+ RemoteSubplan *node = makeNode(RemoteSubplan);
+ Plan *plan = &node->scan.plan;
+ Bitmapset *tmpset;
+ int nodenum;
+
+ /* Sanity checks */
+ Assert(!equal(resultDistribution, execDistribution));
+ Assert(!IsA(lefttree, RemoteSubplan));
+
+ if (resultDistribution)
+ {
+ node->distributionType = resultDistribution->distributionType;
+ node->distributionKey = InvalidAttrNumber;
+ if (resultDistribution->distributionExpr)
+ {
+ ListCell *lc;
+ Expr *expr;
+
+ /* XXX Is that correct to reference a column of different type? */
+ if (IsA(resultDistribution->distributionExpr, RelabelType))
+ expr = ((RelabelType *) resultDistribution->distributionExpr)->arg;
+ else
+ expr = (Expr *) resultDistribution->distributionExpr;
+
+ /* Find distribution expression in the target list */
+ foreach(lc, lefttree->targetlist)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(lc);
+
+ if (equal(tle->expr, expr))
+ {
+ node->distributionKey = tle->resno;
+ break;
+ }
+ }
+
+ if (node->distributionKey == InvalidAttrNumber)
+ {
+ TargetEntry *newtle;
+
+ /* The expression is not found, need to add junk */
+ newtle = makeTargetEntry(expr,
+ list_length(lefttree->targetlist) + 1,
+ NULL,
+ true);
+
+ if (is_projection_capable_plan(lefttree))
+ {
+ /* Ok to modify subplan's target list */
+ lefttree->targetlist = lappend(lefttree->targetlist, newtle);
+ }
+ else
+ {
+ /* Use Result node to calculate expression */
+ List *newtlist = list_copy(lefttree->targetlist);
+ newtlist = lappend(newtlist, newtle);
+ lefttree = (Plan *) make_result(newtlist, NULL, lefttree);
+ }
+
+ node->distributionKey = newtle->resno;
+ }
+ }
+ /*
+ * The distributionNodes describes result distribution
+ */
+ tmpset = bms_copy(resultDistribution->nodes);
+ node->distributionNodes = NIL;
+ while ((nodenum = bms_first_member(tmpset)) >= 0)
+ node->distributionNodes = lappend_int(node->distributionNodes,
+ nodenum);
+ bms_free(tmpset);
+ /*
+ * The distributionRestrict defines the set of nodes where results are
+ * actually shipped. These are the nodes where upper level step
+ * is executed.
+ */
+ if (resultDistribution->restrictNodes)
+ {
+ tmpset = bms_copy(resultDistribution->restrictNodes);
+ node->distributionRestrict = NIL;
+ while ((nodenum = bms_first_member(tmpset)) >= 0)
+ node->distributionRestrict =
+ lappend_int(node->distributionRestrict, nodenum);
+ bms_free(tmpset);
+ }
+ else
+ node->distributionRestrict = list_copy(node->distributionNodes);
+ }
+ else
+ {
+ node->distributionType = LOCATOR_TYPE_NONE;
+ node->distributionKey = InvalidAttrNumber;
+ node->distributionNodes = NIL;
+ }
+
+ /* determine where subplan will be executed */
+ if (execDistribution)
+ {
+ if (execDistribution->restrictNodes)
+ tmpset = bms_copy(execDistribution->restrictNodes);
+ else
+ tmpset = bms_copy(execDistribution->nodes);
+ node->nodeList = NIL;
+ while ((nodenum = bms_first_member(tmpset)) >= 0)
+ node->nodeList = lappend_int(node->nodeList, nodenum);
+ bms_free(tmpset);
+ node->execOnAll = list_length(node->nodeList) == 1 ||
+ !IsLocatorReplicated(execDistribution->distributionType);
+ }
+ else
+ {
+ /*
+ * Prepare a single execution of a replicated subplan. Choose one node
+ * from the execution node list, preferably one that is also a member
+ * of the list of result nodes, so that later all node executors
+ * contact the same node to get tuples.
+ */
+ tmpset = NULL;
+ if (!bms_is_empty(resultDistribution->restrictNodes))
+ tmpset = bms_copy(resultDistribution->restrictNodes);
+ else
+ tmpset = bms_copy(resultDistribution->nodes);
+ /*
+ * If result goes on single node execute subplan locally
+ */
+ if (bms_num_members(tmpset) > 1)
+ {
+ /* get one execution node TODO: load balancing */
+ nodenum = bms_any_member(tmpset);
+ node->nodeList = list_make1_int(nodenum);
+ node->execOnAll = true;
+ }
+ else
+ {
+ node->nodeList = NIL;
+ node->execOnAll = false;
+ }
+ bms_free(tmpset);
+ }
+
+ /* We do not need to merge sort if only one node is yielding tuples */
+ if (pathkeys && node->execOnAll && list_length(node->nodeList) > 1)
+ {
+ List *tlist = lefttree->targetlist;
+ ListCell *i;
+ int numsortkeys;
+ AttrNumber *sortColIdx;
+ Oid *sortOperators;
+ Oid *collations;
+ bool *nullsFirst;
+
+ /*
+ * We will need at most list_length(pathkeys) sort columns; possibly less
+ */
+ numsortkeys = list_length(pathkeys);
+ sortColIdx = (AttrNumber *) palloc(numsortkeys * sizeof(AttrNumber));
+ sortOperators = (Oid *) palloc(numsortkeys * sizeof(Oid));
+ collations = (Oid *) palloc(numsortkeys * sizeof(Oid));
+ nullsFirst = (bool *) palloc(numsortkeys * sizeof(bool));
+
+ numsortkeys = 0;
+
+ foreach(i, pathkeys)
+ {
+ PathKey *pathkey = (PathKey *) lfirst(i);
+ EquivalenceClass *ec = pathkey->pk_eclass;
+ TargetEntry *tle = NULL;
+ Oid pk_datatype = InvalidOid;
+ Oid sortop;
+ ListCell *j;
+
+ if (ec->ec_has_volatile)
+ {
+ /*
+ * If the pathkey's EquivalenceClass is volatile, then it must
+ * have come from an ORDER BY clause, and we have to match it to
+ * that same targetlist entry.
+ */
+ if (ec->ec_sortref == 0) /* can't happen */
+ elog(ERROR, "volatile EquivalenceClass has no sortref");
+ tle = get_sortgroupref_tle(ec->ec_sortref, tlist);
+ Assert(tle);
+ Assert(list_length(ec->ec_members) == 1);
+ pk_datatype = ((EquivalenceMember *) linitial(ec->ec_members))->em_datatype;
+ }
+ else
+ {
+ /*
+ * Otherwise, we can sort by any non-constant expression listed in
+ * the pathkey's EquivalenceClass. For now, we take the first one
+ * that corresponds to an available item in the tlist. If there
+ * isn't any, use the first one that is an expression in the
+ * input's vars. (The non-const restriction only matters if the
+ * EC is below_outer_join; but if it isn't, it won't contain
+ * consts anyway, else we'd have discarded the pathkey as
+ * redundant.)
+ *
+ * XXX if we have a choice, is there any way of figuring out which
+ * might be cheapest to execute? (For example, int4lt is likely
+ * much cheaper to execute than numericlt, but both might appear
+ * in the same equivalence class...) Not clear that we ever will
+ * have an interesting choice in practice, so it may not matter.
+ */
+ foreach(j, ec->ec_members)
+ {
+ EquivalenceMember *em = (EquivalenceMember *) lfirst(j);
+
+ if (em->em_is_const)
+ continue;
+
- tle = tlist_member_ignore_relabel((Node *) em->em_expr, tlist);
++ tle = tlist_member(em->em_expr, tlist);
+ if (tle)
+ {
+ pk_datatype = em->em_datatype;
+ break; /* found expr already in tlist */
+ }
+
+ /*
+ * We can also use it if the pathkey expression is a relabel
+ * of the tlist entry, or vice versa. This is needed for
+ * binary-compatible cases (cf. make_pathkey_from_sortinfo).
+ * We prefer an exact match, though, so we do the basic search
+ * first.
+ */
++ tle = tlist_member_ignore_relabel(em->em_expr, tlist);
+ if (tle)
+ {
+ pk_datatype = em->em_datatype;
+ break; /* found expr already in tlist */
+ }
+ }
+
+ if (!tle)
+ {
+ /* No matching tlist item; look for a computable expression */
+ Expr *sortexpr = NULL;
+
+ foreach(j, ec->ec_members)
+ {
+ EquivalenceMember *em = (EquivalenceMember *) lfirst(j);
+ List *exprvars;
+ ListCell *k;
+
+ if (em->em_is_const)
+ continue;
+ sortexpr = em->em_expr;
+ exprvars = pull_var_clause((Node *) sortexpr,
+ PVC_INCLUDE_AGGREGATES |
+ PVC_INCLUDE_PLACEHOLDERS);
+ foreach(k, exprvars)
+ {
+ if (!tlist_member_ignore_relabel(lfirst(k), tlist))
+ break;
+ }
+ list_free(exprvars);
+ if (!k)
+ {
+ pk_datatype = em->em_datatype;
+ break; /* found usable expression */
+ }
+ }
+ if (!j)
+ elog(ERROR, "could not find pathkey item to sort");
+
+ /*
+ * Do we need to insert a Result node?
+ */
+ if (!is_projection_capable_plan(lefttree))
+ {
+ /* copy needed so we don't modify input's tlist below */
+ tlist = copyObject(tlist);
+ lefttree = (Plan *) make_result(tlist, NULL, lefttree);
+ }
+
+ /*
+ * Add resjunk entry to input's tlist
+ */
+ tle = makeTargetEntry(sortexpr,
+ list_length(tlist) + 1,
+ NULL,
+ true);
+ tlist = lappend(tlist, tle);
+ lefttree->targetlist = tlist; /* just in case NIL before */
+ }
+ }
+
+ /*
+ * Look up the correct sort operator from the PathKey's slightly
+ * abstracted representation.
+ */
+ sortop = get_opfamily_member(pathkey->pk_opfamily,
+ pk_datatype,
+ pk_datatype,
+ pathkey->pk_strategy);
+ if (!OidIsValid(sortop)) /* should not happen */
+ elog(ERROR, "could not find member %d(%u,%u) of opfamily %u",
+ pathkey->pk_strategy, pk_datatype, pk_datatype,
+ pathkey->pk_opfamily);
+
+ /*
+ * The column might already be selected as a sort key, if the pathkeys
+ * contain duplicate entries. (This can happen in scenarios where
+ * multiple mergejoinable clauses mention the same var, for example.)
+ * So enter it only once in the sort arrays.
+ */
+ numsortkeys = add_sort_column(tle->resno,
+ sortop,
+ pathkey->pk_eclass->ec_collation,
+ pathkey->pk_nulls_first,
+ numsortkeys,
+ sortColIdx, sortOperators,
+ collations, nullsFirst);
+ }
+ Assert(numsortkeys > 0);
+
+ node->sort = makeNode(SimpleSort);
+ node->sort->numCols = numsortkeys;
+ node->sort->sortColIdx = sortColIdx;
+ node->sort->sortOperators = sortOperators;
+ node->sort->sortCollations = collations;
+ node->sort->nullsFirst = nullsFirst;
+ }
+
+ plan->qual = NIL;
+ plan->targetlist = lefttree->targetlist;
+ plan->lefttree = lefttree;
+ plan->righttree = NULL;
+ copy_plan_costsize(plan, lefttree);
+
+ node->cursor = get_internal_cursor();
+ node->unique = 0;
+ return node;
+}
+#endif /* XCP */
+
+
ForeignScan *
make_foreignscan(List *qptlist,
List *qpqual,
case T_MergeAppend:
case T_RecursiveUnion:
return false;
+#ifdef XCP
+ /*
+ * A remote subplan may push down the projection to the data nodes if
+ * it does not perform a merge sort
+ */
+ case T_RemoteSubplan:
+ return ((RemoteSubplan *) plan)->sort == NULL &&
+ is_projection_capable_plan(plan->lefttree);
+#endif
+ case T_ProjectSet:
+
+ /*
+ * Although ProjectSet certainly projects, say "no" because we
+ * don't want the planner to randomly replace its tlist with
+ * something else; the SRFs have to stay at top level. This might
+ * get relaxed later.
+ */
+ return false;
default:
break;
}
* scan all the rows anyway.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* planner.c
* The query optimizer external interface.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
static PathTarget *make_sort_input_target(PlannerInfo *root,
PathTarget *final_target,
bool *have_postponed_srfs);
+static bool equal_distributions(PlannerInfo *root, Distribution *dst1,
+ Distribution *dst2);
+static bool grouping_distribution_match(PlannerInfo *root, Query *parse,
+ Path *path, List *clauses);
+static bool groupingsets_distribution_match(PlannerInfo *root, Query *parse,
+ Path *path);
+static Path *adjust_path_distribution(PlannerInfo *root, Query *parse,
+ Path *path);
+static bool can_push_down_grouping(PlannerInfo *root, Query *parse, Path *path);
+static bool can_push_down_window(PlannerInfo *root, Path *path);
+ static void adjust_paths_for_srfs(PlannerInfo *root, RelOptInfo *rel,
+ List *targets, List *targets_contain_srfs);
+
/*****************************************************************************
*
ListCell *lp,
*lr;
-
- /* Cursor options may come from caller or from DECLARE CURSOR stmt */
- if (parse->utilityStmt &&
- IsA(parse->utilityStmt, DeclareCursorStmt))
- cursorOptions |= ((DeclareCursorStmt *) parse->utilityStmt)->options;
-
+#ifdef XCP
+ if (IS_PGXC_LOCAL_COORDINATOR && parse->utilityStmt &&
+ IsA(parse->utilityStmt, RemoteQuery))
+ return pgxc_direct_planner(parse, cursorOptions, boundParams);
+#endif
/*
* Set up global state for this planner invocation. This data is needed
* across all levels of sub-Query that might exist in the given command,
result->rowMarks = glob->finalrowmarks;
result->relationOids = glob->relationOids;
result->invalItems = glob->invalItems;
+#ifdef XCP
+ result->distributionType = LOCATOR_TYPE_NONE;
+ result->distributionKey = InvalidAttrNumber;
+ result->distributionNodes = NULL;
+#endif
result->nParamExec = glob->nParamExec;
+ /* utilityStmt should be null, but we might as well copy it */
+ result->utilityStmt = parse->utilityStmt;
+ result->stmt_location = parse->stmt_location;
+ result->stmt_len = parse->stmt_len;
return result;
}
memset(root->upper_targets, 0, sizeof(root->upper_targets));
root->processed_tlist = NIL;
root->grouping_map = NULL;
+ root->recursiveOk = true;
+
root->minmax_aggs = NIL;
+ root->qual_security_level = 0;
root->hasInheritedTarget = false;
root->hasRecursion = hasRecursion;
if (hasRecursion)
*/
CheckSelectLocking(parse, ((RowMarkClause *)
linitial(parse->rowMarks))->strength);
- Bitmapset *baserels = get_base_rel_indexes((Node *) parse->jointree);
+
+ if (parse->jointree)
+ {
++ Bitmapset *baserels = get_relids_in_jointree((Node *)
++ parse->jointree, false);
+ int x, num_rels = 0;
+ bool dist_found = false;
+
+ while ((x = bms_first_member(baserels)) >= 0)
+ {
+ RangeTblEntry *rte = rt_fetch(x, parse->rtable);
+ RelationLocInfo *locinfo = NULL;
+ if (OidIsValid(rte->relid))
+ locinfo = GetRelationLocInfo(rte->relid);
+ if (locinfo && !IsRelationReplicated(locinfo))
+ dist_found = true;
+ num_rels++;
+ }
+
+ if (dist_found && num_rels > 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("%s is not allowed with joins",
+ LCS_asString(((RowMarkClause *)
+ linitial(parse->rowMarks))->strength))));
+ }
}
else
{
if (can_hash)
{
- hashaggtablesize = estimate_hashagg_tablesize(cheapest_path,
- agg_costs,
- dNumGroups);
-
- /*
- * Provided that the estimated size of the hashtable does not exceed
- * work_mem, we'll generate a HashAgg Path, although if we were unable
- * to sort above, then we'd better generate a Path, so that we at
- * least have one.
- */
- if (hashaggtablesize < work_mem * 1024L ||
- grouped_rel->pathlist == NIL)
+ if (parse->groupingSets)
{
- /* Don't mess with the cheapest path directly. */
- Path *path = cheapest_path;
-
/*
- * If the grouping can't be fully pushed down, we'll push down the
- * first phase of the aggregate, and redistribute only the partial
- * results.
- *
- * If if can be pushed down, disable construction of complex
- * distributed paths.
+ * Try for a hash-only groupingsets path over unsorted input.
*/
- if (! can_push_down_grouping(root, parse, path))
- path = create_remotesubplan_path(root, path, NULL);
- else
- try_distributed_aggregation = false;
+ consider_groupingsets_paths(root, grouped_rel,
+ cheapest_path, false, true, target,
+ gd, agg_costs, dNumGroups);
+ }
+ else
+ {
+ hashaggtablesize = estimate_hashagg_tablesize(cheapest_path,
+ agg_costs,
+ dNumGroups);
/*
- * We just need an Agg over the cheapest-total input path, since
- * input order won't matter.
+ * Provided that the estimated size of the hashtable does not
+ * exceed work_mem, we'll generate a HashAgg Path, although if we
+ * were unable to sort above, then we'd better generate a Path, so
+ * that we at least have one.
*/
- add_path(grouped_rel, (Path *)
- create_agg_path(root, grouped_rel,
- path,
- target,
- AGG_HASHED,
- AGGSPLIT_SIMPLE,
- parse->groupClause,
- (List *) parse->havingQual,
- agg_costs,
- dNumGroups));
+ if (hashaggtablesize < work_mem * 1024L ||
+ grouped_rel->pathlist == NIL)
+ {
++ /* Don't mess with the cheapest path directly. */
++ Path *path = cheapest_path;
++
++ /*
++ * If the grouping can't be fully pushed down, we'll push down the
++ * first phase of the aggregate, and redistribute only the partial
++ * results.
++ *
++ * If if can be pushed down, disable construction of complex
++ * distributed paths.
++ */
++ if (! can_push_down_grouping(root, parse, path))
++ path = create_remotesubplan_path(root, path, NULL);
++ else
++ try_distributed_aggregation = false;
++
+ /*
+ * We just need an Agg over the cheapest-total input path,
+ * since input order won't matter.
+ */
+ add_path(grouped_rel, (Path *)
+ create_agg_path(root, grouped_rel,
- cheapest_path,
++ path,
+ target,
+ AGG_HASHED,
+ AGGSPLIT_SIMPLE,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ agg_costs,
+ dNumGroups));
+ }
}
/*
}
}
- /* Give a helpful error if we failed to find any implementation */
- if (grouped_rel->pathlist == NIL)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("could not implement GROUP BY"),
- errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
-
- /*
- * If there is an FDW that's responsible for all baserels of the query,
- * let it consider adding ForeignPaths.
- */
- if (grouped_rel->fdwroutine &&
- grouped_rel->fdwroutine->GetForeignUpperPaths)
- grouped_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_GROUP_AGG,
- input_rel, grouped_rel);
-
- /* Let extensions possibly add some more paths */
- if (create_upper_paths_hook)
- (*create_upper_paths_hook) (root, UPPERREL_GROUP_AGG,
- input_rel, grouped_rel);
-
- /* Now choose the best path(s) */
- set_cheapest(grouped_rel);
-
- /*
- * We've been using the partial pathlist for the grouped relation to hold
- * partially aggregated paths, but that's actually a little bit bogus
- * because it's unsafe for later planning stages -- like ordered_rel ---
- * to get the idea that they can use these partial paths as if they didn't
- * need a FinalizeAggregate step. Zap the partial pathlist at this stage
- * so we don't get confused.
- */
- grouped_rel->partial_pathlist = NIL;
-
- return grouped_rel;
-}
-
-
-/*
- * For a given input path, consider the possible ways of doing grouping sets on
- * it, by combinations of hashing and sorting. This can be called multiple
- * times, so it's important that it not scribble on input. No result is
- * returned, but any generated paths are added to grouped_rel.
- */
-static void
-consider_groupingsets_paths(PlannerInfo *root,
- RelOptInfo *grouped_rel,
- Path *path,
- bool is_sorted,
- bool can_hash,
- PathTarget *target,
- grouping_sets_data *gd,
- const AggClauseCosts *agg_costs,
- double dNumGroups)
-{
- Query *parse = root->parse;
+ /* Generate XL aggregate paths, with distributed 2-phase aggregation. */
/*
- * If we're not being offered sorted input, then only consider plans that
- * can be done entirely by hashing.
+ * If there were no partial paths, we did not initialize any of the
+ * partial paths above. If that's the case, initialize here.
*
- * We can hash everything if it looks like it'll fit in work_mem. But if
- * the input is actually sorted despite not being advertised as such, we
- * prefer to make use of that in order to use less memory.
+ * XXX The reason why the initialization block at the beginning is not
+ * simply performed unconditionally is that we may skip it if we've been
+ * successful in fully pushing down any of the aggregates, and entirely
+ * skip generating the XL paths.
*
- * If none of the grouping sets are sortable, then ignore the work_mem
- * limit and generate a path anyway, since otherwise we'll just fail.
+ * XXX Can we simply use the same estimates as regular partial aggregates,
+ * or do we need to invent something else? It might be a better idea to
+ * use estimates for the whole result here (e.g. total number of groups)
+ * instead of the partial ones. Underestimates often have more severe
+ * consequences (e.g. OOM with HashAggregate) than overestimates, so this
+ * seems like a more defensive approach.
+ *
+ * XXX After thinking a bit more about the estimation, it may depend on
+ * pushdown - if the aggregate is fully pushed down (as above), we can
+ * probably use dNumGroups/numberOfNodes as a cardinality estimate, as
+ * we know the per-node groupings won't overlap. But here we need to be
+ * more careful.
*/
- if (!is_sorted)
+ if (try_distributed_aggregation)
{
- List *new_rollups = NIL;
- RollupData *unhashed_rollup = NULL;
- List *sets_data;
- List *empty_sets_data = NIL;
- List *empty_sets = NIL;
- ListCell *lc;
- ListCell *l_start = list_head(gd->rollups);
- AggStrategy strat = AGG_HASHED;
- Size hashsize;
- double exclude_groups = 0.0;
-
- Assert(can_hash);
-
- if (pathkeys_contained_in(root->group_pathkeys, path->pathkeys))
- {
- unhashed_rollup = lfirst(l_start);
- exclude_groups = unhashed_rollup->numGroups;
- l_start = lnext(l_start);
- }
+ partial_grouping_target = make_partial_grouping_target(root, target);
- hashsize = estimate_hashagg_tablesize(path,
- agg_costs,
- dNumGroups - exclude_groups);
+ /* Estimate number of partial groups. */
+ dNumPartialGroups = get_number_of_groups(root,
+ cheapest_path->rows,
- NIL,
- NIL);
++ gd);
/*
- * gd->rollups is empty if we have only unsortable columns to work
- * with. Override work_mem in that case; otherwise, we'll rely on the
- * sorted-input case to generate usable mixed paths.
+ * Collect statistics about aggregates for estimating costs of
+ * performing aggregation in parallel.
*/
- if (hashsize > work_mem * 1024L && gd->rollups)
- return; /* nope, won't fit */
+ MemSet(&agg_partial_costs, 0, sizeof(AggClauseCosts));
+ MemSet(&agg_final_costs, 0, sizeof(AggClauseCosts));
+ if (parse->hasAggs)
+ {
+ /* partial phase */
+ get_agg_clause_costs(root, (Node *) partial_grouping_target->exprs,
+ AGGSPLIT_INITIAL_SERIAL,
+ &agg_partial_costs);
+
+ /* final phase */
+ get_agg_clause_costs(root, (Node *) target->exprs,
+ AGGSPLIT_FINAL_DESERIAL,
+ &agg_final_costs);
+ get_agg_clause_costs(root, parse->havingQual,
+ AGGSPLIT_FINAL_DESERIAL,
+ &agg_final_costs);
+ }
+ }
+ /* Build final XL grouping paths */
+ if (can_sort && try_distributed_aggregation)
+ {
/*
- * We need to burst the existing rollups list into individual grouping
- * sets and recompute a groupClause for each set.
+ * Use any available suitably-sorted path as input, and also consider
+ * sorting the cheapest-total path.
*/
- sets_data = list_copy(gd->unsortable_sets);
-
- for_each_cell(lc, l_start)
+ foreach(lc, input_rel->pathlist)
{
- RollupData *rollup = lfirst(lc);
+ Path *path = (Path *) lfirst(lc);
+ bool is_sorted;
+
+ is_sorted = pathkeys_contained_in(root->group_pathkeys,
- path->pathkeys);
++ path->pathkeys);
+
+ /*
+ * XL: Can it happen that the cheapest path can't be pushed down,
+ * while some other path could be? Perhaps we should move the check
+ * if a path can be pushed down up, and add another OR condition
+ * to consider all paths that can be pushed down?
+ *
+ * if (path == cheapest_path || is_sorted || can_push_down)
+ */
+ if (path == cheapest_path || is_sorted)
+ {
+ /*
+ * We can't really beat paths that we managed to fully push
+ * down above, so we can skip them entirely.
+ *
+ * XXX Not constructing any paths, so we can do this before
+ * adding the Sort path.
+ */
+ if (can_push_down_grouping(root, parse, path))
+ continue;
+
+ /* Sort the cheapest-total path if it isn't already sorted */
+ if (!is_sorted)
+ path = (Path *) create_sort_path(root,
+ grouped_rel,
+ path,
+ root->group_pathkeys,
+ -1.0);
+
+ /* Now decide what to stick atop it */
+ if (parse->groupingSets)
+ {
+ /*
+ * TODO 2-phase aggregation for grouping sets paths not
+ * supported yet, but this is the place where such paths
+ * should be constructed.
+ */
+ }
+ else if (parse->hasAggs)
+ {
+ /*
+ * We have aggregation, possibly with plain GROUP BY. Make
+ * an AggPath.
+ */
+
+ path = (Path *) create_agg_path(root,
+ grouped_rel,
+ path,
+ partial_grouping_target,
+ parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+ AGGSPLIT_INITIAL_SERIAL,
+ parse->groupClause,
+ NIL,
+ &agg_partial_costs,
+ dNumPartialGroups);
+
+ path = create_remotesubplan_path(root, path, NULL);
+
+ /*
+ * We generate two paths, differing in the second phase
+ * implementation (sort and hash).
+ */
+
+ add_path(grouped_rel, (Path *)
+ create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+ AGGSPLIT_FINAL_DESERIAL,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups));
+
+ if (can_hash)
+ add_path(grouped_rel, (Path *)
+ create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ AGG_HASHED,
+ AGGSPLIT_FINAL_DESERIAL,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups));
+ }
+ else if (parse->groupClause)
+ {
+ /*
+ * We have GROUP BY without aggregation or grouping sets.
+ * Make a GroupPath.
+ */
+ path = (Path *) create_group_path(root,
+ grouped_rel,
+ path,
+ partial_grouping_target,
+ parse->groupClause,
+ NIL,
+ dNumPartialGroups);
+
+ path = create_remotesubplan_path(root, path, NULL);
+
+ add_path(grouped_rel, (Path *)
+ create_group_path(root,
+ grouped_rel,
+ path,
+ target,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ dNumGroups));
+
+ }
+ else
+ {
+ /* Other cases should have been handled above */
+ Assert(false);
+ }
+ }
+ }
+
+ /*
+ * So far we've only constructed simple paths combining partial and
+ * distributed aggregate paths, i.e.
+ *
+ * Finalize -> RemoteSubplan -> Gather -> Partial
+ *
+ * It may however be more efficient to reduce the amount of data
+ * transferred over the network by generating paths like this:
+ *
+ * Finalize -> RemoteSubplan -> Combine -> Gather -> Partial
+ *
+ * where Combine deserializes the aggstates, combines them and then
+ * serializes them again. This AggSplit case is not defined yet, but
+ * should not be hard to add.
+ *
+ * We only want to do this for partial paths with RemoteSubplan on
+ * top of them, i.e. when the whole aggregate was not pushed down.
+ *
+ * XXX Gather output is never sorted, so we can only bother with the
+ * cheapest partial path here (just like above).
+ *
+ * XXX This only generates paths with both the combine and finalize
+ * steps using the same implementation (sort+sort or hash+hash). Maybe
+ * we should relax that, and allow hash+sort or sort+hash?
+ *
+ * XXX grouped_rel->partial_pathlist may be empty here, if the planner
+ * did not consider parallel paths (try_parallel_aggregation=false).
+ * But that's OK - we only want to put the combine on top of a Gather,
+ * so if there's none we're done.
+ *
+ * XXX The "combine" paths seem not to be picked up, most likely
+ * because of bad costing, not reflecting the reduction in number of
+ * rows transferred over the network.
+ */
+ if (grouped_rel->partial_pathlist)
+ {
+ Path *path = (Path *) linitial(grouped_rel->partial_pathlist);
+ double total_groups = path->rows * path->parallel_workers;
+
+ /* We don't care about paths that were fully pushed down. */
+ if (! can_push_down_grouping(root, parse, path))
+ {
+ path = (Path *) create_gather_path(root,
+ grouped_rel,
+ path,
+ partial_grouping_target,
+ NULL,
+ &total_groups);
+
+ /*
+ * Gather is always unsorted, so we'll need to sort, unless
+ * there's no GROUP BY clause, in which case there will only be a
+ * single group.
+ */
+ if (parse->groupClause)
+ path = (Path *) create_sort_path(root,
+ grouped_rel,
+ path,
+ root->group_pathkeys,
+ -1.0);
+
+ /* Intermediate combine phase. */
+ if (parse->hasAggs)
+ {
+ path = (Path *) create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+ AGGSPLIT_COMBINE,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups);
+
+ path = create_remotesubplan_path(root, path, NULL);
+
+ add_path(grouped_rel, (Path *)
+ create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+ AGGSPLIT_FINAL_DESERIAL,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups));
+ }
+ else
+ {
+ path = (Path *) create_group_path(root,
+ grouped_rel,
+ path,
+ target,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ dNumGroups);
+
+ path = create_remotesubplan_path(root, path, NULL);
+
+ add_path(grouped_rel, (Path *)
+ create_group_path(root,
+ grouped_rel,
+ path,
+ target,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ dNumGroups));
+ }
+ }
+ }
+ }
+
+ if (can_hash && try_distributed_aggregation)
+ {
+ hashaggtablesize = estimate_hashagg_tablesize(cheapest_path,
+ agg_costs,
+ dNumGroups);
+
+ /*
+ * Provided that the estimated size of the hashtable does not exceed
+ * work_mem, we'll generate a HashAgg Path, although if we were unable
+ * to sort above, then we'd better generate a Path, so that we at
+ * least have one.
+ */
+ if (hashaggtablesize < work_mem * 1024L ||
+ grouped_rel->pathlist == NIL)
+ {
+ /* If the whole aggregate was pushed down, we're done. */
+ if (! can_push_down_grouping(root, parse, cheapest_path))
+ {
+ Path *path, *agg_path;
+
+ path = (Path *) create_agg_path(root,
+ grouped_rel,
+ cheapest_path,
+ partial_grouping_target,
+ AGG_HASHED,
+ AGGSPLIT_INITIAL_SERIAL,
+ parse->groupClause,
+ NIL,
+ &agg_partial_costs,
+ dNumPartialGroups);
+
+ /* keep partially aggregated path for the can_sort branch */
+ agg_path = path;
+
+ path = create_remotesubplan_path(root, path, NULL);
+
+ /* Generate paths with both hash and sort second phase. */
+
+ add_path(grouped_rel, (Path *)
+ create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ AGG_HASHED,
+ AGGSPLIT_FINAL_DESERIAL,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups));
+
+ if (can_sort)
+ {
+ /*
+ * AGG_HASHED aggregate paths are always unsorted, so add
+ * a Sort node for the final AGG_SORTED step.
+ */
+ path = (Path *) create_sort_path(root,
+ grouped_rel,
+ agg_path,
+ root->group_pathkeys,
+ -1.0);
+
+ path = create_remotesubplan_path(root, path, NULL);
+
+ add_path(grouped_rel, (Path *)
+ create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+ AGGSPLIT_FINAL_DESERIAL,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups));
+ }
+ }
+ }
+
+ /*
+ * Generate a path with the extra combine phase.
+ *
+ * XXX See the comments in the block generating combine paths for
+ * the sorted case.
+ */
+ if (grouped_rel->partial_pathlist)
+ {
+ Path *path = (Path *) linitial(grouped_rel->partial_pathlist);
+
+ hashaggtablesize = estimate_hashagg_tablesize(path,
+ &agg_final_costs,
+ dNumGroups);
+
+ /*
+ * Ignore the path if the hash table won't fit into memory, or
+ * if we managed to push down the whole aggregation.
+ */
+ if ((hashaggtablesize < work_mem * 1024L) &&
+ (! can_push_down_grouping(root, parse, path)))
+ {
+ double total_groups = path->rows * path->parallel_workers;
+
+ path = (Path *) create_gather_path(root,
+ grouped_rel,
+ path,
+ partial_grouping_target,
+ NULL,
+ &total_groups);
+
+ path = (Path *) create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ AGG_HASHED,
+ AGGSPLIT_COMBINE,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups);
+
+ /* We know the full push down can't happen, so redistribute. */
+ path = create_remotesubplan_path(root, path, NULL);
+
+ add_path(grouped_rel, (Path *)
+ create_agg_path(root,
+ grouped_rel,
+ path,
+ target,
+ AGG_HASHED,
+ AGGSPLIT_FINAL_DESERIAL,
+ parse->groupClause,
+ (List *) parse->havingQual,
+ &agg_final_costs,
+ dNumGroups));
+ }
+ }
+ }
+
+ /* Give a helpful error if we failed to find any implementation */
+ if (grouped_rel->pathlist == NIL)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("could not implement GROUP BY"),
+ errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
+
+ /*
+ * If there is an FDW that's responsible for all baserels of the query,
+ * let it consider adding ForeignPaths.
+ */
+ if (grouped_rel->fdwroutine &&
+ grouped_rel->fdwroutine->GetForeignUpperPaths)
+ grouped_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_GROUP_AGG,
+ input_rel, grouped_rel);
+
+ /* Let extensions possibly add some more paths */
+ if (create_upper_paths_hook)
+ (*create_upper_paths_hook) (root, UPPERREL_GROUP_AGG,
+ input_rel, grouped_rel);
+
+ /* Now choose the best path(s) */
+ set_cheapest(grouped_rel);
++ /*
++ * We've been using the partial pathlist for the grouped relation to hold
++ * partially aggregated paths, but that's actually a little bit bogus
++ * because it's unsafe for later planning stages -- like ordered_rel --
++ * to get the idea that they can use these partial paths as if they didn't
++ * need a FinalizeAggregate step. Zap the partial pathlist at this stage
++ * so we don't get confused.
++ */
++ grouped_rel->partial_pathlist = NIL;
+
+ return grouped_rel;
+}
+
++
++/*
++ * For a given input path, consider the possible ways of doing grouping sets on
++ * it, by combinations of hashing and sorting. This can be called multiple
++ * times, so it's important that it not scribble on input. No result is
++ * returned, but any generated paths are added to grouped_rel.
++ */
++static void
++consider_groupingsets_paths(PlannerInfo *root,
++ RelOptInfo *grouped_rel,
++ Path *path,
++ bool is_sorted,
++ bool can_hash,
++ PathTarget *target,
++ grouping_sets_data *gd,
++ const AggClauseCosts *agg_costs,
++ double dNumGroups)
++{
++ Query *parse = root->parse;
++
++ /*
++ * If we're not being offered sorted input, then only consider plans that
++ * can be done entirely by hashing.
++ *
++ * We can hash everything if it looks like it'll fit in work_mem. But if
++ * the input is actually sorted despite not being advertised as such, we
++ * prefer to make use of that in order to use less memory.
++ *
++ * If none of the grouping sets are sortable, then ignore the work_mem
++ * limit and generate a path anyway, since otherwise we'll just fail.
++ */
++ if (!is_sorted)
++ {
++ List *new_rollups = NIL;
++ RollupData *unhashed_rollup = NULL;
++ List *sets_data;
++ List *empty_sets_data = NIL;
++ List *empty_sets = NIL;
++ ListCell *lc;
++ ListCell *l_start = list_head(gd->rollups);
++ AggStrategy strat = AGG_HASHED;
++ Size hashsize;
++ double exclude_groups = 0.0;
++
++ Assert(can_hash);
++
++ if (pathkeys_contained_in(root->group_pathkeys, path->pathkeys))
++ {
++ unhashed_rollup = lfirst(l_start);
++ exclude_groups = unhashed_rollup->numGroups;
++ l_start = lnext(l_start);
++ }
++
++ hashsize = estimate_hashagg_tablesize(path,
++ agg_costs,
++ dNumGroups - exclude_groups);
++
++ /*
++ * gd->rollups is empty if we have only unsortable columns to work
++ * with. Override work_mem in that case; otherwise, we'll rely on the
++ * sorted-input case to generate usable mixed paths.
++ */
++ if (hashsize > work_mem * 1024L && gd->rollups)
++ return; /* nope, won't fit */
++
++ /*
++ * We need to burst the existing rollups list into individual grouping
++ * sets and recompute a groupClause for each set.
++ */
++ sets_data = list_copy(gd->unsortable_sets);
++
++ for_each_cell(lc, l_start)
++ {
++ RollupData *rollup = lfirst(lc);
+
+ /*
+ * If we find an unhashable rollup that's not been skipped by the
+ * "actually sorted" check above, we can't cope; we'd need sorted
+ * input (with a different sort order) but we can't get that here.
+ * So bail out; we'll get a valid path from the is_sorted case
+ * instead.
+ *
+ * The mere presence of empty grouping sets doesn't make a rollup
+ * unhashable (see preprocess_grouping_sets), we handle those
+ * specially below.
+ */
+ if (!rollup->hashable)
+ return;
+ else
+ sets_data = list_concat(sets_data, list_copy(rollup->gsets_data));
+ }
+ foreach(lc, sets_data)
+ {
+ GroupingSetData *gs = lfirst(lc);
+ List *gset = gs->set;
+ RollupData *rollup;
+
+ if (gset == NIL)
+ {
+ /* Empty grouping sets can't be hashed. */
+ empty_sets_data = lappend(empty_sets_data, gs);
+ empty_sets = lappend(empty_sets, NIL);
+ }
+ else
+ {
+ rollup = makeNode(RollupData);
+
+ rollup->groupClause = preprocess_groupclause(root, gset);
+ rollup->gsets_data = list_make1(gs);
+ rollup->gsets = remap_to_groupclause_idx(rollup->groupClause,
+ rollup->gsets_data,
+ gd->tleref_to_colnum_map);
+ rollup->numGroups = gs->numGroups;
+ rollup->hashable = true;
+ rollup->is_hashed = true;
+ new_rollups = lappend(new_rollups, rollup);
+ }
+ }
+
+ /*
+ * If we didn't find anything nonempty to hash, then bail. We'll
+ * generate a path from the is_sorted case.
+ */
+ if (new_rollups == NIL)
+ return;
+
+ /*
+ * If there were empty grouping sets they should have been in the
+ * first rollup.
+ */
+ Assert(!unhashed_rollup || !empty_sets);
+
+ if (unhashed_rollup)
+ {
+ new_rollups = lappend(new_rollups, unhashed_rollup);
+ strat = AGG_MIXED;
+ }
+ else if (empty_sets)
+ {
+ RollupData *rollup = makeNode(RollupData);
+
+ rollup->groupClause = NIL;
+ rollup->gsets_data = empty_sets_data;
+ rollup->gsets = empty_sets;
+ rollup->numGroups = list_length(empty_sets);
+ rollup->hashable = false;
+ rollup->is_hashed = false;
+ new_rollups = lappend(new_rollups, rollup);
+ strat = AGG_MIXED;
+ }
+
+ add_path(grouped_rel, (Path *)
+ create_groupingsets_path(root,
+ grouped_rel,
+ path,
+ target,
+ (List *) parse->havingQual,
+ strat,
+ new_rollups,
+ agg_costs,
+ dNumGroups));
+ return;
+ }
+
+ /*
+ * If we have sorted input but nothing we can do with it, bail.
+ */
+ if (list_length(gd->rollups) == 0)
+ return;
+
+ /*
+ * Given sorted input, we try and make two paths: one sorted and one mixed
+ * sort/hash. (We need to try both because hashagg might be disabled, or
+ * some columns might not be sortable.)
+ *
+ * can_hash is passed in as false if some obstacle elsewhere (such as
+ * ordered aggs) means that we shouldn't consider hashing at all.
+ */
+ if (can_hash && gd->any_hashable)
+ {
+ List *rollups = NIL;
+ List *hash_sets = list_copy(gd->unsortable_sets);
+ double availspace = (work_mem * 1024.0);
+ ListCell *lc;
+
+ /*
+ * Account first for space needed for groups we can't sort at all.
+ */
+ availspace -= (double) estimate_hashagg_tablesize(path,
+ agg_costs,
+ gd->dNumHashGroups);
+
+ if (availspace > 0 && list_length(gd->rollups) > 1)
+ {
+ double scale;
+ int num_rollups = list_length(gd->rollups);
+ int k_capacity;
+ int *k_weights = palloc(num_rollups * sizeof(int));
+ Bitmapset *hash_items = NULL;
+ int i;
+
+ /*
+ * We treat this as a knapsack problem: the knapsack capacity
+ * represents work_mem, the item weights are the estimated memory
+ * usage of the hashtables needed to implement a single rollup,
+ * and we really ought to use the cost saving as the item value;
+ * however, currently the costs assigned to sort nodes don't
+ * reflect the comparison costs well, and so we treat all items as
+ * of equal value (each rollup we hash instead saves us one sort).
+ *
+ * To use the discrete knapsack, we need to scale the values to a
+ * reasonably small bounded range. We choose to allow a 5% error
+ * margin; we have no more than 4096 rollups in the worst possible
+ * case, which with a 5% error margin will require a bit over 42MB
+ * of workspace. (Anyone wanting to plan queries that complex had
+ * better have the memory for it. In more reasonable cases, with
+ * no more than a couple of dozen rollups, the memory usage will
+ * be negligible.)
+ *
+ * k_capacity is naturally bounded, but we clamp the values for
+ * scale and weight (below) to avoid overflows or underflows (or
+ * uselessly trying to use a scale factor less than 1 byte).
+ */
+ scale = Max(availspace / (20.0 * num_rollups), 1.0);
+ k_capacity = (int) floor(availspace / scale);
+
+ /*
+ * We leave the first rollup out of consideration since it's the
+ * one that matches the input sort order. We assign indexes "i"
+ * to only those entries considered for hashing; the second loop,
+ * below, must use the same condition.
+ */
+ i = 0;
+ for_each_cell(lc, lnext(list_head(gd->rollups)))
+ {
+ RollupData *rollup = lfirst(lc);
+
+ if (rollup->hashable)
+ {
+ double sz = estimate_hashagg_tablesize(path,
+ agg_costs,
+ rollup->numGroups);
+
+ /*
+ * If sz is enormous, but work_mem (and hence scale) is
+ * small, avoid integer overflow here.
+ */
+ k_weights[i] = (int) Min(floor(sz / scale),
+ k_capacity + 1.0);
+ ++i;
+ }
+ }
+
+ /*
+ * Apply knapsack algorithm; compute the set of items which
+ * maximizes the value stored (in this case the number of sorts
+ * saved) while keeping the total size (approximately) within
+ * capacity.
+ */
+ if (i > 0)
+ hash_items = DiscreteKnapsack(k_capacity, i, k_weights, NULL);
+
+ if (!bms_is_empty(hash_items))
+ {
+ rollups = list_make1(linitial(gd->rollups));
+
+ i = 0;
+ for_each_cell(lc, lnext(list_head(gd->rollups)))
+ {
+ RollupData *rollup = lfirst(lc);
+
+ if (rollup->hashable)
+ {
+ if (bms_is_member(i, hash_items))
+ hash_sets = list_concat(hash_sets,
+ list_copy(rollup->gsets_data));
+ else
+ rollups = lappend(rollups, rollup);
+ ++i;
+ }
+ else
+ rollups = lappend(rollups, rollup);
+ }
+ }
+ }
+
+ if (!rollups && hash_sets)
+ rollups = list_copy(gd->rollups);
+
+ foreach(lc, hash_sets)
+ {
+ GroupingSetData *gs = lfirst(lc);
+ RollupData *rollup = makeNode(RollupData);
+
+ Assert(gs->set != NIL);
+
+ rollup->groupClause = preprocess_groupclause(root, gs->set);
+ rollup->gsets_data = list_make1(gs);
+ rollup->gsets = remap_to_groupclause_idx(rollup->groupClause,
+ rollup->gsets_data,
+ gd->tleref_to_colnum_map);
+ rollup->numGroups = gs->numGroups;
+ rollup->hashable = true;
+ rollup->is_hashed = true;
+ rollups = lcons(rollup, rollups);
+ }
+
+ if (rollups)
+ {
+ add_path(grouped_rel, (Path *)
+ create_groupingsets_path(root,
+ grouped_rel,
+ path,
+ target,
+ (List *) parse->havingQual,
+ AGG_MIXED,
+ rollups,
+ agg_costs,
+ dNumGroups));
+ }
+ }
+
+ /*
+ * Now try the simple sorted case.
+ */
+ if (!gd->unsortable_sets)
+ add_path(grouped_rel, (Path *)
+ create_groupingsets_path(root,
+ grouped_rel,
+ path,
+ target,
+ (List *) parse->havingQual,
+ AGG_SORTED,
+ gd->rollups,
+ agg_costs,
+ dNumGroups));
+ }
+
/*
* create_window_paths
*
return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost);
}
+
+/*
+ * grouping_distribution_match
+ * Check if the path distribution matches grouping distribution.
+ *
+ * Grouping preserves distribution if the distribution key is one of the
+ * grouping keys (arbitrary one). In that case it's guaranteed that groups
+ * on different nodes do not overlap, and we can push the aggregation to
+ * remote nodes as a whole.
+ *
+ * Otherwise we need to either fetch all the data to the coordinator and
+ * perform the aggregation there, or use two-phase aggregation, with the
+ * first phase (partial aggregation) pushed down, and the second phase
+ * (combining and finalizing the results) executed on the coordinator.
+ *
+ * XXX This is used not only for plain aggregation, but also for various
+ * other paths, relying on grouping infrastructure (DISTINCT ON, UNIQUE).
+ */
+static bool
+grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path,
+ List *clauses)
+{
+ int i;
+ bool matches_key = false;
+ Distribution *distribution = path->distribution;
+
+ int numGroupCols = list_length(clauses);
+ AttrNumber *groupColIdx = extract_grouping_cols(clauses,
+ parse->targetList);
+
+ /*
+ * With no explicit data distribution or replicated tables, we can simply
+ * push down the whole aggregation to the remote node, without any sort
+ * of redistribution. So consider this to be a match.
+ */
+ if ((distribution == NULL) ||
+ IsLocatorReplicated(distribution->distributionType))
+ return true;
+
+ /* But no distribution expression means 'no match'. */
+ if (distribution->distributionExpr == NULL)
+ return false;
+
+ /*
+ * With distributed data and table distributed using an expression, we
+ * need to check if the distribution expression matches one of the
+ * grouping keys (arbitrary one).
+ */
+ for (i = 0; i < numGroupCols; i++)
+ {
+ TargetEntry *te = (TargetEntry *)list_nth(parse->targetList,
+ groupColIdx[i]-1);
+
+ if (equal(te->expr, distribution->distributionExpr))
+ {
+ matches_key = true;
+ break;
+ }
+ }
+
+ return matches_key;
+}
+
+ /*
+ * get_partitioned_child_rels
+ * Returns a list of the RT indexes of the partitioned child relations
+ * with rti as the root parent RT index.
+ *
+ * Note: Only call this function on RTEs known to be partitioned tables.
+ */
+ List *
+ get_partitioned_child_rels(PlannerInfo *root, Index rti)
+ {
+ List *result = NIL;
+ ListCell *l;
+
+ foreach(l, root->pcinfo_list)
+ {
+ PartitionedChildRelInfo *pc = lfirst(l);
+
+ if (pc->parent_relid == rti)
+ {
+ result = pc->child_rels;
+ break;
+ }
+ }
+
+ /* The root partitioned table is included as a child rel */
+ Assert(list_length(result) >= 1);
+
+ return result;
+ }
++
++
+static bool
+groupingsets_distribution_match(PlannerInfo *root, Query *parse, Path *path)
+{
+ Distribution *distribution = path->distribution;
+
+ /*
+ * With no explicit data distribution or replicated tables, we can simply
+ * push down the whole grouping sets to the remote node, without any sort
+ * of redistribution. So consider this to be a match.
+ */
+ if ((distribution == NULL) ||
+ IsLocatorReplicated(distribution->distributionType))
+ return true;
+
+ return false;
+}
+
+/*
+ * equal_distributions
+ * Check that two distributions are equal.
+ *
+ * Distributions are considered equal if they are of the same type, on the
+ * same set of nodes, and if the distribution expressions are known to be equal
+ * (either the same expressions or members of the same equivalence class).
+ */
+static bool
+equal_distributions(PlannerInfo *root, Distribution *dst1,
+ Distribution *dst2)
+{
+ /* fast path */
+ if (dst1 == dst2)
+ return true;
+
+ if (dst1 == NULL || dst2 == NULL)
+ return false;
+
+ /* conditions easier to check go first */
+ if (dst1->distributionType != dst2->distributionType)
+ return false;
+
+ if (!bms_equal(dst1->nodes, dst2->nodes))
+ return false;
+
+ if (equal(dst1->distributionExpr, dst2->distributionExpr))
+ return true;
+
+ /*
+ * For more thorough expression check we need to ensure they both are
+ * defined
+ */
+ if (dst1->distributionExpr == NULL || dst2->distributionExpr == NULL)
+ return false;
+
+ /*
+ * More thorough check, but allows some important cases, like if
+ * distribution column is not updated (implicit set distcol=distcol) or
+ * set distcol = CONST, ... WHERE distcol = CONST - pattern used by many
+ * applications.
+ */
+ if (exprs_known_equal(root, dst1->distributionExpr, dst2->distributionExpr))
+ return true;
+
+ /* The restrictNodes field does not matter for distribution equality */
+ return false;
+}
+
+static Path *
+adjust_path_distribution(PlannerInfo *root, Query *parse, Path *path)
+{
+ /* if the root distribution is NULL, set it to path distribution */
+ if (!root->distribution)
+ {
+ root->distribution = path->distribution;
+ return path;
+ }
+
+ /* don't touch paths without distribution attached (catalogs etc.) */
+ if ((path->distribution == NULL) && (root->distribution == NULL))
+ return path;
+
+ if (equal_distributions(root, root->distribution, path->distribution))
+ {
+ if (IsLocatorReplicated(path->distribution->distributionType) &&
+ contain_volatile_functions((Node *) parse->targetList))
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ errmsg("can not update replicated table with result of volatile function")));
+
+ /*
+ * Source tuple will be consumed on the same node where it is
+ * produced, so if it is known that some node does not yield tuples
+ * we do not want to send subquery for execution on these nodes
+ * at all. So copy the restriction to the external distribution.
+ *
+ * XXX Is that ever possible if external restriction is already
+ * defined? If yes we probably should use intersection of the sets,
+ * and if resulting set is empty create dummy plan and set it as
+ * the result_plan. Need to think this over
+ */
+ root->distribution->restrictNodes =
+ bms_copy(path->distribution->restrictNodes);
+ }
+ else
+ {
+ /*
+ * If the planned statement is either UPDATE or DELETE, different
+ * distributions here mean the ModifyTable node will be placed on
+ * top of RemoteSubquery.
+ *
+ * UPDATE and DELETE versions of ModifyTable use TID of incoming
+ * tuple to apply the changes, but the RemoteSubquery plan supplies
+ * RemoteTuples, without such field. Therefore we can't execute
+ * such plan and error-out.
+ *
+ * Most common example is when the UPDATE statement modifies the
+ * distribution column, or when a complex UPDATE or DELETE statement
+ * involves a join. It's difficult to determine the exact reason,
+ * but we assume the first one (correlated UPDATE) is more likely.
+ *
+ * There are two ways of fixing the UPDATE ambiguity:
+ *
+ * 1. Modify the planner to never consider redistribution of the
+ * target table. In this case the planner would find there's no way
+ * to plan the query, and it would throw error somewhere else, and
+ * we'd only be dealing with updates of distribution columns.
+ *
+ * 2. Modify executor to allow distribution column updates. However
+ * there are a lot of issues behind the scene when implementing that
+ * approach, and so it's unlikely to happen soon.
+ *
+ * DELETE statements may only fail because of complex joins.
+ */
+
+ if (parse->commandType == CMD_UPDATE)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ errmsg("could not plan this distributed update"),
+ errdetail("correlated UPDATE or updating distribution column currently not supported in Postgres-XL.")));
+
+ if (parse->commandType == CMD_DELETE)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ errmsg("could not plan this distributed delete"),
+ errdetail("correlated or complex DELETE is currently not supported in Postgres-XL.")));
+
+ /*
+ * We already know the distributions are not equal, but let's see if
+ * the redistribution is actually necessary. We can skip it if we
+ * already have Result path, and if the distribution is one of
+ *
+ * a) 'hash' restricted to a single node
+ * b) 'replicate' without volatile functions in the target list
+ *
+ * In those cases we don't need the RemoteSubplan.
+ *
+ * XXX Not sure what the (result_plan->lefttree == NULL) does.
+ * See planner.c:2730 in 9.5.
+ */
+ if (!(IsA(path, ResultPath) && /* FIXME missing (result_plan->lefttree == NULL) condition */
+ ((root->distribution->distributionType == 'H' && bms_num_members(root->distribution->restrictNodes) == 1) ||
+ (root->distribution->distributionType == 'R' && !contain_mutable_functions((Node *)parse->targetList)))))
+
+ path = create_remotesubplan_path(root, path, root->distribution);
+ }
+
+ return path;
+}
+
+static bool
+can_push_down_grouping(PlannerInfo *root, Query *parse, Path *path)
+{
+ /* only called when constructing grouping paths */
+ Assert(parse->hasAggs || parse->groupClause);
+
+ if (parse->groupingSets)
+ return groupingsets_distribution_match(root, parse, path);
+
+ return grouping_distribution_match(root, parse, path, parse->groupClause);
+}
+
+static bool
+can_push_down_window(PlannerInfo *root, Path *path)
+{
+ /* */
+ if (! path->distribution)
+ return true;
+
+ return false;
+}
* Post-processing of a completed plan tree: fix references to subplan
* vars, compute regproc values for operators, etc
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* subselect.c
* Planning routines for subselects and parameters.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
break;
case RTE_JOIN:
case RTE_CTE:
+#ifdef XCP
+ case RTE_REMOTE_DUMMY:
+#endif
+ case RTE_NAMEDTUPLESTORE:
/* these can't contain any lateral references */
break;
}
break;
case RTE_JOIN:
case RTE_CTE:
+#ifdef XCP
+ case RTE_REMOTE_DUMMY:
+#endif
+ case RTE_NAMEDTUPLESTORE:
/* these shouldn't be marked LATERAL */
Assert(false);
break;
* the tlists for child tables to keep expand_targetlist happy. We do it like
* that because it's faster in typical non-inherited cases.
*
- *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* append relations, and thenceforth share code with the UNION ALL case.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* pathnode.c
* Routines to manipulate pathlists and create path nodes
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
pathnode->indexorderbycols = indexorderbycols;
pathnode->indexscandir = indexscandir;
- cost_index(pathnode, root, loop_count);
+#ifdef XCP
+ set_scanpath_distribution(root, rel, (Path *) pathnode);
+ if (indexclauses)
+ {
+ ListCell *lc;
+ foreach (lc, indexclauses)
+ {
+ RestrictInfo *ri = (RestrictInfo *) lfirst(lc);
+ restrict_distribution(root, ri, (Path *) pathnode);
+ }
+ }
+#endif
+ cost_index(pathnode, root, loop_count, partial_path);
return pathnode;
}
pathnode->path.parallel_workers = parallel_workers;
pathnode->path.pathkeys = NIL; /* result is always considered
* unsorted */
+#ifdef XCP
+ /*
+ * Append path is used to implement scans of inherited tables and some
+ * "set" operations, like UNION ALL. While all inherited tables should
+ * have the same distribution, UNION'ed queries may have different.
+ * When paths being appended have the same distribution it is OK to push
+ * Append down to the data nodes. If not, perform "coordinator" Append.
+ */
+
+ /* Special case of the dummy relation, if the subpaths list is empty */
+ if (subpaths)
+ {
+ /* Take distribution of the first node */
+ l = list_head(subpaths);
+ subpath = (Path *) lfirst(l);
+ distribution = copyObject(subpath->distribution);
+ /*
+ * Check remaining subpaths, if all distributions equal to the first set
+ * it as a distribution of the Append path; otherwise make up coordinator
+ * Append
+ */
+ while ((l = lnext(l)))
+ {
+ subpath = (Path *) lfirst(l);
+
+ /*
+ * For Append and MergeAppend paths, we are most often dealing with
+ * different relations, appended together. So its very likely that
+ * the distribution for each relation will have a different varno.
+ * But we should be able to push down Append and MergeAppend as
+ * long as rest of the distribution information matches.
+ *
+ * equalDistribution() compares everything except the varnos
+ */
+ if (equalDistribution(distribution, subpath->distribution))
+ {
+ /*
+ * Both distribution and subpath->distribution may be NULL at
+ * this point, or they both are not null.
+ */
+ if (distribution && subpath->distribution->restrictNodes)
+ distribution->restrictNodes = bms_union(
+ distribution->restrictNodes,
+ subpath->distribution->restrictNodes);
+ }
+ else
+ {
+ break;
+ }
+ }
+ if (l)
+ {
+ List *newsubpaths = NIL;
+ foreach(l, subpaths)
+ {
+ subpath = (Path *) lfirst(l);
+ if (subpath->distribution)
+ subpath = redistribute_path(NULL, subpath, NIL,
+ LOCATOR_TYPE_NONE, NULL,
+ NULL, NULL);
+ newsubpaths = lappend(newsubpaths, subpath);
+ }
+ subpaths = newsubpaths;
+ pathnode->path.distribution = NULL;
+ }
+ else
+ pathnode->path.distribution = distribution;
+ }
+#endif
++
+ pathnode->partitioned_rels = list_copy(partitioned_rels);
pathnode->subpaths = subpaths;
/*
required_outer);
pathnode->path.parallel_aware = false;
pathnode->path.parallel_safe = false;
- pathnode->path.parallel_workers = subpath->parallel_workers;
+ pathnode->path.parallel_workers = 0;
pathnode->path.pathkeys = NIL; /* Gather has unordered result */
+ /* distribution is the same as in the subpath */
+ pathnode->path.distribution = (Distribution *) copyObject(subpath->distribution);
+
pathnode->subpath = subpath;
+ pathnode->num_workers = subpath->parallel_workers;
pathnode->single_copy = false;
- if (pathnode->path.parallel_workers == 0)
+ if (pathnode->num_workers == 0)
{
- pathnode->path.parallel_workers = 1;
pathnode->path.pathkeys = subpath->pathkeys;
+ pathnode->num_workers = 1;
pathnode->single_copy = true;
}
pathnode->innerjoinpath = inner_path;
pathnode->joinrestrictinfo = restrict_clauses;
- final_cost_nestloop(root, pathnode, workspace, sjinfo, semifactors);
+#ifdef XCP
+ pathnode->movedrestrictinfo = mclauses;
+
+ alternate = set_joinpath_distribution(root, pathnode);
+#endif
+ final_cost_nestloop(root, pathnode, workspace, extra);
- final_cost_nestloop(root, altpath, workspace, sjinfo, semifactors);
+#ifdef XCP
+ /*
+ * Also calculate costs of all alternates and return cheapest path
+ */
+ foreach(lc, alternate)
+ {
+ NestPath *altpath = (NestPath *) lfirst(lc);
++ final_cost_nestloop(root, altpath, workspace, extra);
+ if (altpath->path.total_cost < pathnode->path.total_cost)
+ pathnode = altpath;
+ }
+#endif
+
return pathnode;
}
pathnode->path_mergeclauses = mergeclauses;
pathnode->outersortkeys = outersortkeys;
pathnode->innersortkeys = innersortkeys;
+#ifdef XCP
+ alternate = set_joinpath_distribution(root, (JoinPath *) pathnode);
+#endif
+ /* pathnode->skip_mark_restore will be set by final_cost_mergejoin */
/* pathnode->materialize_inner will be set by final_cost_mergejoin */
- final_cost_mergejoin(root, pathnode, workspace, sjinfo);
+
+ final_cost_mergejoin(root, pathnode, workspace, extra);
- final_cost_mergejoin(root, altpath, workspace, sjinfo);
+#ifdef XCP
+ /*
+ * Also calculate costs of all alternates and return cheapest path
+ */
+ foreach(lc, alternate)
+ {
+ MergePath *altpath = (MergePath *) lfirst(lc);
++ final_cost_mergejoin(root, altpath, workspace, extra);
+ if (altpath->jpath.path.total_cost < pathnode->jpath.path.total_cost)
+ pathnode = altpath;
+ }
+#endif
+
return pathnode;
}
pathnode->jpath.innerjoinpath = inner_path;
pathnode->jpath.joinrestrictinfo = restrict_clauses;
pathnode->path_hashclauses = hashclauses;
+#ifdef XCP
+ alternate = set_joinpath_distribution(root, (JoinPath *) pathnode);
+#endif
/* final_cost_hashjoin will fill in pathnode->num_batches */
- final_cost_hashjoin(root, pathnode, workspace, sjinfo, semifactors);
+
+ final_cost_hashjoin(root, pathnode, workspace, extra);
- final_cost_hashjoin(root, altpath, workspace, sjinfo, semifactors);
+#ifdef XCP
+ /*
+ * Calculate costs of all alternates and return cheapest path
+ */
+ foreach(lc, alternate)
+ {
+ HashPath *altpath = (HashPath *) lfirst(lc);
++ final_cost_hashjoin(root, altpath, workspace, extra);
+ if (altpath->jpath.path.total_cost < pathnode->jpath.path.total_cost)
+ pathnode = altpath;
+ }
+#endif
+
return pathnode;
}
pathnode->path.parallel_workers = subpath->parallel_workers;
pathnode->subpath = subpath;
+ /* distribution is the same as in the subpath */
+ pathnode->path.distribution = (Distribution *) copyObject(subpath->distribution);
+
+ /*
+ * Simplify callers by downgrading AGG_SORTED to AGG_PLAIN, and AGG_MIXED
+ * to AGG_HASHED, here if possible.
+ */
+ if (aggstrategy == AGG_SORTED &&
+ list_length(rollups) == 1 &&
+ ((RollupData *) linitial(rollups))->groupClause == NIL)
+ aggstrategy = AGG_PLAIN;
+
+ if (aggstrategy == AGG_MIXED &&
+ list_length(rollups) == 1)
+ aggstrategy = AGG_HASHED;
+
/*
* Output will be in sorted order by group_pathkeys if, and only if, there
* is a single rollup operation on a non-empty list of grouping
rel,
bpath->bitmapqual,
required_outer,
- loop_count);
+ loop_count, 0);
}
case T_SubqueryScan:
+#ifdef XCP
+ {
+ SubqueryScanPath *spath = (SubqueryScanPath *) path;
+
+ return (Path *) create_subqueryscan_path(root,
+ rel,
+ spath->subpath,
+ spath->path.pathkeys,
+ required_outer,
+ path->distribution);
+ }
+#else
{
SubqueryScanPath *spath = (SubqueryScanPath *) path;
* routines for accessing the system catalogs
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "parser/parse_relation.h"
#include "parser/parsetree.h"
#include "rewrite/rewriteManip.h"
+ #include "statistics/statistics.h"
#include "storage/bufmgr.h"
+ #include "utils/builtins.h"
#include "utils/lsyscache.h"
+ #include "utils/syscache.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"
-
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#endif
/* GUC parameter */
int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION;
* contain optimizable statements, which we should transform.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/backend/parser/analyze.c
return result;
}
- Node *parsetree = (Node *) lfirst(raw_parsetree_item);
- result = parse_analyze(parsetree, query, NULL, 0);
+#ifdef PGXC
+/*
+ * transformExecDirectStmt -
+ * transform an EXECUTE DIRECT Statement
+ *
+ * Handling depends on whether we should execute on the nodes or on the
+ * Coordinator.
+ * To execute on nodes we return CMD_UTILITY query having one T_RemoteQuery node
+ * with the inner statement as a sql_command.
+ * If statement is to run on Coordinator we should parse inner statement and
+ * analyze resulting query tree.
+ */
+static Query *
+transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt)
+{
+ Query *result = makeNode(Query);
+ char *query = stmt->query;
+ List *nodelist = stmt->node_names;
+ RemoteQuery *step = makeNode(RemoteQuery);
+ bool is_local = false;
+ List *raw_parsetree_list;
+ ListCell *raw_parsetree_item;
+ char *nodename;
+ int nodeIndex;
+ char nodetype;
+
+ /* Support not available on Datanodes */
+ if (IS_PGXC_DATANODE)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("EXECUTE DIRECT cannot be executed on a Datanode")));
+
+ /* Only a single target node is supported for now */
+ if (list_length(nodelist) > 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Support for EXECUTE DIRECT on multiple nodes is not available yet")));
+
+ Assert(list_length(nodelist) == 1);
+ Assert(IS_PGXC_COORDINATOR);
+
+ /* There is a single element here */
+ nodename = strVal(linitial(nodelist));
+#ifdef XCP
+ /* Resolve node name to (type, index); PGXC_NODE_NONE means no such node */
+ nodetype = PGXC_NODE_NONE;
+ nodeIndex = PGXCNodeGetNodeIdFromName(nodename, &nodetype);
+ if (nodetype == PGXC_NODE_NONE)
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("PGXC Node %s: object not defined",
+ nodename)));
+#else
+ /*
+ * NOTE(review): 'nodeoid' is used here but is not declared anywhere in
+ * this function — confirm this non-XCP branch is never compiled, or add
+ * the missing 'Oid nodeoid;' declaration.
+ */
+ nodeoid = get_pgxc_nodeoid(nodename);
+
+ if (!OidIsValid(nodeoid))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("PGXC Node %s: object not defined",
+ nodename)));
+
+ /* Get node type and index */
+ nodetype = get_pgxc_nodetype(nodeoid);
+ nodeIndex = PGXCNodeGetNodeId(nodeoid, get_pgxc_nodetype(nodeoid));
+#endif
+
+ /* Check whether the requested node is the self-node (this Coordinator) */
+ if (nodetype == PGXC_NODE_COORDINATOR && nodeIndex == PGXCNodeId - 1)
+ is_local = true;
+
+ /* Transform the query into a raw parse list */
+ raw_parsetree_list = pg_parse_query(query);
+
+ /* EXECUTE DIRECT can just be executed with a single query */
+ if (list_length(raw_parsetree_list) > 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("EXECUTE DIRECT cannot execute multiple queries")));
+
+ /*
+ * Analyze the Raw parse tree
+ * EXECUTE DIRECT is restricted to one-step usage
+ *
+ * The check above guarantees the list has exactly one element, so this
+ * loop runs once and 'result' ends up as the analyzed form of the single
+ * inner statement.
+ */
+ foreach(raw_parsetree_item, raw_parsetree_list)
+ {
++ RawStmt *parsetree = lfirst_node(RawStmt, raw_parsetree_item);
++ List *result_list = pg_analyze_and_rewrite(parsetree, query, NULL, 0, NULL);
++ result = linitial_node(Query, result_list);
+ }
+
+ /*
+ * Default list of parameters to set.
+ * NOTE(review): sql_statement is set to NULL here and unconditionally
+ * overwritten with pstrdup(query) just below; the NULL assignment is
+ * redundant but kept as-is.
+ */
+ step->sql_statement = NULL;
+ step->exec_nodes = makeNode(ExecNodes);
+ step->combine_type = COMBINE_TYPE_NONE;
+ step->sort = NULL;
+ step->read_only = true;
+ step->force_autocommit = false;
+ step->cursor = NULL;
+
+ /* This is needed by executor */
+ step->sql_statement = pstrdup(query);
+ if (nodetype == PGXC_NODE_COORDINATOR)
+ step->exec_type = EXEC_ON_COORDS;
+ else
+ step->exec_type = EXEC_ON_DATANODES;
+
+ /* Clear join/reduction bookkeeping; unused for a one-step direct query */
+ step->reduce_level = 0;
+ step->base_tlist = NIL;
+ step->outer_alias = NULL;
+ step->inner_alias = NULL;
+ step->outer_reduce_level = 0;
+ step->inner_reduce_level = 0;
+ step->outer_relids = NULL;
+ step->inner_relids = NULL;
+ step->inner_statement = NULL;
+ step->outer_statement = NULL;
+ step->join_condition = NULL;
+
+ /* Change the list of nodes that will be executed for the query and others */
+ step->force_autocommit = false;
+ step->combine_type = COMBINE_TYPE_SAME;
+ step->read_only = true;
+ step->exec_direct_type = EXEC_DIRECT_NONE;
+
+ /*
+ * Set up EXECUTE DIRECT flag: classify by locality first, then by the
+ * analyzed command type of the inner statement.
+ */
+ if (is_local)
+ {
+ if (result->commandType == CMD_UTILITY)
+ step->exec_direct_type = EXEC_DIRECT_LOCAL_UTILITY;
+ else
+ step->exec_direct_type = EXEC_DIRECT_LOCAL;
+ }
+ else
+ {
+ switch(result->commandType)
+ {
+ case CMD_UTILITY:
+ step->exec_direct_type = EXEC_DIRECT_UTILITY;
+ break;
+ case CMD_SELECT:
+ step->exec_direct_type = EXEC_DIRECT_SELECT;
+ break;
+ case CMD_INSERT:
+ step->exec_direct_type = EXEC_DIRECT_INSERT;
+ break;
+ case CMD_UPDATE:
+ step->exec_direct_type = EXEC_DIRECT_UPDATE;
+ break;
+ case CMD_DELETE:
+ step->exec_direct_type = EXEC_DIRECT_DELETE;
+ break;
+ default:
+ /* all analyzable command types are covered above */
+ Assert(0);
+ }
+ }
+
+ /* Build Execute Node list, there is a unique node for the time being */
+ step->exec_nodes->nodeList = lappend_int(step->exec_nodes->nodeList, nodeIndex);
+
+ /*
+ * For remote execution, wrap the RemoteQuery step as the utility
+ * statement of the returned Query; for local execution the analyzed
+ * query is returned unchanged and 'step' is left unattached.
+ */
+ if (!is_local)
+ result->utilityStmt = (Node *) step;
+
+ /*
+ * Reset the queryId since the caller would do that anyways.
+ */
+ result->queryId = 0;
+
+ return result;
+}
+
+#endif
/*
* Produce a string representation of a LockClauseStrength value.
* gram.y
* POSTGRESQL BISON rules/actions
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
struct ImportQual *importqual;
InsertStmt *istmt;
VariableSetStmt *vsetstmt;
+/* PGXC_BEGIN */
+ struct StmtMulti *stmtmulti;
+ DistributeBy *distby;
+ PGXCSubCluster *subclus;
+/* PGXC_END */
+ PartitionElem *partelem;
+ PartitionSpec *partspec;
+ PartitionBoundSpec *partboundspec;
+ RoleSpec *rolespec;
}
%type <node> stmt schema_stmt
CreateUserStmt CreateUserMappingStmt CreateRoleStmt CreatePolicyStmt
CreatedbStmt DeclareCursorStmt DefineStmt DeleteStmt DiscardStmt DoStmt
DropGroupStmt DropOpClassStmt DropOpFamilyStmt DropPLangStmt DropStmt
- DropAssertStmt DropTrigStmt DropRuleStmt DropCastStmt DropRoleStmt
- DropPolicyStmt DropUserStmt DropdbStmt DropTableSpaceStmt DropFdwStmt
+ DropAssertStmt DropCastStmt DropRoleStmt
+ DropUserStmt DropdbStmt DropTableSpaceStmt
DropTransformStmt
- DropForeignServerStmt DropUserMappingStmt ExplainStmt ExecDirectStmt FetchStmt
- DropUserMappingStmt ExplainStmt FetchStmt
++ DropUserMappingStmt ExplainStmt ExecDirectStmt FetchStmt
GrantStmt GrantRoleStmt ImportForeignSchemaStmt IndexStmt InsertStmt
ListenStmt LoadStmt LockStmt NotifyStmt ExplainableStmt PreparableStmt
CreateFunctionStmt AlterFunctionStmt ReindexStmt RemoveAggrStmt
DeallocateStmt PrepareStmt ExecuteStmt
DropOwnedStmt ReassignOwnedStmt
AlterTSConfigurationStmt AlterTSDictionaryStmt
+ BarrierStmt PauseStmt AlterNodeStmt CreateNodeStmt DropNodeStmt
+ CreateNodeGroupStmt DropNodeGroupStmt
CreateMatViewStmt RefreshMatViewStmt CreateAmStmt
+ CreatePublicationStmt AlterPublicationStmt
+ CreateSubscriptionStmt AlterSubscriptionStmt DropSubscriptionStmt
%type <node> select_no_parens select_with_parens select_clause
simple_select values_clause
%type <windef> window_definition over_clause window_specification
opt_frame_clause frame_extent frame_bound
%type <str> opt_existing_window_name
+/* PGXC_BEGIN */
+%type <str> opt_barrier_id OptDistributeType DistributeStyle OptDistKey
+%type <distby> OptDistributeBy OptDistributeByInternal
+%type <subclus> OptSubCluster OptSubClusterInternal
+/* PGXC_END */
%type <boolean> opt_if_not_exists
+ %type <ival> generated_when override_kind
+ %type <partspec> PartitionSpec OptPartitionSpec
+ %type <str> part_strategy
+ %type <partelem> part_elem
+ %type <list> part_params
+ %type <partboundspec> ForValues
+ %type <node> partbound_datum PartitionRangeDatum
+ %type <list> partbound_datum_list range_datum_list
/*
* Non-keyword token types. These are hard-wired into the "flex" lexer.
*/
/* ordinary key words in alphabetical order */
+/* PGXC - added DISTRIBUTE, DISTRIBUTED, DISTSTYLE, DISTKEY, RANDOMLY, DIRECT, COORDINATOR, CLEAN, NODE, BARRIER */
%token <keyword> ABORT_P ABSOLUTE_P ACCESS ACTION ADD_P ADMIN AFTER
AGGREGATE ALL ALSO ALTER ALWAYS ANALYSE ANALYZE AND ANY ARRAY AS ASC
- ASSERTION ASSIGNMENT ASYMMETRIC AT ATTRIBUTE AUTHORIZATION
+ ASSERTION ASSIGNMENT ASYMMETRIC AT ATTACH ATTRIBUTE AUTHORIZATION
- BACKWARD BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT
+ BACKWARD BARRIER BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT
BOOLEAN_P BOTH BY
CACHE CALLED CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P
- CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLOSE
+ CHARACTER CHARACTERISTICS CHECK CHECKPOINT CLASS CLEAN CLOSE
- CLUSTER COALESCE COLLATE COLLATION COLUMN COMMENT COMMENTS COMMIT
- COMMITTED CONCURRENTLY CONFIGURATION CONFLICT CONNECTION CONSTRAINT CONSTRAINTS
- CONTENT_P CONTINUE_P CONVERSION_P COORDINATOR COPY COST CREATE
+ CLUSTER COALESCE COLLATE COLLATION COLUMN COLUMNS COMMENT COMMENTS COMMIT
+ COMMITTED CONCURRENTLY CONFIGURATION CONFLICT CONNECTION CONSTRAINT
- CONSTRAINTS CONTENT_P CONTINUE_P CONVERSION_P COPY COST CREATE
++ CONSTRAINTS CONTENT_P CONTINUE_P CONVERSION_P COORDINATOR COPY COST CREATE
CROSS CSV CUBE CURRENT_P
CURRENT_CATALOG CURRENT_DATE CURRENT_ROLE CURRENT_SCHEMA
CURRENT_TIME CURRENT_TIMESTAMP CURRENT_USER CURSOR CYCLE
DATA_P DATABASE DAY_P DEALLOCATE DEC DECIMAL_P DECLARE DEFAULT DEFAULTS
DEFERRABLE DEFERRED DEFINER DELETE_P DELIMITER DELIMITERS DEPENDS DESC
- /* PGXC_BEGIN */
- DICTIONARY DIRECT DISABLE_P DISCARD DISTINCT DISTKEY DISTRIBUTE DISTRIBUTED
- DISTSTYLE DO DOCUMENT_P DOMAIN_P DOUBLE_P
- /* PGXC_END */
- DROP
- DETACH DICTIONARY DISABLE_P DISCARD DISTINCT DO DOCUMENT_P DOMAIN_P
++ DETACH DICTIONARY DIRECT DISABLE_P DISCARD DISTINCT DISTKEY DISTRIBUTE DISTRIBUTED DISTSTYLE DO DOCUMENT_P DOMAIN_P
+ DOUBLE_P DROP
EACH ELSE ENABLE_P ENCODING ENCRYPTED END_P ENUM_P ESCAPE EVENT EXCEPT
EXCLUDE EXCLUDING EXCLUSIVE EXECUTE EXISTS EXPLAIN
MAPPING MATCH MATERIALIZED MAXVALUE METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE
- NAME_P NAMES NATIONAL NATURAL NCHAR NEXT NO NODE NONE
- NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NONE
++ NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NODE NONE
NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF
NULLS_P NUMERIC
- OBJECT_P OF OFF OFFSET OIDS ON ONLY OPERATOR OPTION OPTIONS OR
- ORDER ORDINALITY OUT_P OUTER_P OVER OVERLAPS OVERLAY OWNED OWNER
+ OBJECT_P OF OFF OFFSET OIDS OLD ON ONLY OPERATOR OPTION OPTIONS OR
+ ORDER ORDINALITY OUT_P OUTER_P OVER OVERLAPS OVERLAY OVERRIDING OWNED OWNER
- PARALLEL PARSER PARTIAL PARTITION PASSING PASSWORD PLACING PLANS POLICY
- POSITION PRECEDING PRECISION PRESERVE PREPARE PREPARED PRIMARY
+ PARALLEL PARSER PARTIAL PARTITION PASSING PASSWORD PAUSE PLACING PLANS POLICY
+ POSITION PRECEDING PRECISION PREFERRED PRESERVE PREPARE PREPARED PRIMARY
- PRIOR PRIVILEGES PROCEDURAL PROCEDURE PROGRAM
+ PRIOR PRIVILEGES PROCEDURAL PROCEDURE PROGRAM PUBLICATION
QUOTE
- RANDOMLY RANGE READ REAL REASSIGN RECHECK RECURSIVE REF REFERENCES REFRESH REINDEX
- RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA
- RANGE READ REAL REASSIGN RECHECK RECURSIVE REF REFERENCES REFERENCING
++ RANDOMLY RANGE READ REAL REASSIGN RECHECK RECURSIVE REF REFERENCES REFERENCING
+ REFRESH REINDEX RELATIVE_P RELEASE RENAME REPEATABLE REPLACE REPLICA
RESET RESTART RESTRICT RETURNING RETURNS REVOKE RIGHT ROLE ROLLBACK ROLLUP
ROW ROWS RULE
}
;
- /* the thrashing around here is to discard "empty" statements... */
+ /*
+ * At top level, we wrap each stmt with a RawStmt node carrying start location
+ * and length of the stmt's text. Notice that the start loc/len are driven
+ * entirely from semicolon locations (@2). It would seem natural to use
+ * @1 or @3 to get the true start location of a stmt, but that doesn't work
+ * for statements that can start with empty nonterminals (opt_with_clause is
+ * the main offender here); as noted in the comments for YYLLOC_DEFAULT,
+ * we'd get -1 for the location in such cases.
+ * We also take care to discard empty statements entirely.
+ */
stmtmulti: stmtmulti ';' stmt
{
++ /*
++ * XXX PG10MERGE: Looks like support for obtaining raw
++ * query string for individual commands is added in PG10.
++ * If so, we can make use of the same infrastructure.
++ *
++ * XXX The following gives a compilation WARNING because
++ * stmtmulti is defined as a List in PG10, but we have our
++ * own definition.
++ */
+ if ($1 != NIL)
+ {
+ /* update length of previous stmt */
+ updateRawStmtEnd(llast_node(RawStmt, $1), @2);
+ }
+ if ($3 != NULL)
+ {
+ char *query;
+ ListCell *last;
+ /*
+ * Because of the way multi-commands are parsed by the
+ * parser, when the earlier command was parsed and
+ * reduced to a 'stmtmulti', we did not have the
+ * end-of-the-query marker. But now that we have seen
+ * the ';' token, add '\0' at the corresponding offset
+ * to get a separated command.
+ */
+ if ($1 != NULL)
+ {
+ last = list_tail($1->queries);
+ query = palloc(@2 - $1->offset + 1);
+ memcpy(query, lfirst(last), @2 - $1->offset);
+ query[@2 - $1->offset] = '\0';
+ lfirst(last) = query;
+
+ query = scanner_get_query(@3, -1, yyscanner);
+ $1->offset = @2;
+ $1->parsetrees = lappend($1->parsetrees, $3);
+ $1->queries = lappend($1->queries, query);
+ $$ = $1;
+ }
+ /*
+ *
+ * If the earlier statements were all null, then we
+ * must initialise the StmtMulti structure and make
+ * singleton lists
+ */
+ else
+ {
+ StmtMulti *n = (StmtMulti *) palloc0(sizeof (StmtMulti));
+ query = scanner_get_query(@3, -1, yyscanner);
+ n->offset = @2;
+ n->parsetrees = list_make1($3);
+ n->queries = list_make1(query);
+ $$ = n;
+ }
+ }
+ if ($3 != NULL)
+ $$ = lappend($1, makeRawStmt($3, @2 + 1));
else
$$ = $1;
}
| stmt
{
+ if ($1 != NULL)
+ {
+ StmtMulti *n = (StmtMulti *) palloc0(sizeof (StmtMulti));
+ char *query = scanner_get_query(@1, -1, yyscanner);
+
+ /*
+ * Keep track of the offset where $1 started. We don't
+ * have the offset where it ends so we copy the entire
+ * query to the end. If later, we find a ';' followed
+ * by another command, we'll add the '\0' at the
+ * appropriate offset
+ *
+ * XXX May be there is a better way to get the matching
+ * portion of the query string, but this does the trick
+ * for regression as well as the problem we are trying
+ * to solve with multi-command queries
+ */
+ n->offset = @1;
+
+ /*
+ * Collect both parsetree as well as the original query
+ * that resulted in the parsetree
+ */
+ n->parsetrees = list_make1($1);
+ n->queries = list_make1(query);
+ $$ = n;
+ }
+ if ($1 != NULL)
+ $$ = list_make1(makeRawStmt($1, 0));
else
- $$ = NIL;
+ $$ = NULL;
}
;
| CreateFunctionStmt
| CreateGroupStmt
| CreateMatViewStmt
+ | CreateNodeGroupStmt
+ | CreateNodeStmt
| CreateOpClassStmt
| CreateOpFamilyStmt
+ | CreatePublicationStmt
| AlterOpFamilyStmt
| CreatePolicyStmt
| CreatePLangStmt
| DoStmt
| DropAssertStmt
| DropCastStmt
- | DropFdwStmt
- | DropForeignServerStmt
| DropGroupStmt
+ | DropNodeGroupStmt
+ | DropNodeStmt
| DropOpClassStmt
| DropOpFamilyStmt
| DropOwnedStmt
*****************************************************************************/
CreateStmt: CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')'
- OptInherit OptWith OnCommitOption OptTableSpace
+ OptInherit OptPartitionSpec OptWith OnCommitOption OptTableSpace
+/* PGXC_BEGIN */
+ OptDistributeBy OptSubCluster
+/* PGXC_END */
{
CreateStmt *n = makeNode(CreateStmt);
$4->relpersistence = $2;
n->relation = $4;
n->tableElts = $6;
n->inhRelations = $8;
+ n->partspec = $9;
n->ofTypename = NULL;
n->constraints = NIL;
- n->options = $9;
- n->oncommit = $10;
- n->tablespacename = $11;
+ n->options = $10;
+ n->oncommit = $11;
+ n->tablespacename = $12;
n->if_not_exists = false;
- n->distributeby = $12;
- n->subcluster = $13;
+/* PGXC_BEGIN */
+ if ($2 == RELPERSISTENCE_LOCAL_TEMP)
+ {
+ $4->relpersistence = RELPERSISTENCE_TEMP;
+ n->islocal = true;
+ }
+ n->relkind = RELKIND_RELATION;
++ n->distributeby = $13;
++ n->subcluster = $14;
+/* PGXC_END */
$$ = (Node *)n;
}
| CREATE OptTemp TABLE IF_P NOT EXISTS qualified_name '('
- OptTableElementList ')' OptInherit OptWith OnCommitOption
- OptTableSpace
+ OptTableElementList ')' OptInherit OptPartitionSpec OptWith
+ OnCommitOption OptTableSpace
+/* PGXC_BEGIN */
+ OptDistributeBy OptSubCluster
+/* PGXC_END */
{
CreateStmt *n = makeNode(CreateStmt);
$7->relpersistence = $2;
n->relation = $7;
n->tableElts = $9;
n->inhRelations = $11;
+ n->partspec = $12;
n->ofTypename = NULL;
n->constraints = NIL;
- n->options = $12;
- n->oncommit = $13;
- n->tablespacename = $14;
+ n->options = $13;
+ n->oncommit = $14;
+ n->tablespacename = $15;
n->if_not_exists = true;
- n->distributeby = $15;
- n->subcluster = $16;
+/* PGXC_BEGIN */
+ if ($2 == RELPERSISTENCE_LOCAL_TEMP)
+ {
+ $7->relpersistence = RELPERSISTENCE_TEMP;
+ n->islocal = true;
+ }
+ n->relkind = RELKIND_RELATION;
++ n->distributeby = $16;
++ n->subcluster = $17;
+ if (n->inhRelations != NULL && n->distributeby != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"),
+ parser_errposition(exprLocation((Node *) n->distributeby))));
+/* PGXC_END */
$$ = (Node *)n;
}
| CREATE OptTemp TABLE qualified_name OF any_name
- OptTypedTableElementList OptWith OnCommitOption OptTableSpace
+ OptTypedTableElementList OptPartitionSpec OptWith OnCommitOption
+ OptTableSpace
+/* PGXC_BEGIN */
+ OptDistributeBy OptSubCluster
+/* PGXC_END */
{
CreateStmt *n = makeNode(CreateStmt);
$4->relpersistence = $2;
n->ofTypename = makeTypeNameFromNameList($6);
n->ofTypename->location = @6;
n->constraints = NIL;
- n->options = $8;
- n->oncommit = $9;
- n->tablespacename = $10;
+ n->options = $9;
+ n->oncommit = $10;
+ n->tablespacename = $11;
n->if_not_exists = false;
- n->distributeby = $11;
- n->subcluster = $12;
+/* PGXC_BEGIN */
+ if ($2 == RELPERSISTENCE_LOCAL_TEMP)
+ {
+ $4->relpersistence = RELPERSISTENCE_TEMP;
+ n->islocal = true;
+ }
+ n->relkind = RELKIND_RELATION;
++ n->distributeby = $12;
++ n->subcluster = $13;
+ if (n->inhRelations != NULL && n->distributeby != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"),
+ parser_errposition(exprLocation((Node *) n->distributeby))));
+/* PGXC_END */
$$ = (Node *)n;
}
| CREATE OptTemp TABLE IF_P NOT EXISTS qualified_name OF any_name
- OptTypedTableElementList OptWith OnCommitOption OptTableSpace
+ OptTypedTableElementList OptPartitionSpec OptWith OnCommitOption
+ OptTableSpace
+/* PGXC_BEGIN */
+ OptDistributeBy OptSubCluster
+/* PGXC_END */
{
CreateStmt *n = makeNode(CreateStmt);
$7->relpersistence = $2;
n->ofTypename = makeTypeNameFromNameList($9);
n->ofTypename->location = @9;
n->constraints = NIL;
- n->options = $11;
- n->oncommit = $12;
- n->tablespacename = $13;
+ n->options = $12;
+ n->oncommit = $13;
+ n->tablespacename = $14;
n->if_not_exists = true;
- n->distributeby = $14;
- n->subcluster = $15;
+/* PGXC_BEGIN */
+ if ($2 == RELPERSISTENCE_LOCAL_TEMP)
+ {
+ $7->relpersistence = RELPERSISTENCE_TEMP;
+ n->islocal = true;
+ }
+ n->relkind = RELKIND_RELATION;
++ n->distributeby = $15;
++ n->subcluster = $16;
+ if (n->inhRelations != NULL && n->distributeby != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("CREATE TABLE cannot contains both an INHERITS and a DISTRIBUTE BY clause"),
+ parser_errposition(exprLocation((Node *) n->distributeby))));
+/* PGXC_END */
$$ = (Node *)n;
}
+ | CREATE OptTemp TABLE qualified_name PARTITION OF qualified_name
+ OptTypedTableElementList ForValues OptPartitionSpec OptWith
+ OnCommitOption OptTableSpace
+ {
+ CreateStmt *n = makeNode(CreateStmt);
+ $4->relpersistence = $2;
+ n->relation = $4;
+ n->tableElts = $8;
+ n->inhRelations = list_make1($7);
+ n->partbound = $9;
+ n->partspec = $10;
+ n->ofTypename = NULL;
+ n->constraints = NIL;
+ n->options = $11;
+ n->oncommit = $12;
+ n->tablespacename = $13;
+ n->if_not_exists = false;
+ $$ = (Node *)n;
+ }
+ | CREATE OptTemp TABLE IF_P NOT EXISTS qualified_name PARTITION OF
+ qualified_name OptTypedTableElementList ForValues OptPartitionSpec
+ OptWith OnCommitOption OptTableSpace
+ {
+ CreateStmt *n = makeNode(CreateStmt);
+ $7->relpersistence = $2;
+ n->relation = $7;
+ n->tableElts = $11;
+ n->inhRelations = list_make1($10);
+ n->partbound = $12;
+ n->partspec = $13;
+ n->ofTypename = NULL;
+ n->constraints = NIL;
+ n->options = $14;
+ n->oncommit = $15;
+ n->tablespacename = $16;
+ n->if_not_exists = true;
+ $$ = (Node *)n;
+ }
;
/*
| ASSERTION
| ASSIGNMENT
| AT
+ | ATTACH
| ATTRIBUTE
| BACKWARD
+/* PGXC_BEGIN */
+ | BARRIER
+/* PGXC_END */
| BEFORE
| BEGIN_P
| BY
| CHARACTERISTICS
| CHECKPOINT
| CLASS
+ | CLEAN
| CLOSE
| CLUSTER
+ | COLUMNS
| COMMENT
| COMMENTS
| COMMIT
| DELIMITER
| DELIMITERS
| DEPENDS
+ | DETACH
| DICTIONARY
+ | DIRECT
| DISABLE_P
| DISCARD
+/* PGXC_BEGIN */
+ | DISTKEY
+ | DISTRIBUTE
+ | DISTRIBUTED
+ | DISTSTYLE
+/* PGXC_END */
| DOCUMENT_P
| DOMAIN_P
| DOUBLE_P
| MOVE
| NAME_P
| NAMES
+ | NEW
| NEXT
| NO
+ | NODE
| NOTHING
| NOTIFY
| NOWAIT
| PROCEDURAL
| PROCEDURE
| PROGRAM
+ | PUBLICATION
| QUOTE
+/* PGXC_BEGIN */
+ | RANDOMLY
+/* PGXC_END */
| RANGE
| READ
| REASSIGN
* parse_agg.c
* handle aggregates and window functions in parser
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* parse_relation.c
* parser support routines dealing with relations
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/syscache.h"
+#ifdef XCP
+#include "utils/guc.h"
+#include "catalog/pg_statistic.h"
+#include "catalog/pg_namespace.h"
+#include "pgxc/pgxc.h"
+#include "miscadmin.h"
+#endif
+ #include "utils/varlena.h"
#define MAX_FUZZY_DISTANCE 3
* a quick copyObject() call before manipulating the query tree.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/backend/parser/parse_utilcmd.c
*
#include "catalog/pg_opclass.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_type.h"
+#ifdef XCP
+#include "catalog/pgxc_node.h"
+#endif
#include "commands/comment.h"
#include "commands/defrem.h"
+ #include "commands/sequence.h"
#include "commands/tablecmds.h"
#include "commands/tablespace.h"
#include "miscadmin.h"
List *alist; /* "after list" of things to do after creating
* the table */
IndexStmt *pkey; /* PRIMARY KEY index, if any */
+#ifdef PGXC
+ FallbackSrc fallback_source;
+ List *fallback_dist_cols;
+ DistributeBy *distributeby; /* original distribute by column of CREATE TABLE */
+ PGXCSubCluster *subcluster; /* original subcluster option of CREATE TABLE */
+#endif
+ bool ispartitioned; /* true if table is partitioned */
+ PartitionBoundSpec *partbound; /* transformed FOR VALUES */
} CreateStmtContext;
/* State shared by transformCreateSchemaStmt and its subroutines */
List *constraintList);
static void transformColumnType(CreateStmtContext *cxt, ColumnDef *column);
static void setSchemaName(char *context_schema, char **stmt_schema_name);
+#ifdef PGXC
+static void checkLocalFKConstraints(CreateStmtContext *cxt);
+#endif
+#ifdef XCP
+static List *transformSubclusterNodes(PGXCSubCluster *subcluster);
+static PGXCSubCluster *makeSubCluster(List *nodelist);
+#endif
+ static void transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd);
+ static Const *transformPartitionBoundValue(ParseState *pstate, A_Const *con,
+ const char *colName, Oid colType, int32 colTypmod);
+
/*
* transformCreateStmt -
cxt.blist = NIL;
cxt.alist = NIL;
cxt.pkey = NULL;
+#ifdef PGXC
+ cxt.fallback_source = FBS_NONE;
+ cxt.fallback_dist_cols = NIL;
+ cxt.distributeby = stmt->distributeby;
+ cxt.subcluster = stmt->subcluster;
+#endif
+ cxt.ispartitioned = stmt->partspec != NULL;
/*
* Notice that we allow OIDs here only for plain tables, even though
char *snamespace;
char *sname;
char *qstring;
- A_Const *snamenode;
+ A_Const *snamenode;
TypeCast *castnode;
FuncCall *funccallnode;
- CreateSeqStmt *seqstmt;
- AlterSeqStmt *altseqstmt;
- List *attnamelist;
-
- /*
- * Determine namespace and name to use for the sequence.
- *
- * Although we use ChooseRelationName, it's not guaranteed that the
- * selected sequence name won't conflict; given sufficiently long
- * field names, two different serial columns in the same table could
- * be assigned the same sequence name, and we'd not notice since we
- * aren't creating the sequence quite yet. In practice this seems
- * quite unlikely to be a problem, especially since few people would
- * need two serial columns in one table.
- */
- if (cxt->rel)
- snamespaceid = RelationGetNamespace(cxt->rel);
- else
- {
- snamespaceid = RangeVarGetCreationNamespace(cxt->relation);
- RangeVarAdjustRelationPersistence(cxt->relation, snamespaceid);
- }
- snamespace = get_namespace_name(snamespaceid);
- sname = ChooseRelationName(cxt->relation->relname,
- column->colname,
- "seq",
- snamespaceid);
-
- ereport(DEBUG1,
- (errmsg("%s will create implicit sequence \"%s\" for serial column \"%s.%s\"",
- cxt->stmtType, sname,
- cxt->relation->relname, column->colname)));
-
- /*
- * Build a CREATE SEQUENCE command to create the sequence object, and
- * add it to the list of things to be done before this CREATE/ALTER
- * TABLE.
- */
- seqstmt = makeNode(CreateSeqStmt);
- seqstmt->sequence = makeRangeVar(snamespace, sname, -1);
- seqstmt->options = NIL;
- #ifdef PGXC
- seqstmt->is_serial = true;
- #endif
-
- /*
- * If this is ALTER ADD COLUMN, make sure the sequence will be owned
- * by the table's owner. The current user might be someone else
- * (perhaps a superuser, or someone who's only a member of the owning
- * role), but the SEQUENCE OWNED BY mechanisms will bleat unless table
- * and sequence have exactly the same owning role.
- */
- if (cxt->rel)
- seqstmt->ownerId = cxt->rel->rd_rel->relowner;
- else
- seqstmt->ownerId = InvalidOid;
-
- cxt->blist = lappend(cxt->blist, seqstmt);
-
- /*
- * Build an ALTER SEQUENCE ... OWNED BY command to mark the sequence
- * as owned by this column, and add it to the list of things to be
- * done after this CREATE/ALTER TABLE.
- */
- altseqstmt = makeNode(AlterSeqStmt);
- altseqstmt->sequence = makeRangeVar(snamespace, sname, -1);
- #ifdef PGXC
- altseqstmt->is_serial = true;
- #endif
- attnamelist = list_make3(makeString(snamespace),
- makeString(cxt->relation->relname),
- makeString(column->colname));
- altseqstmt->options = list_make1(makeDefElem("owned_by",
- (Node *) attnamelist));
+ Constraint *constraint;
- cxt->alist = lappend(cxt->alist, altseqstmt);
++ /* XXX XL 9.6 was setting stmt->is_serial. CHECK */
+ generateSerialExtraStmts(cxt, column,
+ column->typeName->typeOid, NIL, false,
+ &snamespace, &sname);
/*
* Create appropriate constraints for SERIAL. We do this in full,
cxt.blist = NIL;
cxt.alist = NIL;
cxt.pkey = NULL;
+#ifdef PGXC
+ cxt.fallback_source = FBS_NONE;
+ cxt.fallback_dist_cols = NIL;
+ cxt.distributeby = NULL;
+ cxt.subcluster = NULL;
+#endif
+ cxt.ispartitioned = (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
+ cxt.partbound = NULL;
/*
* The only subtypes that currently require parse transformation handling
*stmt_schema_name, context_schema)));
}
+#ifdef PGXC
+/*
+ * CheckLocalIndexColumn
+ *
+ * Checks whether or not the index can be safely enforced locally
+ */
+bool
+CheckLocalIndexColumn (char loctype, char *partcolname, char *indexcolname)
+{
+ if (IsLocatorReplicated(loctype))
+ /* always safe */
+ return true;
+ if (loctype == LOCATOR_TYPE_RROBIN)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+ errmsg("Cannot locally enforce a unique index on round robin distributed table.")));
+ else if (loctype == LOCATOR_TYPE_HASH || loctype == LOCATOR_TYPE_MODULO)
+ {
+ if (partcolname && indexcolname && strcmp(partcolname, indexcolname) == 0)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Given a relation, find the index of the attribute in the primary key
+ * which is the distribution key. Returns -1 if the table is not Hash/Modulo
+ * distributed, does not have a primary key, or the distribution key is not
+ * in the primary key (the last should not happen).
+ */
+static int
+find_relation_pk_dist_index(Relation rel)
+{
+ int result = -1;
+ List *indexoidlist;
+ ListCell *indexoidscan;
+ int partAttNum = InvalidAttrNumber;
+ bool pk_found = false;
+
+ if (rel->rd_locator_info)
+ partAttNum = rel->rd_locator_info->partAttrNum;
+
+ if (partAttNum == InvalidAttrNumber)
+ return -1;
+
+ /*
+ * Look up the primary key
+ */
+ indexoidlist = RelationGetIndexList(rel);
+
+ foreach(indexoidscan, indexoidlist)
+ {
+ Oid indexoid = lfirst_oid(indexoidscan);
+ HeapTuple indexTuple;
+ Form_pg_index indexForm;
+
+ indexTuple = SearchSysCache1(INDEXRELID,
+ ObjectIdGetDatum(indexoid));
+ if (!HeapTupleIsValid(indexTuple)) /* should not happen */
+ elog(ERROR, "cache lookup failed for index %u", indexoid);
+ indexForm = ((Form_pg_index) GETSTRUCT(indexTuple));
+ if (indexForm->indisprimary)
+ {
+ int i;
+
+ pk_found = true;
+
+ /*
+ * Loop over index attributes to find
+ * the distribution key
+ */
+ for (i = 0; i < indexForm->indnatts; i++)
+ {
+ if (indexForm->indkey.values[i] == partAttNum)
+ {
+ result = i;
+ break;
+ }
+ }
+ }
+ ReleaseSysCache(indexTuple);
+ if (pk_found)
+ break;
+ }
+
+ list_free(indexoidlist);
+
+ return result;
+}
+
+/*
+ * check to see if the constraint can be enforced locally
+ * if not, an error will be thrown
+ */
+static void
+checkLocalFKConstraints(CreateStmtContext *cxt)
+{
+ ListCell *fkclist;
+ List *nodelist = NIL;
+
+ if (cxt->subcluster)
+ nodelist = transformSubclusterNodes(cxt->subcluster);
+
+ foreach(fkclist, cxt->fkconstraints)
+ {
+ Constraint *constraint;
+ Oid pk_rel_id;
+ RelationLocInfo *rel_loc_info;
+ constraint = (Constraint *) lfirst(fkclist);
+
+ /*
+		 * If the constraint references the table itself, it is safe.
+ * Check if relation name is the same
+ * XCTODO: NO! It is only safe if table is replicated
+ * or distributed on primary key
+ */
+ if (constraint->pktable &&
+ strcmp(constraint->pktable->relname,cxt->relation->relname) == 0)
+ {
+ /* Is namespace also the same ? */
+ char *fkcon_schemaname = NULL;
+
+ if (!cxt->relation->schemaname &&
+ !constraint->pktable->schemaname)
+ continue;
+
+ if (!constraint->pktable->schemaname)
+ {
+ /* Schema name is not defined, look for current one */
+ List *search_path = fetch_search_path(false);
+ fkcon_schemaname = get_namespace_name(linitial_oid(search_path));
+ list_free(search_path);
+ }
+ else
+ fkcon_schemaname = constraint->pktable->schemaname;
+
+ /*
+ * If schema name and relation name are the same, table
+ * references to itself, so constraint is safe
+ */
+ if (fkcon_schemaname &&
+ strcmp(fkcon_schemaname,
+ cxt->relation->schemaname) == 0)
+ {
+ /* check if bad distribution is already defined */
+ if ((cxt->distributeby && cxt->distributeby->disttype != DISTTYPE_REPLICATION) ||
+ (cxt->isalter && cxt->rel->rd_locator_info != NULL && !IsLocatorReplicated(cxt->rel->rd_locator_info->locatorType)))
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("only replicated table can reference itself")));
+ /* Record that replication is required */
+ cxt->fallback_source = FBS_REPLICATE;
+ if (cxt->fallback_dist_cols)
+ {
+ list_free_deep(cxt->fallback_dist_cols);
+ cxt->fallback_dist_cols = NULL;
+ }
+ continue;
+ }
+ }
+
+ pk_rel_id = RangeVarGetRelid(constraint->pktable, NoLock, false);
+ rel_loc_info = GetRelationLocInfo(pk_rel_id);
+ /* If referenced table is replicated, the constraint is safe */
+ if (rel_loc_info == NULL || IsLocatorReplicated(rel_loc_info->locatorType))
+ {
+ List *common;
+
+ if (cxt->subcluster)
+ {
+ /*
+ * Distribution nodes are defined, they must be a subset of
+ * the referenced relation's nodes
+ */
+ common = list_intersection_int(nodelist, rel_loc_info->rl_nodeList);
+ if (list_length(common) < list_length(nodelist))
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("referenced table is not defined on all target nodes")));
+ list_free(common);
+ }
+ else
+ {
+ /* suggest distribution */
+ if (nodelist)
+ {
+ common = list_intersection_int(nodelist, rel_loc_info->rl_nodeList);
+ if (list_length(common) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("referenced tables is defined on different nodes")));
+ list_free(nodelist);
+ nodelist = common;
+ }
+ else
+ nodelist = rel_loc_info? list_copy(rel_loc_info->rl_nodeList):NIL;
+ }
+ }
+ else if (rel_loc_info->locatorType == LOCATOR_TYPE_RROBIN)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Cannot reference a round robin table in a foreign key constraint")));
+ }
+ else if (IsLocatorDistributedByValue(rel_loc_info->locatorType))
+ {
+ ListCell *fklc;
+ ListCell *pklc;
+ char ltype;
+ char *lattr;
+ bool found = false;
+ List *common;
+
+ /*
+ * First check nodes, they must be the same as in
+ * the referenced relation
+ */
+ if (cxt->subcluster)
+ {
+ common = list_intersection_int(nodelist, rel_loc_info->rl_nodeList);
+ if (list_length(common) != list_length(rel_loc_info->rl_nodeList) ||
+ list_length(common) != list_length(nodelist))
+ {
+ if (list_length(common) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("referenced HASH/MODULO table must be defined on same nodes")));
+ }
+ list_free(common);
+ }
+ else
+ {
+ if (nodelist)
+ {
+ common = list_intersection_int(nodelist, rel_loc_info->rl_nodeList);
+ if (list_length(common) != list_length(rel_loc_info->rl_nodeList))
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("referenced HASH/MODULO table must be defined on same nodes")));
+ list_free(nodelist);
+ nodelist = common;
+ }
+ else
+ nodelist = list_copy(rel_loc_info->rl_nodeList);
+ /* Now define the subcluster */
+ cxt->subcluster = makeSubCluster(nodelist);
+ }
+
+ if (cxt->distributeby)
+ {
+ ltype = ConvertToLocatorType(cxt->distributeby->disttype);
+ lattr = cxt->distributeby->colname;
+ }
+ else if (cxt->isalter)
+ {
+ if (cxt->rel->rd_locator_info == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ ltype = cxt->rel->rd_locator_info->locatorType;
+ lattr = cxt->rel->rd_locator_info->partAttrName;
+ }
+ else
+ {
+ /*
+			 * Distribution is not defined yet, but we can define it now.
+ * The distribution must be the same as in referenced table,
+ * distribution keys must be matching fk/pk
+ */
+ /*
+ * Can not define distribution by value already
+ */
+ if (cxt->fallback_source == FBS_REPLICATE)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ /* find the fk attribute matching the distribution column */
+ lattr = NULL;
+ if (list_length(constraint->pk_attrs) == 0)
+ {
+ /*
+ * PK attribute list may be missing, so FK must reference
+ * the primary table's primary key. The primary key may
+ * consist of multiple attributes, one of them is a
+ * distribution key. We should find the foreign attribute
+ * referencing that primary attribute and set it as the
+ * distribution key of the table.
+ */
+ int pk_attr_idx;
+ Relation rel;
+
+ rel = relation_open(pk_rel_id, AccessShareLock);
+ pk_attr_idx = find_relation_pk_dist_index(rel);
+ relation_close(rel, AccessShareLock);
+
+ if (pk_attr_idx >= 0 &&
+ pk_attr_idx < list_length(constraint->fk_attrs))
+ {
+ lattr = strVal(list_nth(constraint->fk_attrs, pk_attr_idx));
+ }
+ }
+ else
+ {
+ /*
+ * One of the primary attributes must be the primary
+				 * table's distribution key. We should find the foreign
+ * attribute referencing that primary attribute and set it
+ * as the distribution key of the table.
+ */
+ forboth(fklc, constraint->fk_attrs,
+ pklc, constraint->pk_attrs)
+ {
+ if (strcmp(rel_loc_info->partAttrName,
+ strVal(lfirst(pklc))) == 0)
+ {
+ lattr = strVal(lfirst(fklc));
+ break;
+ }
+ }
+ }
+ /* distribution column is not referenced? */
+ if (lattr == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ foreach(fklc, cxt->fallback_dist_cols)
+ {
+ if (strcmp(lattr, (char *) lfirst(fklc)) == 0)
+ {
+ found = true;
+ break;
+ }
+ }
+ if (found)
+ {
+ list_free_deep(cxt->fallback_dist_cols);
+ cxt->fallback_dist_cols = NIL;
+ cxt->fallback_source = FBS_NONE;
+ cxt->distributeby = makeNode(DistributeBy);
+ switch (rel_loc_info->locatorType)
+ {
+ case LOCATOR_TYPE_HASH:
+ cxt->distributeby->disttype = DISTTYPE_HASH;
+ cxt->distributeby->colname = pstrdup(lattr);
+ break;
+ case LOCATOR_TYPE_MODULO:
+ cxt->distributeby->disttype = DISTTYPE_MODULO;
+ cxt->distributeby->colname = pstrdup(lattr);
+ break;
+ default:
+						/* cannot happen? */
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ }
+ }
+ else /* dist attr is not found */
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ continue;
+ }
+ /*
+ * Here determine if already defined distribution is matching
+ * to distribution of primary table.
+ */
+ if (ltype != rel_loc_info->locatorType || lattr == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ if (list_length(constraint->pk_attrs) == 0)
+ {
+ /*
+ * PK attribute list may be missing, so FK must reference
+ * the primary table's primary key. The primary key may
+ * consist of multiple attributes, one of them is a
+ * distribution key. We should find the foreign attribute
+ * referencing that primary attribute and make sure it is a
+ * distribution key of the table.
+ */
+ int pk_attr_idx;
+ Relation rel;
+
+ rel = relation_open(pk_rel_id, AccessShareLock);
+ pk_attr_idx = find_relation_pk_dist_index(rel);
+ relation_close(rel, AccessShareLock);
+
+ /*
+			 * The first two conditions just avoid an assertion failure in
+			 * list_nth. The first should never happen, because the primary key
+ * of hash/modulo distributed table must contain distribution
+ * key. Second may only happen if list of foreign columns is
+			 * shorter than the primary key. In that case the statement would
+ * probably fail later, but no harm if it fails here.
+ */
+ if (pk_attr_idx >= 0 &&
+ pk_attr_idx < list_length(constraint->fk_attrs) &&
+ strcmp(lattr, strVal(list_nth(constraint->fk_attrs,
+ pk_attr_idx))) == 0)
+ {
+ found = true;
+ }
+ }
+ else
+ {
+ forboth(fklc, constraint->fk_attrs, pklc, constraint->pk_attrs)
+ {
+ if (strcmp(lattr, strVal(lfirst(fklc))) == 0)
+ {
+ found = true;
+ if (strcmp(rel_loc_info->partAttrName,
+ strVal(lfirst(pklc))) == 0)
+ break;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ }
+ }
+ }
+ if (!found)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Hash/Modulo distribution column does not refer"
+ " to hash/modulo distribution column in referenced table.")));
+ }
+ else /* Unsupported distribution */
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Cannot reference a table with distribution type \"%c\"",
+ rel_loc_info->locatorType)));
+ }
+ }
+ /*
+ * If presence of a foreign constraint suggested a set of nodes, fix it here
+ */
+ if (nodelist && cxt->subcluster == NULL)
+ cxt->subcluster = makeSubCluster(nodelist);
+}
+#endif
+
+
+#ifdef XCP
+/*
+ * Convert SubCluster definition to a list of Datanode indexes, to compare to
+ * relation nodes
+ */
+static List *
+transformSubclusterNodes(PGXCSubCluster *subcluster)
+{
+ List *result = NIL;
+ Oid *nodeoids;
+ int numnodes;
+ int i;
+ char nodetype = PGXC_NODE_DATANODE;
+
+ nodeoids = GetRelationDistributionNodes(subcluster, &numnodes);
+ for (i = 0; i < numnodes; i++)
+ result = lappend_int(result, PGXCNodeGetNodeId(nodeoids[i], &nodetype));
+
+ return result;
+}
+
+
+/*
+ * Create a SubCluster definition from a list of node indexes.
+ */
+static PGXCSubCluster *
+makeSubCluster(List *nodelist)
+{
+ PGXCSubCluster *result;
+ ListCell *lc;
+ result = makeNode(PGXCSubCluster);
+ result->clustertype = SUBCLUSTER_NODE;
+ foreach (lc, nodelist)
+ {
+ int nodeidx = lfirst_int(lc);
+ char *nodename = get_pgxc_nodename(
+ PGXCNodeGetNodeOid(nodeidx, PGXC_NODE_DATANODE));
+ result->members = lappend(result->members, makeString(nodename));
+ }
+ return result;
+}
+#endif
++
+ /*
+ * transformPartitionCmd
+ * Analyze the ATTACH/DETACH PARTITION command
+ *
+ * In case of the ATTACH PARTITION command, cxt->partbound is set to the
+ * transformed value of cmd->bound.
+ */
+ static void
+ transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd)
+ {
+ Relation parentRel = cxt->rel;
+
+ /* the table must be partitioned */
+ if (parentRel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("\"%s\" is not partitioned",
+ RelationGetRelationName(parentRel))));
+
+ /* transform the partition bound, if any */
+ Assert(RelationGetPartitionKey(parentRel) != NULL);
+ if (cmd->bound != NULL)
+ cxt->partbound = transformPartitionBound(cxt->pstate, parentRel,
+ cmd->bound);
+ }
+
+ /*
+ * transformPartitionBound
+ *
+ * Transform a partition bound specification
+ */
+ PartitionBoundSpec *
+ transformPartitionBound(ParseState *pstate, Relation parent,
+ PartitionBoundSpec *spec)
+ {
+ PartitionBoundSpec *result_spec;
+ PartitionKey key = RelationGetPartitionKey(parent);
+ char strategy = get_partition_strategy(key);
+ int partnatts = get_partition_natts(key);
+ List *partexprs = get_partition_exprs(key);
+
+ /* Avoid scribbling on input */
+ result_spec = copyObject(spec);
+
+ if (strategy == PARTITION_STRATEGY_LIST)
+ {
+ ListCell *cell;
+ char *colname;
+ Oid coltype;
+ int32 coltypmod;
+
+ if (spec->strategy != PARTITION_STRATEGY_LIST)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+ errmsg("invalid bound specification for a list partition"),
+ parser_errposition(pstate, exprLocation((Node *) spec))));
+
+ /* Get the only column's name in case we need to output an error */
+ if (key->partattrs[0] != 0)
+ colname = get_relid_attribute_name(RelationGetRelid(parent),
+ key->partattrs[0]);
+ else
+ colname = deparse_expression((Node *) linitial(partexprs),
+ deparse_context_for(RelationGetRelationName(parent),
+ RelationGetRelid(parent)),
+ false, false);
+ /* Need its type data too */
+ coltype = get_partition_col_typid(key, 0);
+ coltypmod = get_partition_col_typmod(key, 0);
+
+ result_spec->listdatums = NIL;
+ foreach(cell, spec->listdatums)
+ {
+ A_Const *con = castNode(A_Const, lfirst(cell));
+ Const *value;
+ ListCell *cell2;
+ bool duplicate;
+
+ value = transformPartitionBoundValue(pstate, con,
+ colname, coltype, coltypmod);
+
+ /* Don't add to the result if the value is a duplicate */
+ duplicate = false;
+ foreach(cell2, result_spec->listdatums)
+ {
+ Const *value2 = castNode(Const, lfirst(cell2));
+
+ if (equal(value, value2))
+ {
+ duplicate = true;
+ break;
+ }
+ }
+ if (duplicate)
+ continue;
+
+ result_spec->listdatums = lappend(result_spec->listdatums,
+ value);
+ }
+ }
+ else if (strategy == PARTITION_STRATEGY_RANGE)
+ {
+ ListCell *cell1,
+ *cell2;
+ int i,
+ j;
+ bool seen_unbounded;
+
+ if (spec->strategy != PARTITION_STRATEGY_RANGE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+ errmsg("invalid bound specification for a range partition"),
+ parser_errposition(pstate, exprLocation((Node *) spec))));
+
+ if (list_length(spec->lowerdatums) != partnatts)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+ errmsg("FROM must specify exactly one value per partitioning column")));
+ if (list_length(spec->upperdatums) != partnatts)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+ errmsg("TO must specify exactly one value per partitioning column")));
+
+ /*
+ * Check that no finite value follows an UNBOUNDED item in either of
+ * lower and upper bound lists.
+ */
+ seen_unbounded = false;
+ foreach(cell1, spec->lowerdatums)
+ {
+ PartitionRangeDatum *ldatum = castNode(PartitionRangeDatum,
+ lfirst(cell1));
+
+ if (ldatum->infinite)
+ seen_unbounded = true;
+ else if (seen_unbounded)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("cannot specify finite value after UNBOUNDED"),
+ parser_errposition(pstate, exprLocation((Node *) ldatum))));
+ }
+ seen_unbounded = false;
+ foreach(cell1, spec->upperdatums)
+ {
+ PartitionRangeDatum *rdatum = castNode(PartitionRangeDatum,
+ lfirst(cell1));
+
+ if (rdatum->infinite)
+ seen_unbounded = true;
+ else if (seen_unbounded)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("cannot specify finite value after UNBOUNDED"),
+ parser_errposition(pstate, exprLocation((Node *) rdatum))));
+ }
+
+ /* Transform all the constants */
+ i = j = 0;
+ result_spec->lowerdatums = result_spec->upperdatums = NIL;
+ forboth(cell1, spec->lowerdatums, cell2, spec->upperdatums)
+ {
+ PartitionRangeDatum *ldatum = (PartitionRangeDatum *) lfirst(cell1);
+ PartitionRangeDatum *rdatum = (PartitionRangeDatum *) lfirst(cell2);
+ char *colname;
+ Oid coltype;
+ int32 coltypmod;
+ A_Const *con;
+ Const *value;
+
+ /* Get the column's name in case we need to output an error */
+ if (key->partattrs[i] != 0)
+ colname = get_relid_attribute_name(RelationGetRelid(parent),
+ key->partattrs[i]);
+ else
+ {
+ colname = deparse_expression((Node *) list_nth(partexprs, j),
+ deparse_context_for(RelationGetRelationName(parent),
+ RelationGetRelid(parent)),
+ false, false);
+ ++j;
+ }
+ /* Need its type data too */
+ coltype = get_partition_col_typid(key, i);
+ coltypmod = get_partition_col_typmod(key, i);
+
+ if (ldatum->value)
+ {
+ con = castNode(A_Const, ldatum->value);
+ value = transformPartitionBoundValue(pstate, con,
+ colname,
+ coltype, coltypmod);
+ if (value->constisnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("cannot specify NULL in range bound")));
+ ldatum = copyObject(ldatum); /* don't scribble on input */
+ ldatum->value = (Node *) value;
+ }
+
+ if (rdatum->value)
+ {
+ con = castNode(A_Const, rdatum->value);
+ value = transformPartitionBoundValue(pstate, con,
+ colname,
+ coltype, coltypmod);
+ if (value->constisnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("cannot specify NULL in range bound")));
+ rdatum = copyObject(rdatum); /* don't scribble on input */
+ rdatum->value = (Node *) value;
+ }
+
+ result_spec->lowerdatums = lappend(result_spec->lowerdatums,
+ ldatum);
+ result_spec->upperdatums = lappend(result_spec->upperdatums,
+ rdatum);
+
+ ++i;
+ }
+ }
+ else
+ elog(ERROR, "unexpected partition strategy: %d", (int) strategy);
+
+ return result_spec;
+ }
+
+ /*
+ * Transform one constant in a partition bound spec
+ */
+ static Const *
+ transformPartitionBoundValue(ParseState *pstate, A_Const *con,
+ const char *colName, Oid colType, int32 colTypmod)
+ {
+ Node *value;
+
+ /* Make it into a Const */
+ value = (Node *) make_const(pstate, &con->val, con->location);
+
+ /* Coerce to correct type */
+ value = coerce_to_target_type(pstate,
+ value, exprType(value),
+ colType,
+ colTypmod,
+ COERCION_ASSIGNMENT,
+ COERCE_IMPLICIT_CAST,
+ -1);
+
+ if (value == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("specified value cannot be cast to type %s for column \"%s\"",
+ format_type_be(colType), colName),
+ parser_errposition(pstate, con->location)));
+
+ /* Simplify the expression, in case we had a coercion */
+ if (!IsA(value, Const))
+ value = (Node *) expression_planner((Expr *) value);
+
+ /* Fail if we don't have a constant (i.e., non-immutable coercion) */
+ if (!IsA(value, Const))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("specified value cannot be cast to type %s for column \"%s\"",
+ format_type_be(colType), colName),
+ errdetail("The cast requires a non-immutable conversion."),
+ errhint("Try putting the literal value in single quotes."),
+ parser_errposition(pstate, con->location)));
+
+ return (Const *) value;
+ }
* raw_parser
* Given a query in string form, do lexical and grammatical analysis.
*
- * Returns a list of raw (un-analyzed) parse trees.
+ * Returns a list of raw (un-analyzed) parse trees. The immediate elements
+ * of the list are always RawStmt nodes.
*/
List *
-raw_parser(const char *str)
+raw_parser(const char *str, List **queries)
{
core_yyscan_t yyscanner;
base_yy_extra_type yyextra;
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * pause.c
+ *
+ * Cluster Pause/Unpause handling
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifdef XCP
+#include "postgres.h"
+#include "pgxc/execRemote.h"
+#include "pgxc/pause.h"
+#include "pgxc/pgxc.h"
++#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "miscadmin.h"
+
+/* globals */
+bool cluster_lock_held;
+bool cluster_ex_lock_held;
+
+static void HandleClusterPause(bool pause, bool initiator);
+static void ProcessClusterPauseRequest(bool pause);
+
+ClusterLockInfo *ClustLinfo = NULL;
+
+/*
+ * ProcessClusterPauseRequest:
+ *
+ * Carry out PAUSE/UNPAUSE request on a coordinator node
+ */
+static void
+ProcessClusterPauseRequest(bool pause)
+{
+ char *action = pause? "PAUSE":"UNPAUSE";
+
+ if (!IS_PGXC_COORDINATOR || !IsConnFromCoord())
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("The %s CLUSTER message is expected to "
+ "arrive at a coordinator from another coordinator",
+ action)));
+
+ elog(DEBUG2, "Received %s CLUSTER from a coordinator", action);
+
+ /*
+ * If calling UNPAUSE, ensure that the cluster lock has already been held
+ * in exclusive mode
+ */
+ if (!pause && !cluster_ex_lock_held)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Received an UNPAUSE request when cluster not PAUSED!")));
+
+ /*
+ * Enable/Disable local queries. We need to release the lock first
+ *
+ * TODO: Think of some timeout mechanism here, if the locking takes too
+ * much time...
+ */
+ ReleaseClusterLock(pause? false:true);
+ AcquireClusterLock(pause? true:false);
+
+ if (pause)
+ cluster_ex_lock_held = true;
+ else
+ cluster_ex_lock_held = false;
+
+ elog(DEBUG2, "%s queries at the coordinator", pause? "Paused":"Resumed");
+
+ return;
+}
+
+/*
+ * HandleClusterPause:
+ *
+ * Any errors will be reported via ereport.
+ */
+static void
+HandleClusterPause(bool pause, bool initiator)
+{
+ PGXCNodeAllHandles *coord_handles;
+ int conn;
+ int response;
+ char *action = pause? "PAUSE":"UNPAUSE";
+
+ elog(DEBUG2, "Preparing coordinators for %s CLUSTER", action);
+
+ if (pause && cluster_ex_lock_held)
+ {
+ ereport(NOTICE, (errmsg("CLUSTER already PAUSED")));
+
+ /* Nothing to do */
+ return;
+ }
+
+ if (!pause && !cluster_ex_lock_held)
+ {
+ ereport(NOTICE, (errmsg("Issue PAUSE CLUSTER before calling UNPAUSE")));
+
+ /* Nothing to do */
+ return;
+ }
+
+ /*
+ * If we are one of the participating coordinators, just do the action
+ * locally and return
+ */
+ if (!initiator)
+ {
+ ProcessClusterPauseRequest(pause);
+ return;
+ }
+
+ /*
+ * Send a PAUSE/UNPAUSE CLUSTER message to all the coordinators. We should send an
+ * asynchronous request, update the local ClusterLock and then wait for the remote
+ * coordinators to respond back
+ */
+
+ coord_handles = get_handles(NIL, GetAllCoordNodes(), true, true);
+
+ for (conn = 0; conn < coord_handles->co_conn_count; conn++)
+ {
+ PGXCNodeHandle *handle = coord_handles->coord_handles[conn];
+
+ if (pgxc_node_send_query(handle, pause? "PAUSE CLUSTER" : "UNPAUSE CLUSTER") != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send %s CLUSTER request to some coordinator nodes",action)));
+ }
+
+ /*
+ * Disable/Enable local queries. We need to release the SHARED mode first
+ *
+ * TODO: Start a timer to cancel the request in case of a timeout
+ */
+ ReleaseClusterLock(pause? false:true);
+ AcquireClusterLock(pause? true:false);
+
+ if (pause)
+ cluster_ex_lock_held = true;
+ else
+ cluster_ex_lock_held = false;
+
+
+ elog(DEBUG2, "%s queries at the driving coordinator", pause? "Paused":"Resumed");
+
+ /*
+ * Local queries are paused/enabled. Check status of the remote coordinators
+ * now. We need a TRY/CATCH block here, so that if one of the coordinator
+ * fails for some reason, we can try best-effort to salvage the situation
+ * at others
+ *
+ * We hope that errors in the earlier loop generally do not occur (out of
+ * memory and improper handles..) or we can have a similar TRY/CATCH block
+ * there too
+ *
+ * To repeat: All the salvaging is best effort really...
+ */
+ PG_TRY();
+ {
+ ResponseCombiner combiner;
+
+ InitResponseCombiner(&combiner, coord_handles->co_conn_count, COMBINE_TYPE_NONE);
+ for (conn = 0; conn < coord_handles->co_conn_count; conn++)
+ {
+ PGXCNodeHandle *handle;
+
+ handle = coord_handles->coord_handles[conn];
+
+ while (true)
+ {
+ if (pgxc_node_receive(1, &handle, NULL))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to receive a response from the remote coordinator node")));
+
+ response = handle_response(handle, &combiner);
+ if (response == RESPONSE_EOF)
+ continue;
+ else if (response == RESPONSE_COMPLETE)
+ break;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("%s CLUSTER command failed "
+ "with error %s", action, handle->error)));
+ }
+ }
+
+ if (combiner.errorMessage)
+ {
+ char *code = combiner.errorCode;
+ if (combiner.errorDetail != NULL)
+ ereport(ERROR,
+ (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+ errmsg("%s", combiner.errorMessage), errdetail("%s", combiner.errorDetail) ));
+ else
+ ereport(ERROR,
+ (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+ errmsg("%s", combiner.errorMessage)));
+ }
+
+ CloseCombiner(&combiner);
+ }
+ PG_CATCH();
+ {
+ /*
+ * If PAUSE CLUSTER, issue UNPAUSE on the reachable nodes. For failure
+ * in cases of UNPAUSE, might need manual intervention at the offending
+ * coordinator node (maybe do a pg_cancel_backend() on the backend
+ * that's holding the exclusive lock or something..)
+ */
+ if (!pause)
+ ereport(WARNING,
+ (errmsg("UNPAUSE CLUSTER command failed on one or more coordinator nodes."
+ " Manual intervention may be required!")));
+ else
+ ereport(WARNING,
+ (errmsg("PAUSE CLUSTER command failed on one or more coordinator nodes."
+ " Trying to UNPAUSE reachable nodes now")));
+
+ for (conn = 0; conn < coord_handles->co_conn_count && pause; conn++)
+ {
+ PGXCNodeHandle *handle = coord_handles->coord_handles[conn];
+
+ (void) pgxc_node_send_query(handle, "UNPAUSE CLUSTER");
+
+ /*
+ * The incoming data should hopefully be discarded as part of
+ * cleanup..
+ */
+ }
+
+ /* cleanup locally.. */
+ ReleaseClusterLock(pause? true:false);
+ AcquireClusterLock(pause? false:true);
+ cluster_ex_lock_held = false;
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ elog(DEBUG2, "Successfully completed %s CLUSTER command on "
+ "all coordinator nodes", action);
+
+ return;
+}
+
+/*
+ * RequestClusterPause
+ *
+ * Entry point for the PAUSE/UNPAUSE CLUSTER utility command. Performs
+ * permission and node-type checks, determines whether this coordinator
+ * is the command initiator, and delegates to HandleClusterPause.
+ */
+void
+RequestClusterPause(bool pause, char *completionTag)
+{
+	char	   *action = pause ? "PAUSE" : "UNPAUSE";
+	bool		initiator;
+
+	elog(DEBUG2, "%s CLUSTER request received", action);
+
+	/* Only a superuser can perform this activity on a cluster */
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("%s CLUSTER command: must be a superuser", action)));
+
+	/* Ensure that we are a coordinator */
+	if (!IS_PGXC_COORDINATOR)
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("%s CLUSTER command must be sent to a coordinator", action)));
+
+	/*
+	 * We are the initiator unless the command arrived via another
+	 * coordinator.
+	 */
+	initiator = !IsConnFromCoord();
+
+	HandleClusterPause(pause, initiator);
+
+	if (completionTag)
+		snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "%s CLUSTER", action);
+}
+
+/*
+ * PGXCCleanClusterLock
+ *
+ * Backend-exit callback: release any PAUSE CLUSTER lock state this
+ * backend holds. Runs before shmem shutdown because it needs LWLocks
+ * and other live infrastructure.
+ */
+void
+PGXCCleanClusterLock(int code, Datum arg)
+{
+	PGXCNodeAllHandles *handles;
+	int			i;
+
+	/* Drop a plain (shared) hold on the cluster lock, if we have one */
+	if (cluster_lock_held && !cluster_ex_lock_held)
+	{
+		ReleaseClusterLock(false);
+		cluster_lock_held = false;
+	}
+
+	/* Nothing more to do unless we hold the exclusive (PAUSE) lock */
+	if (!cluster_ex_lock_held)
+		return;
+
+	/* Only the initiating coordinator propagates the UNPAUSE */
+	if (IsConnFromCoord())
+		return;
+
+	/* Best-effort attempt to UNPAUSE every other coordinator */
+	handles = get_handles(NIL, GetAllCoordNodes(), true, true);
+	for (i = 0; i < handles->co_conn_count; i++)
+	{
+		/* Errors are deliberately ignored here */
+		(void) pgxc_node_send_query(handles->coord_handles[i],
+									"UNPAUSE CLUSTER");
+	}
+
+	/* Release locally too. We do not want a dangling value in cl_holder_pid! */
+	ReleaseClusterLock(true);
+	cluster_ex_lock_held = false;
+}
+
+/* Report shared memory space needed by ClusterLockShmemInit */
+Size
+ClusterLockShmemSize(void)
+{
+	/* Just the single shared ClusterLockInfo struct */
+	return add_size((Size) 0, sizeof(ClusterLockInfo));
+}
+
+/* Allocate and initialize cluster locking related shared memory */
+void
+ClusterLockShmemInit(void)
+{
+	bool		found;
+
+	ClustLinfo = (ClusterLockInfo *)
+		ShmemInitStruct("Cluster Lock Info", ClusterLockShmemSize(), &found);
+
+	/* Later backends attach to the already-initialized struct */
+	if (found)
+		return;
+
+	/* First time through, so initialize */
+	MemSet(ClustLinfo, 0, ClusterLockShmemSize());
+	SpinLockInit(&ClustLinfo->cl_mutex);
+}
+
+/*
+ * AcquireClusterLock
+ *
+ * Based on the argument passed in, try to update the shared memory
+ * appropriately. In case the conditions cannot be satisfied immediately this
+ * function resorts to a simple sleep. We don't envision PAUSE CLUSTER to
+ * occur that frequently so most of the calls will come out immediately here
+ * without any sleeps at all
+ *
+ * We could have used a semaphore to allow the processes to sleep while the
+ * cluster lock is held. But again we are really not worried about performance
+ * and immediate wakeups around PAUSE CLUSTER functionality. Using the sleep
+ * in an infinite loop keeps things simple yet correct
+ *
+ * exclusive == false: take a shared hold (bump cl_process_count), allowed
+ * only while no PAUSE CLUSTER holder is registered in cl_holder_pid.
+ * exclusive == true: register this backend as the PAUSE CLUSTER holder,
+ * allowed only once all shared holds have drained.
+ */
+void
+AcquireClusterLock(bool exclusive)
+{
+	volatile ClusterLockInfo *clinfo = ClustLinfo;
+
+	/* Re-entrant PAUSE CLUSTER: we already hold the exclusive lock */
+	if (exclusive && cluster_ex_lock_held)
+	{
+		return;
+	}
+
+	/*
+	 * In the normal case, none of the backends will ask for exclusive lock, so
+	 * they will just update the cl_process_count value and exit immediately
+	 * from the below loop
+	 */
+	for (;;)
+	{
+		bool wait = false;
+
+		SpinLockAcquire(&clinfo->cl_mutex);
+
+		if (!exclusive)
+		{
+			/* Shared hold: only while nobody holds the PAUSE lock */
+			if (clinfo->cl_holder_pid == 0)
+				clinfo->cl_process_count++;
+			else
+				wait = true;
+		}
+		else /* PAUSE CLUSTER handling */
+		{
+			/* A concurrent PAUSE CLUSTER is already in flight: error out */
+			if (clinfo->cl_holder_pid != 0)
+			{
+				SpinLockRelease(&clinfo->cl_mutex);
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("PAUSE CLUSTER already in progress")));
+			}
+
+			/*
+			 * There should be no other process
+			 * holding the lock including ourself
+			 */
+			if (clinfo->cl_process_count > 0)
+				wait = true;
+			else
+				clinfo->cl_holder_pid = MyProcPid;
+		}
+		SpinLockRelease(&clinfo->cl_mutex);
+
+		/*
+		 * We use a simple sleep mechanism. If PAUSE CLUSTER has been invoked,
+		 * we are not worried about immediate performance characteristics..
+		 */
+		if (wait)
+		{
+			CHECK_FOR_INTERRUPTS();
+			pg_usleep(100000L);	/* 100ms between retries */
+		}
+		else /* Got the proper semantic read/write lock.. */
+			break;
+	}
+}
+
+/*
+ * ReleaseClusterLock
+ *
+ * Update the shared memory appropriately across the release call. We
+ * really do not need the bool argument, but it's there for some
+ * additional sanity checking
+ *
+ * exclusive == true: clear cl_holder_pid (UNPAUSE CLUSTER).
+ * exclusive == false: drop one shared hold from cl_process_count.
+ */
+void
+ReleaseClusterLock(bool exclusive)
+{
+	volatile ClusterLockInfo *clinfo = ClustLinfo;
+
+	SpinLockAcquire(&clinfo->cl_mutex);
+	if (exclusive)
+	{
+		/*
+		 * Sanity check: an UNPAUSE with shared holders still registered,
+		 * or no recorded holder at all, indicates corrupted shared state.
+		 * NOTE(review): the count test uses "> 1" rather than "> 0" —
+		 * presumably to tolerate one hold owned by the pausing backend
+		 * itself; confirm against callers.
+		 */
+		if (clinfo->cl_process_count > 1 ||
+			clinfo->cl_holder_pid == 0)
+		{
+			SpinLockRelease(&clinfo->cl_mutex);
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Inconsistent state while doing UNPAUSE CLUSTER")));
+		}
+
+		/*
+		 * Reset the holder pid. Any waiters in AcquireClusterLock will
+		 * eventually come out of their sleep and notice this new value and
+		 * move ahead
+		 */
+		clinfo->cl_holder_pid = 0;
+	}
+	else
+	{
+		/* A shared release must never happen while a PAUSE holder is set */
+		if (clinfo->cl_holder_pid != 0)
+		{
+			SpinLockRelease(&clinfo->cl_mutex);
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Inconsistent state while releasing CLUSTER lock")));
+		}
+		/*
+		 * Decrement our count. If a PAUSE is waiting inside AcquireClusterLock
+		 * elsewhere, it will wake out of sleep and do the needful
+		 */
+		if (clinfo->cl_process_count > 0)
+			clinfo->cl_process_count--;
+	}
+	SpinLockRelease(&clinfo->cl_mutex);
+}
+#endif
--- /dev/null
- case INT2VECTOROID:
- return hashint2vector;
+/*-------------------------------------------------------------------------
+ *
+ * locator.c
+ * Functions that help manage table location information such as
+ * partitioning and replication information.
+ *
+ *
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <time.h>
+
+#include "postgres.h"
+#include "access/skey.h"
+#include "access/gtm.h"
+#include "access/relscan.h"
+#include "catalog/indexing.h"
+#include "catalog/pg_type.h"
+#include "nodes/pg_list.h"
+#include "nodes/nodeFuncs.h"
+#include "utils/builtins.h"
+#include "utils/catcache.h"
+#include "utils/fmgroids.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+#include "utils/relcache.h"
+#include "utils/tqual.h"
+#include "utils/syscache.h"
+#include "nodes/nodes.h"
+#include "optimizer/clauses.h"
+#include "parser/parse_coerce.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/locator.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/pgxcnode.h"
+
+#include "catalog/pgxc_class.h"
+#include "catalog/pgxc_node.h"
+#include "catalog/namespace.h"
+#include "access/hash.h"
+#ifdef XCP
+#include "utils/date.h"
+#include "utils/memutils.h"
+
+/*
+ * Locator details are private
+ *
+ * A Locator maps a distribution-column value onto one or more target
+ * nodes; createLocator() fills in the fields for the chosen locator type.
+ */
+struct _Locator
+{
+	/*
+	 * Determine target nodes for value.
+	 * Resulting nodes are stored to the results array.
+	 * Function returns number of node references written to the array.
+	 */
+	int			(*locatefunc) (Locator *self, Datum value, bool isnull,
+						   bool *hasprimary);
+	Oid			dataType;		/* values of that type are passed to locateNodes function */
+	LocatorListType listType;	/* element type of nodeMap/results entries */
+	bool		primary;		/* NOTE(review): presumably set when a primary node
+								 * participates — confirm against createLocator callers */
+	/* locator-specific data */
+	/* XXX: move them into union ? */
+	int			roundRobinNode; /* for LOCATOR_TYPE_RROBIN */
+	LocatorHashFunc	hashfunc; /* for LOCATOR_TYPE_HASH */
+	int			valuelen; /* 1, 2 or 4 for LOCATOR_TYPE_MODULO */
+
+	int			nodeCount; /* How many nodes are in the map */
+	void		*nodeMap; /* map index to node reference according to listType */
+	void		*results; /* array to output results */
+};
+#endif
+
+/* OID of the primary data node; InvalidOid when none is configured */
+Oid primary_data_node = InvalidOid;
+/* Number of valid entries in preferred_data_node[] */
+int num_preferred_data_nodes = 0;
+/* OIDs of preferred data nodes, used to bias replicated-read balancing */
+Oid preferred_data_node[MAX_PREFERRED_NODES];
+
+#ifdef XCP
+static int modulo_value_len(Oid dataType);
+static LocatorHashFunc hash_func_ptr(Oid dataType);
+static int locate_static(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_roundrobin(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_modulo_random(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_hash_insert(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_hash_select(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_modulo_insert(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_modulo_select(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static Expr * pgxc_find_distcol_expr(Index varno,
+ AttrNumber attrNum,
+ Node *quals);
+#endif
+
+/*
+ * Lookup tables used by compute_modulo() for denominators of the form
+ * 2^s - 1 (3, 7, 15, 31, ...). All three are indexed by s, the bit width
+ * of the modulus.
+ */
+
+/* xc_mod_m[s]: mask keeping the low s bits of each folded "digit" */
+static const unsigned int xc_mod_m[] =
+{
+	0x00000000, 0x55555555, 0x33333333, 0xc71c71c7,
+	0x0f0f0f0f, 0xc1f07c1f, 0x3f03f03f, 0xf01fc07f,
+	0x00ff00ff, 0x07fc01ff, 0x3ff003ff, 0xffc007ff,
+	0xff000fff, 0xfc001fff, 0xf0003fff, 0xc0007fff,
+	0x0000ffff, 0x0001ffff, 0x0003ffff, 0x0007ffff,
+	0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff,
+	0x00ffffff, 0x01ffffff, 0x03ffffff, 0x07ffffff,
+	0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff
+};
+
+/* xc_mod_q[s][i]: shift amounts for successive folding passes */
+static const unsigned int xc_mod_q[][6] =
+{
+	{ 0,  0,  0,  0,  0,  0}, {16,  8,  4,  2,  1,  1}, {16,  8,  4,  2,  2,  2},
+	{15,  6,  3,  3,  3,  3}, {16,  8,  4,  4,  4,  4}, {15,  5,  5,  5,  5,  5},
+	{12,  6,  6,  6 ,  6,  6}, {14,  7,  7,  7,  7,  7}, {16,  8,  8,  8,  8,  8},
+	{ 9,  9,  9,  9,  9,  9}, {10, 10, 10, 10, 10, 10}, {11, 11, 11, 11, 11, 11},
+	{12, 12, 12, 12, 12, 12}, {13, 13, 13, 13, 13, 13}, {14, 14, 14, 14, 14, 14},
+	{15, 15, 15, 15, 15, 15}, {16, 16, 16, 16, 16, 16}, {17, 17, 17, 17, 17, 17},
+	{18, 18, 18, 18, 18, 18}, {19, 19, 19, 19, 19, 19}, {20, 20, 20, 20, 20, 20},
+	{21, 21, 21, 21, 21, 21}, {22, 22, 22, 22, 22, 22}, {23, 23, 23, 23, 23, 23},
+	{24, 24, 24, 24, 24, 24}, {25, 25, 25, 25, 25, 25}, {26, 26, 26, 26, 26, 26},
+	{27, 27, 27, 27, 27, 27}, {28, 28, 28, 28, 28, 28}, {29, 29, 29, 29, 29, 29},
+	{30, 30, 30, 30, 30, 30}, {31, 31, 31, 31, 31, 31}
+};
+
+/* xc_mod_r[s][i]: masks matching the shifts in xc_mod_q for each pass */
+static const unsigned int xc_mod_r[][6] =
+{
+	{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
+	{0x0000ffff, 0x000000ff, 0x0000000f, 0x00000003, 0x00000001, 0x00000001},
+	{0x0000ffff, 0x000000ff, 0x0000000f, 0x00000003, 0x00000003, 0x00000003},
+	{0x00007fff, 0x0000003f, 0x00000007, 0x00000007, 0x00000007, 0x00000007},
+	{0x0000ffff, 0x000000ff, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f},
+	{0x00007fff, 0x0000001f, 0x0000001f, 0x0000001f, 0x0000001f, 0x0000001f},
+	{0x00000fff, 0x0000003f, 0x0000003f, 0x0000003f, 0x0000003f, 0x0000003f},
+	{0x00003fff, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f},
+	{0x0000ffff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff},
+	{0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff},
+	{0x000003ff, 0x000003ff, 0x000003ff, 0x000003ff, 0x000003ff, 0x000003ff},
+	{0x000007ff, 0x000007ff, 0x000007ff, 0x000007ff, 0x000007ff, 0x000007ff},
+	{0x00000fff, 0x00000fff, 0x00000fff, 0x00000fff, 0x00000fff, 0x00000fff},
+	{0x00001fff, 0x00001fff, 0x00001fff, 0x00001fff, 0x00001fff, 0x00001fff},
+	{0x00003fff, 0x00003fff, 0x00003fff, 0x00003fff, 0x00003fff, 0x00003fff},
+	{0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff},
+	{0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff},
+	{0x0001ffff, 0x0001ffff, 0x0001ffff, 0x0001ffff, 0x0001ffff, 0x0001ffff},
+	{0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff},
+	{0x0007ffff, 0x0007ffff, 0x0007ffff, 0x0007ffff, 0x0007ffff, 0x0007ffff},
+	{0x000fffff, 0x000fffff, 0x000fffff, 0x000fffff, 0x000fffff, 0x000fffff},
+	{0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff},
+	{0x003fffff, 0x003fffff, 0x003fffff, 0x003fffff, 0x003fffff, 0x003fffff},
+	{0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff},
+	{0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff},
+	{0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff},
+	{0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff},
+	{0x07ffffff, 0x07ffffff, 0x07ffffff, 0x07ffffff, 0x07ffffff, 0x07ffffff},
+	{0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff},
+	{0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff},
+	{0x3fffffff, 0x3fffffff, 0x3fffffff, 0x3fffffff, 0x3fffffff, 0x3fffffff},
+	{0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}
+};
+
+/*
+ * GetPreferredReplicationNode
+ *	Pick any Datanode from given list, however fetch a preferred node first.
+ */
+List *
+GetPreferredReplicationNode(List *relNodes)
+{
+	ListCell   *cell;
+	int			chosen = -1;
+
+	if (list_length(relNodes) <= 0)
+		elog(ERROR, "a list of nodes should have at least one node");
+
+	/* Scan the candidates for the first one that is a preferred node */
+	foreach(cell, relNodes)
+	{
+		int			i;
+		char		nodetype = PGXC_NODE_DATANODE;
+
+		for (i = 0;
+			 i < num_preferred_data_nodes && chosen < 0;
+			 i++)
+		{
+			if (PGXCNodeGetNodeId(preferred_data_node[i],
+								  &nodetype) == lfirst_int(cell))
+				chosen = lfirst_int(cell);
+		}
+		if (chosen >= 0)
+			break;
+	}
+
+	/* No preferred node present: fall back to a random member */
+	if (chosen < 0)
+		chosen = list_nth_int(relNodes,
+							  ((unsigned int) random()) % list_length(relNodes));
+
+	return list_make1_int(chosen);
+}
+
+/*
+ * GetAnyDataNode
+ *	Pick any data node from given set, but try a preferred node
+ */
+int
+GetAnyDataNode(Bitmapset *nodes)
+{
+	Bitmapset  *candidates = NULL;
+	int			members[NumDataNodes];
+	int			count = 0;
+	int			node;
+	int			i;
+
+	/* Restrict to preferred nodes that are actually in the given set */
+	for (i = 0; i < num_preferred_data_nodes; i++)
+	{
+		char		ntype = PGXC_NODE_DATANODE;
+
+		node = PGXCNodeGetNodeId(preferred_data_node[i], &ntype);
+		if (bms_is_member(node, nodes))
+			candidates = bms_add_member(candidates, node);
+	}
+
+	/*
+	 * If no preferred data nodes or they are not in the desired set, pick
+	 * from the original set.
+	 */
+	if (bms_is_empty(candidates))
+		candidates = bms_copy(nodes);
+
+	/* Flatten the set into an array (bms_first_member is destructive) */
+	while ((node = bms_first_member(candidates)) >= 0)
+		members[count++] = node;
+	bms_free(candidates);
+
+	/* If there is a single member nothing to balance */
+	if (count == 1)
+		return members[0];
+
+	/*
+	 * Pick a fresh random index rather than remembering the last one:
+	 * with sets of varying size a saved index would be reset by small
+	 * sets and skew the distribution towards low indexes.
+	 */
+	return members[((unsigned int) random()) % count];
+}
+
+/*
+ * compute_modulo
+ *	This function performs modulo in an optimized way
+ *	It optimizes modulo of any positive number by
+ *	1,2,3,4,7,8,15,16,31,32,63,64 and so on
+ *	for the rest of the denominators it uses % operator
+ *	The optimized algos have been taken from
+ *	http://www-graphics.stanford.edu/~seander/bithacks.html
+ */
+static int
+compute_modulo(unsigned int numerator, unsigned int denominator)
+{
+	unsigned int d;
+	unsigned int m;
+	unsigned int s;
+	unsigned int mask;
+	int k;
+	unsigned int q, r;
+
+	if (numerator == 0)
+		return 0;
+
+	/* Check if denominator is a power of 2 */
+	if ((denominator & (denominator - 1)) == 0)
+		return numerator & (denominator - 1);
+
+	/* Check if (denominator+1) is a power of 2 */
+	d = denominator + 1;
+	if ((d & (d - 1)) == 0)
+	{
+		/* Which power of 2 is this number */
+		s = 0;
+		mask = 0x01;
+		for (k = 0; k < 32; k++)
+		{
+			if ((d & mask) == mask)
+				break;
+			s++;
+			mask = mask << 1;
+		}
+
+		/* Fold the numerator into s-bit chunks via the lookup tables */
+		m = (numerator & xc_mod_m[s]) + ((numerator >> s) & xc_mod_m[s]);
+
+		/* Keep folding until the partial sum drops to <= denominator */
+		for (q = 0, r = 0; m > denominator; q++, r++)
+			m = (m >> xc_mod_q[s][q]) + (m & xc_mod_r[s][r]);
+
+		/* m == denominator means remainder 0 for a 2^s - 1 modulus */
+		m = m == denominator ? 0 : m;
+
+		return m;
+	}
+	/* General case: plain modulo operator */
+	return numerator % denominator;
+}
+
+/*
+ * GetRelationDistColumn - Returns the name of the hash or modulo distribution column
+ * First hash distribution is checked
+ * Returns NULL if the table is neither hash nor modulo distributed
+ */
+char *
+GetRelationDistColumn(RelationLocInfo * rel_loc_info)
+{
+	char	   *colname = GetRelationHashColumn(rel_loc_info);
+
+	/* Not hash distributed: try modulo */
+	if (colname == NULL)
+		colname = GetRelationModuloColumn(rel_loc_info);
+
+	return colname;
+}
+
+/*
+ * IsTypeHashDistributable
+ *	True when PG-XC knows how to hash-distribute values of this type.
+ * PGXCTODO - expand support for other data types!
+ */
+bool
+IsTypeHashDistributable(Oid col_type)
+{
+	return hash_func_ptr(col_type) != NULL;
+}
+
+/*
+ * GetRelationHashColumn - return hash column for relation.
+ *
+ * Returns NULL if the relation is not hash partitioned.
+ */
+char *
+GetRelationHashColumn(RelationLocInfo * rel_loc_info)
+{
+	size_t		len;
+	char	   *result;
+
+	if (rel_loc_info == NULL ||
+		rel_loc_info->locatorType != LOCATOR_TYPE_HASH)
+		return NULL;
+
+	/* Hand back a palloc'd copy of the distribution column name */
+	len = strlen(rel_loc_info->partAttrName);
+	result = (char *) palloc(len + 1);
+	memcpy(result, rel_loc_info->partAttrName, len + 1);
+
+	return result;
+}
+
+/*
+ * IsHashColumn
+ *	Return whether the named column is the hash distribution column
+ *	of the given relation.
+ */
+bool
+IsHashColumn(RelationLocInfo *rel_loc_info, char *part_col_name)
+{
+	if (rel_loc_info == NULL || part_col_name == NULL)
+		return false;
+	if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH)
+		return false;
+
+	return strcmp(part_col_name, rel_loc_info->partAttrName) == 0;
+}
+
+
+/*
+ * IsHashColumnForRelId
+ *	As IsHashColumn, but looks up the locator info by relation OID.
+ */
+bool
+IsHashColumnForRelId(Oid relid, char *part_col_name)
+{
+	RelationLocInfo *locinfo = GetRelationLocInfo(relid);
+
+	return IsHashColumn(locinfo, part_col_name);
+}
+
+/*
+ * IsDistColumnForRelId
+ *	Return whether the column is used for hash or modulo distribution
+ *	of the relation identified by relid.
+ */
+bool
+IsDistColumnForRelId(Oid relid, char *part_col_name)
+{
+	RelationLocInfo *rel_loc_info = GetRelationLocInfo(relid);
+	bool		bRet;
+
+	bRet = IsHashColumn(rel_loc_info, part_col_name);
+
+	/*
+	 * BUGFIX: the result of the modulo check was previously discarded
+	 * ("IsModuloColumn(...);" on its own line), so this function always
+	 * returned false for modulo-distributed columns. Capture it.
+	 */
+	if (!bRet)
+		bRet = IsModuloColumn(rel_loc_info, part_col_name);
+
+	return bRet;
+}
+
+
+/*
+ * Returns whether or not the data type is modulo distributable with PG-XC
+ * PGXCTODO - expand support for other data types!
+ */
+bool
+IsTypeModuloDistributable(Oid col_type)
+{
+ return (modulo_value_len(col_type) != -1);
+}
+
+/*
+ * GetRelationModuloColumn - return modulo column for relation.
+ *
+ * Returns NULL if the relation is not modulo partitioned.
+ */
+char *
+GetRelationModuloColumn(RelationLocInfo * rel_loc_info)
+{
+	size_t		len;
+	char	   *result;
+
+	if (rel_loc_info == NULL ||
+		rel_loc_info->locatorType != LOCATOR_TYPE_MODULO)
+		return NULL;
+
+	/* Hand back a palloc'd copy of the distribution column name */
+	len = strlen(rel_loc_info->partAttrName);
+	result = (char *) palloc(len + 1);
+	memcpy(result, rel_loc_info->partAttrName, len + 1);
+
+	return result;
+}
+
+/*
+ * IsModuloColumn
+ *	Return whether the named column is the modulo distribution column
+ *	of the given relation.
+ */
+bool
+IsModuloColumn(RelationLocInfo *rel_loc_info, char *part_col_name)
+{
+	if (rel_loc_info == NULL || part_col_name == NULL)
+		return false;
+	if (rel_loc_info->locatorType != LOCATOR_TYPE_MODULO)
+		return false;
+
+	return strcmp(part_col_name, rel_loc_info->partAttrName) == 0;
+}
+
+
+/*
+ * IsModuloColumnForRelId - return whether or not column for relation is used for modulo distribution.
+ */
+bool
+IsModuloColumnForRelId(Oid relid, char *part_col_name)
+{
+ RelationLocInfo *rel_loc_info = GetRelationLocInfo(relid);
+
+ return IsModuloColumn(rel_loc_info, part_col_name);
+}
+
+/*
+ * GetRoundRobinNode
+ *	Return the current round robin node for the relation and advance the
+ *	relcache indicator to the next node (wrapping to the head of the
+ *	node list). Also used for replicated tables to balance reads.
+ *
+ * PGXCTODO - may not want to bother with locking here, we could track
+ * these in the session memory context instead...
+ */
+int
+GetRoundRobinNode(Oid relid)
+{
+	int ret_node;
+	Relation rel = relation_open(relid, AccessShareLock);
+
+	/* Only meaningful for replicated or round robin tables */
+	Assert (IsLocatorReplicated(rel->rd_locator_info->locatorType) ||
+			rel->rd_locator_info->locatorType == LOCATOR_TYPE_RROBIN);
+
+	ret_node = lfirst_int(rel->rd_locator_info->roundRobinNode);
+
+	/* Move round robin indicator to next node */
+	if (rel->rd_locator_info->roundRobinNode->next != NULL)
+		rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->roundRobinNode->next;
+	else
+		/* reset to first one */
+		rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->rl_nodeList->head;
+
+	relation_close(rel, AccessShareLock);
+
+	return ret_node;
+}
+
+/*
+ * IsTableDistOnPrimary
+ *
+ * Does the table distribution list include the primary node?
+ */
+bool
+IsTableDistOnPrimary(RelationLocInfo *rel_loc_info)
+{
+	ListCell   *item;
+
+	/*
+	 * BUGFIX: the emptiness test was written "rl_nodeList = 0" — an
+	 * assignment that clobbered the relation's node list and always
+	 * evaluated to an empty length. Use a proper comparison.
+	 */
+	if (!OidIsValid(primary_data_node) ||
+		rel_loc_info == NULL ||
+		list_length(rel_loc_info->rl_nodeList) == 0)
+		return false;
+
+	foreach(item, rel_loc_info->rl_nodeList)
+	{
+		char		ntype = PGXC_NODE_DATANODE;
+
+		/* Is the primary node one of the relation's nodes? */
+		if (PGXCNodeGetNodeId(primary_data_node, &ntype) == lfirst_int(item))
+			return true;
+	}
+	return false;
+}
+
+
+/*
+ * IsLocatorInfoEqual
+ *	Check equality of given locator information
+ */
+bool
+IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2)
+{
+	List	   *list1,
+			   *list2;
+
+	Assert(rel_loc_info1 && rel_loc_info2);
+
+	/* Must describe the same relation, locator type and attribute */
+	if (rel_loc_info1->relid != rel_loc_info2->relid ||
+		rel_loc_info1->locatorType != rel_loc_info2->locatorType ||
+		rel_loc_info1->partAttrNum != rel_loc_info2->partAttrNum)
+		return false;
+
+	/* Node lists must contain the same members (order-insensitive) */
+	list1 = rel_loc_info1->rl_nodeList;
+	list2 = rel_loc_info2->rl_nodeList;
+	if (list_difference_int(list1, list2) != NIL ||
+		list_difference_int(list2, list1) != NIL)
+		return false;
+
+	/* Everything is equal */
+	return true;
+}
+
+/*
+ * ConvertToLocatorType
+ *	get locator distribution type
+ * We really should just have pgxc_class use disttype instead...
+ */
+char
+ConvertToLocatorType(int disttype)
+{
+	switch (disttype)
+	{
+		case DISTTYPE_HASH:
+			return LOCATOR_TYPE_HASH;
+		case DISTTYPE_ROUNDROBIN:
+			return LOCATOR_TYPE_RROBIN;
+		case DISTTYPE_REPLICATION:
+			return LOCATOR_TYPE_REPLICATED;
+		case DISTTYPE_MODULO:
+			return LOCATOR_TYPE_MODULO;
+		default:
+			ereport(ERROR,
+					(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+					 errmsg("Invalid distribution type")));
+	}
+	return LOCATOR_TYPE_NONE;	/* not reached; keep compiler quiet */
+}
+
+
+/*
+ * GetLocatorType - Returns the locator type of the table,
+ * or '\0' when the relation has no locator information.
+ */
+char
+GetLocatorType(Oid relid)
+{
+	RelationLocInfo *locinfo = GetRelationLocInfo(relid);
+
+	return (locinfo != NULL) ? locinfo->locatorType : '\0';
+}
+
+
+/*
+ * GetAllDataNodes
+ *	Return a list of all Datanode indexes (0 .. NumDataNodes - 1).
+ * We assume all tables use all nodes in the prototype, so just return a
+ * list from first one.
+ */
+List *
+GetAllDataNodes(void)
+{
+	List	   *result = NIL;
+	int			node;
+
+	for (node = 0; node < NumDataNodes; node++)
+		result = lappend_int(result, node);
+
+	return result;
+}
+
+/*
+ * GetAllCoordNodes
+ *	Return a list of all Coordinators except the local one.
+ * This is used to send DDL to all nodes and to clean up pooler connections.
+ */
+List *
+GetAllCoordNodes(void)
+{
+	List	   *result = NIL;
+	int			node;
+
+	for (node = 0; node < NumCoords; node++)
+	{
+		/* Skip ourselves: connecting to the local Coordinator is pointless */
+		if (node == PGXCNodeId - 1)
+			continue;
+
+		result = lappend_int(result, node);
+	}
+
+	return result;
+}
+
+
+/*
+ * RelationBuildLocator
+ *	Build locator information associated with the specified relation.
+ *
+ * Looks up the pgxc_class tuple for the relation; when none exists the
+ * relation is assumed local-only and rd_locator_info is set to NULL.
+ * The RelationLocInfo is allocated in CacheMemoryContext since it lives
+ * with the relcache entry.
+ */
+void
+RelationBuildLocator(Relation rel)
+{
+	Relation	pcrel;
+	ScanKeyData skey;
+	SysScanDesc pcscan;
+	HeapTuple	htup;
+	MemoryContext oldContext;
+	RelationLocInfo *relationLocInfo;
+	int			j;
+	Form_pgxc_class pgxc_class;
+
+	ScanKeyInit(&skey,
+				Anum_pgxc_class_pcrelid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(RelationGetRelid(rel)));
+
+	pcrel = heap_open(PgxcClassRelationId, AccessShareLock);
+	pcscan = systable_beginscan(pcrel, PgxcClassPgxcRelIdIndexId, true,
+								SnapshotSelf, 1, &skey);
+	htup = systable_getnext(pcscan);
+
+	if (!HeapTupleIsValid(htup))
+	{
+		/* Assume local relation only */
+		rel->rd_locator_info = NULL;
+		systable_endscan(pcscan);
+		heap_close(pcrel, AccessShareLock);
+		return;
+	}
+
+	pgxc_class = (Form_pgxc_class) GETSTRUCT(htup);
+
+	/* Locator info must survive with the relcache entry */
+	oldContext = MemoryContextSwitchTo(CacheMemoryContext);
+
+	relationLocInfo = (RelationLocInfo *) palloc(sizeof(RelationLocInfo));
+	rel->rd_locator_info = relationLocInfo;
+
+	relationLocInfo->relid = RelationGetRelid(rel);
+	relationLocInfo->locatorType = pgxc_class->pclocatortype;
+	relationLocInfo->partAttrNum = pgxc_class->pcattnum;
+	relationLocInfo->partAttrName = get_attname(relationLocInfo->relid, pgxc_class->pcattnum);
+	relationLocInfo->rl_nodeList = NIL;
+
+	/* Translate node OIDs stored in pgxc_class into node indexes */
+	for (j = 0; j < pgxc_class->nodeoids.dim1; j++)
+	{
+		char		ntype = PGXC_NODE_DATANODE;
+		int			nid = PGXCNodeGetNodeId(pgxc_class->nodeoids.values[j], &ntype);
+
+		relationLocInfo->rl_nodeList = lappend_int(relationLocInfo->rl_nodeList, nid);
+	}
+
+	/*
+	 * If the locator type is round robin, we set a node to
+	 * use next time. In addition, if it is replicated,
+	 * we choose a node to use for balancing reads.
+	 */
+	if (relationLocInfo->locatorType == LOCATOR_TYPE_RROBIN
+		|| IsLocatorReplicated(relationLocInfo->locatorType))
+	{
+		int			offset;
+
+		/*
+		 * Pick a random node to start with, since each process will do
+		 * this independently.
+		 *
+		 * BUGFIX: srand() was previously called *after* rand(), so the
+		 * seed never influenced the offset actually used. Seed first.
+		 */
+		srand(time(NULL));
+		offset = compute_modulo(abs(rand()), list_length(relationLocInfo->rl_nodeList));
+
+		relationLocInfo->roundRobinNode = relationLocInfo->rl_nodeList->head;	/* initialize */
+		for (j = 0; j < offset && relationLocInfo->roundRobinNode->next != NULL; j++)
+			relationLocInfo->roundRobinNode = relationLocInfo->roundRobinNode->next;
+	}
+
+	systable_endscan(pcscan);
+	heap_close(pcrel, AccessShareLock);
+
+	MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * GetRelationLocInfo - Returns the locator information for relation,
+ * in a copy of the RelationLocatorInfo struct in relcache
+ */
+RelationLocInfo *
+GetRelationLocInfo(Oid relid)
+{
+	RelationLocInfo *result = NULL;
+	Relation	rel = relation_open(relid, AccessShareLock);
+
+	/* Relation needs to be valid */
+	Assert(rel->rd_isvalid);
+
+	if (rel->rd_locator_info != NULL)
+		result = CopyRelationLocInfo(rel->rd_locator_info);
+
+	relation_close(rel, AccessShareLock);
+
+	return result;
+}
+
+/*
+ * GetRelationLocType
+ *	Get the distribution type of relation, or LOCATOR_TYPE_NONE when
+ *	it has no locator information.
+ */
+char
+GetRelationLocType(Oid relid)
+{
+	RelationLocInfo *locinfo = GetRelationLocInfo(relid);
+
+	return locinfo ? locinfo->locatorType : LOCATOR_TYPE_NONE;
+}
+
+/*
+ * CopyRelationLocInfo
+ *	Deep-copy a RelationLocInfo struct.
+ */
+RelationLocInfo *
+CopyRelationLocInfo(RelationLocInfo * src_info)
+{
+	RelationLocInfo *copy;
+
+	Assert(src_info);
+
+	copy = (RelationLocInfo *) palloc0(sizeof(RelationLocInfo));
+	copy->relid = src_info->relid;
+	copy->locatorType = src_info->locatorType;
+	copy->partAttrNum = src_info->partAttrNum;
+	if (src_info->partAttrName)
+		copy->partAttrName = pstrdup(src_info->partAttrName);
+	if (src_info->rl_nodeList)
+		copy->rl_nodeList = list_copy(src_info->rl_nodeList);
+	/* Note, for round robin, we use the relcache entry */
+
+	return copy;
+}
+
+
+/*
+ * FreeRelationLocInfo
+ *	Release a RelationLocInfo struct and the column name it owns.
+ */
+void
+FreeRelationLocInfo(RelationLocInfo *relationLocInfo)
+{
+	if (relationLocInfo == NULL)
+		return;
+
+	if (relationLocInfo->partAttrName)
+		pfree(relationLocInfo->partAttrName);
+	pfree(relationLocInfo);
+}
+
+
+/*
+ * FreeExecNodes
+ *	Free the contents of the ExecNodes expression and reset the pointer.
+ */
+void
+FreeExecNodes(ExecNodes **exec_nodes)
+{
+	ExecNodes  *en = *exec_nodes;
+
+	/* Nothing to do */
+	if (en == NULL)
+		return;
+
+	list_free(en->primarynodelist);
+	list_free(en->nodeList);
+	pfree(en);
+	*exec_nodes = NULL;
+}
+
+
+#ifdef XCP
+/*
+ * modulo_value_len
+ *	Value length in bytes for the given type when used with a modulo
+ *	locator; -1 when the type is not supported.
+ */
+static int
+modulo_value_len(Oid dataType)
+{
+	switch (dataType)
+	{
+		/* one-byte types */
+		case BOOLOID:
+		case CHAROID:
+			return 1;
+		/* two-byte types */
+		case INT2OID:
+			return 2;
+		/* four-byte types */
+		case INT4OID:
+		case ABSTIMEOID:
+		case RELTIMEOID:
+		case DATEOID:
+			return 4;
+	}
+	return -1;
+}
+
+
+/*
+ * hash_func_ptr
+ *	Hashing function to use for values of the given type, or NULL when
+ *	the type is not hash-distributable.
+ */
+static LocatorHashFunc
+hash_func_ptr(Oid dataType)
+{
+	switch (dataType)
+	{
+		case INT8OID:
+		case CASHOID:
+			return hashint8;
+		case INT2OID:
+			return hashint2;
+		case OIDOID:
+			return hashoid;
+		case INT4OID:
+		case ABSTIMEOID:
+		case RELTIMEOID:
+		case DATEOID:
+			return hashint4;
+		case BOOLOID:
+		case CHAROID:
+			return hashchar;
+		case NAMEOID:
+			return hashname;
+		case VARCHAROID:
+		case TEXTOID:
+			return hashtext;
+		case OIDVECTOROID:
+			return hashoidvector;
+		case BPCHAROID:
+			return hashbpchar;
+		case BYTEAOID:
+			return hashvarlena;
+		case TIMEOID:
+			return time_hash;
+		case TIMESTAMPOID:
+		case TIMESTAMPTZOID:
+			return timestamp_hash;
+		case INTERVALOID:
+			return interval_hash;
+		case TIMETZOID:
+			return timetz_hash;
+		case NUMERICOID:
+			return hash_numeric;
+		case UUIDOID:
+			return uuid_hash;
+		default:
+			return NULL;
+	}
+}
+
+
+Locator *
+createLocator(char locatorType, RelationAccessType accessType,
+ Oid dataType, LocatorListType listType, int nodeCount,
+ void *nodeList, void **result, bool primary)
+{
+ Locator *locator;
+ ListCell *lc;
+ void *nodeMap = NULL;
+ int i;
+
+ locator = (Locator *) palloc(sizeof(Locator));
+ locator->dataType = dataType;
+ locator->listType = listType;
+ locator->nodeCount = nodeCount;
+ /* Create node map */
+ switch (listType)
+ {
+ case LOCATOR_LIST_NONE:
+ /* No map, return indexes */
+ break;
+ case LOCATOR_LIST_INT:
+ /* Copy integer array */
+ nodeMap = palloc(nodeCount * sizeof(int));
+ memcpy(nodeMap, nodeList, nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ /* Copy array of Oids */
+ nodeMap = palloc(nodeCount * sizeof(Oid));
+ memcpy(nodeMap, nodeList, nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ /* Copy array of Oids */
+ nodeMap = palloc(nodeCount * sizeof(void *));
+ memcpy(nodeMap, nodeList, nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Create map from list */
+ {
+ List *l = (List *) nodeList;
+ locator->nodeCount = list_length(l);
+ if (IsA(l, IntList))
+ {
+ int *intptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(int));
+ intptr = (int *) nodeMap;
+ foreach(lc, l)
+ *intptr++ = lfirst_int(lc);
+ locator->listType = LOCATOR_LIST_INT;
+ }
+ else if (IsA(l, OidList))
+ {
+ Oid *oidptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(Oid));
+ oidptr = (Oid *) nodeMap;
+ foreach(lc, l)
+ *oidptr++ = lfirst_oid(lc);
+ locator->listType = LOCATOR_LIST_OID;
+ }
+ else if (IsA(l, List))
+ {
+ void **voidptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(void *));
+ voidptr = (void **) nodeMap;
+ foreach(lc, l)
+ *voidptr++ = lfirst(lc);
+ locator->listType = LOCATOR_LIST_POINTER;
+ }
+ else
+ {
+ /* can not get here */
+ Assert(false);
+ }
+ break;
+ }
+ }
+ /*
+ * Determine locatefunc, allocate results, set up parameters
+ * specific to locator type
+ */
+ switch (locatorType)
+ {
+ case LOCATOR_TYPE_REPLICATED:
+ if (accessType == RELATION_ACCESS_INSERT ||
+ accessType == RELATION_ACCESS_UPDATE ||
+ accessType == RELATION_ACCESS_READ_FQS)
+ {
+ locator->locatefunc = locate_static;
+ if (nodeMap == NULL)
+ {
+ /* no map, prepare array with indexes */
+ int *intptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(int));
+ intptr = (int *) nodeMap;
+ for (i = 0; i < locator->nodeCount; i++)
+ *intptr++ = i;
+ }
+ locator->nodeMap = nodeMap;
+ locator->results = nodeMap;
+ }
+ else
+ {
+ /* SELECT, use random node.. */
+ locator->locatefunc = locate_modulo_random;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ locator->roundRobinNode = -1;
+ }
+ break;
+ case LOCATOR_TYPE_RROBIN:
+ if (accessType == RELATION_ACCESS_INSERT)
+ {
+ locator->locatefunc = locate_roundrobin;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ locator->roundRobinNode = -1;
+ }
+ else
+ {
+ locator->locatefunc = locate_static;
+ if (nodeMap == NULL)
+ {
+ /* no map, prepare array with indexes */
+ int *intptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(int));
+ intptr = (int *) nodeMap;
+ for (i = 0; i < locator->nodeCount; i++)
+ *intptr++ = i;
+ }
+ locator->nodeMap = nodeMap;
+ locator->results = nodeMap;
+ }
+ break;
+ case LOCATOR_TYPE_HASH:
+ if (accessType == RELATION_ACCESS_INSERT)
+ {
+ locator->locatefunc = locate_hash_insert;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ }
+ else
+ {
+ locator->locatefunc = locate_hash_select;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(locator->nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(locator->nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(locator->nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ }
+
+ locator->hashfunc = hash_func_ptr(dataType);
+ if (locator->hashfunc == NULL)
+ ereport(ERROR, (errmsg("Error: unsupported data type for HASH locator: %d\n",
+ dataType)));
+ break;
+ case LOCATOR_TYPE_MODULO:
+ if (accessType == RELATION_ACCESS_INSERT)
+ {
+ locator->locatefunc = locate_modulo_insert;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ }
+ else
+ {
+ locator->locatefunc = locate_modulo_select;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(locator->nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(locator->nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(locator->nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ }
+
+ locator->valuelen = modulo_value_len(dataType);
+ if (locator->valuelen == -1)
+ ereport(ERROR, (errmsg("Error: unsupported data type for MODULO locator: %d\n",
+ dataType)));
+ break;
+ default:
+ ereport(ERROR, (errmsg("Error: no such supported locator type: %c\n",
+ locatorType)));
+ }
+
+ if (result)
+ *result = locator->results;
+
+ return locator;
+}
+
+
+void
+freeLocator(Locator *locator)
+{
+ pfree(locator->nodeMap);
+ /*
+ * locator->nodeMap and locator->results may point to the same memory,
+ * do not free it twice
+ */
+ if (locator->results != locator->nodeMap)
+ pfree(locator->results);
+ pfree(locator);
+}
+
+
+/*
+ * Each time return the same predefined results
+ */
+static int
+locate_static(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ /* TODO */
+ if (hasprimary)
+ *hasprimary = false;
+ return self->nodeCount;
+}
+
+
+/*
+ * Each time return one next node, in round robin manner
+ */
+static int
+locate_roundrobin(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ /* TODO */
+ if (hasprimary)
+ *hasprimary = false;
+ if (++self->roundRobinNode >= self->nodeCount)
+ self->roundRobinNode = 0;
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = self->roundRobinNode;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] =
+ ((int *) self->nodeMap)[self->roundRobinNode];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] =
+ ((Oid *) self->nodeMap)[self->roundRobinNode];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] =
+ ((void **) self->nodeMap)[self->roundRobinNode];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+}
+
+/*
+ * Each time return one node, in a random manner
+ * This is similar to locate_modulo_select, but that
+ * function does not use a random modulo..
+ */
+static int
+locate_modulo_random(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ int offset;
+
+ if (hasprimary)
+ *hasprimary = false;
+
+ Assert(self->nodeCount > 0);
+ offset = compute_modulo(abs(rand()), self->nodeCount);
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = offset;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] =
+ ((int *) self->nodeMap)[offset];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] =
+ ((Oid *) self->nodeMap)[offset];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] =
+ ((void **) self->nodeMap)[offset];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+}
+
+/*
+ * Calculate hash from supplied value and use modulo by nodeCount as an index
+ */
+static int
+locate_hash_insert(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ int index;
+ if (hasprimary)
+ *hasprimary = false;
+ if (isnull)
+ index = 0;
+ else
+ {
+ unsigned int hash32;
+
+ hash32 = (unsigned int) DatumGetInt32(DirectFunctionCall1(self->hashfunc, value));
+
+ index = compute_modulo(hash32, self->nodeCount);
+ }
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = index;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+}
+
+
+/*
+ * Calculate hash from supplied value and use modulo by nodeCount as an index
+ * if value is NULL assume no hint and return all the nodes.
+ */
+static int
+locate_hash_select(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ if (hasprimary)
+ *hasprimary = false;
+ if (isnull)
+ {
+ int i;
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ for (i = 0; i < self->nodeCount; i++)
+ ((int *) self->results)[i] = i;
+ break;
+ case LOCATOR_LIST_INT:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return self->nodeCount;
+ }
+ else
+ {
+ unsigned int hash32;
+ int index;
+
+ hash32 = (unsigned int) DatumGetInt32(DirectFunctionCall1(self->hashfunc, value));
+
+ index = compute_modulo(hash32, self->nodeCount);
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = index;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+ }
+}
+
+
+/*
+ * Use modulo of supplied value by nodeCount as an index
+ */
+static int
+locate_modulo_insert(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ int index;
+ if (hasprimary)
+ *hasprimary = false;
+ if (isnull)
+ index = 0;
+ else
+ {
+ unsigned int mod32;
+
+ if (self->valuelen == 4)
+ mod32 = (unsigned int) (GET_4_BYTES(value));
+ else if (self->valuelen == 2)
+ mod32 = (unsigned int) (GET_2_BYTES(value));
+ else if (self->valuelen == 1)
+ mod32 = (unsigned int) (GET_1_BYTE(value));
+ else
+ mod32 = 0;
+
+ index = compute_modulo(mod32, self->nodeCount);
+ }
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = index;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+}
+
+
+/*
+ * Use modulo of supplied value by nodeCount as an index
+ * if value is NULL assume no hint and return all the nodes.
+ */
+static int
+locate_modulo_select(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ if (hasprimary)
+ *hasprimary = false;
+ if (isnull)
+ {
+ int i;
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ for (i = 0; i < self->nodeCount; i++)
+ ((int *) self->results)[i] = i;
+ break;
+ case LOCATOR_LIST_INT:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return self->nodeCount;
+ }
+ else
+ {
+ unsigned int mod32;
+ int index;
+
+ if (self->valuelen == 4)
+ mod32 = (unsigned int) (GET_4_BYTES(value));
+ else if (self->valuelen == 2)
+ mod32 = (unsigned int) (GET_2_BYTES(value));
+ else if (self->valuelen == 1)
+ mod32 = (unsigned int) (GET_1_BYTE(value));
+ else
+ mod32 = 0;
+
+ index = compute_modulo(mod32, self->nodeCount);
+
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = index;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+ }
+}
+
+
+int
+GET_NODES(Locator *self, Datum value, bool isnull, bool *hasprimary)
+{
+ return (*self->locatefunc) (self, value, isnull, hasprimary);
+}
+
+
+void *
+getLocatorResults(Locator *self)
+{
+ return self->results;
+}
+
+
+void *
+getLocatorNodeMap(Locator *self)
+{
+ return self->nodeMap;
+}
+
+
+int
+getLocatorNodeCount(Locator *self)
+{
+ return self->nodeCount;
+}
+#endif
+
+/*
+ * GetRelationNodes
+ *
+ * Get list of relation nodes
+ * If the table is replicated and we are reading, we can just pick one.
+ * If the table is partitioned, we apply partitioning column value, if possible.
+ *
+ * If the relation is partitioned, partValue will be applied if present
+ * (indicating a value appears for partitioning column), otherwise it
+ * is ignored.
+ *
+ * preferredNodes is only used for replicated tables. If set, it will
+ * use one of the nodes specified if the table is replicated on it.
+ * This helps optimize for avoiding introducing additional nodes into the
+ * transaction.
+ *
+ * The returned List is a copy, so it should be freed when finished.
+ */
+ExecNodes *
+GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol,
+ bool isValueNull,
+ RelationAccessType accessType)
+{
+ ExecNodes *exec_nodes;
+ int *nodenums;
+ int i, count;
+ Locator *locator;
+ Oid typeOfValueForDistCol = InvalidOid;
+
+ if (rel_loc_info == NULL)
+ return NULL;
+
+
+ if (IsLocatorDistributedByValue(rel_loc_info->locatorType))
+ {
+ /* A sufficient lock level needs to be taken at a higher level */
+ Relation rel = relation_open(rel_loc_info->relid, NoLock);
+ TupleDesc tupDesc = RelationGetDescr(rel);
+ Form_pg_attribute *attr = tupDesc->attrs;
+ /* Get the hash type of relation */
+ typeOfValueForDistCol = attr[rel_loc_info->partAttrNum - 1]->atttypid;
+ relation_close(rel, NoLock);
+ }
+
+ exec_nodes = makeNode(ExecNodes);
+ exec_nodes->baselocatortype = rel_loc_info->locatorType;
+ exec_nodes->accesstype = accessType;
+
+ locator = createLocator(rel_loc_info->locatorType,
+ accessType,
+ typeOfValueForDistCol,
+ LOCATOR_LIST_LIST,
+ 0,
+ (void *)rel_loc_info->rl_nodeList,
+ (void **)&nodenums,
+ false);
+ count = GET_NODES(locator, valueForDistCol, isValueNull, NULL);
+
+ for (i = 0; i < count; i++)
+ exec_nodes->nodeList = lappend_int(exec_nodes->nodeList, nodenums[i]);
+
+ freeLocator(locator);
+ return exec_nodes;
+}
+
+/*
+ * GetRelationNodesByQuals
+ * A wrapper around GetRelationNodes to reduce the node list by looking at the
+ * quals. varno is assumed to be the varno of reloid inside the quals. No check
+ * is made to see if that's correct.
+ */
+ExecNodes *
+GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info,
+ Index varno, Node *quals, RelationAccessType relaccess)
+{
+ Expr *distcol_expr = NULL;
+ ExecNodes *exec_nodes;
+ Datum distcol_value;
+ bool distcol_isnull;
+
+ if (!rel_loc_info)
+ return NULL;
+ /*
+ * If the table distributed by value, check if we can reduce the Datanodes
+ * by looking at the qualifiers for this relation
+ */
+ if (IsRelationDistributedByValue(rel_loc_info))
+ {
+ Oid disttype = get_atttype(reloid, rel_loc_info->partAttrNum);
+ int32 disttypmod = get_atttypmod(reloid, rel_loc_info->partAttrNum);
+ distcol_expr = pgxc_find_distcol_expr(varno, rel_loc_info->partAttrNum,
+ quals);
+ /*
+ * If the type of expression used to find the Datanode, is not same as
+ * the distribution column type, try casting it. This is same as what
+ * will happen in case of inserting that type of expression value as the
+ * distribution column value.
+ */
+ if (distcol_expr)
+ {
+ distcol_expr = (Expr *)coerce_to_target_type(NULL,
+ (Node *)distcol_expr,
+ exprType((Node *)distcol_expr),
+ disttype, disttypmod,
+ COERCION_ASSIGNMENT,
+ COERCE_IMPLICIT_CAST, -1);
+ /*
+ * PGXC_FQS_TODO: We should set the bound parameters here, but we don't have
+ * PlannerInfo struct and we don't handle them right now.
+ * Even if constant expression mutator changes the expression, it will
+ * only simplify it, keeping the semantics same
+ */
+ distcol_expr = (Expr *)eval_const_expressions(NULL,
+ (Node *)distcol_expr);
+ }
+ }
+
+ if (distcol_expr && IsA(distcol_expr, Const))
+ {
+ Const *const_expr = (Const *)distcol_expr;
+ distcol_value = const_expr->constvalue;
+ distcol_isnull = const_expr->constisnull;
+ }
+ else
+ {
+ distcol_value = (Datum) 0;
+ distcol_isnull = true;
+ }
+
+ exec_nodes = GetRelationNodes(rel_loc_info, distcol_value,
+ distcol_isnull,
+ relaccess);
+ return exec_nodes;
+}
+
+/*
+ * GetRelationDistribColumn
+ * Return hash column name for relation or NULL if relation is not distributed.
+ */
+char *
+GetRelationDistribColumn(RelationLocInfo *locInfo)
+{
+ /* No relation, so simply leave */
+ if (!locInfo)
+ return NULL;
+
+ /* No distribution column if relation is not distributed with a key */
+ if (!IsRelationDistributedByValue(locInfo))
+ return NULL;
+
+ /* Return column name */
+ return get_attname(locInfo->relid, locInfo->partAttrNum);
+}
+
+/*
+ * pgxc_find_distcol_expr
+ * Search through the quals provided and find out an expression which will give
+ * us value of distribution column if exists in the quals. Say for a table
+ * tab1 (val int, val2 int) distributed by hash(val), a query "SELECT * FROM
+ * tab1 WHERE val = fn(x, y, z) and val2 = 3", fn(x,y,z) is the expression which
+ * decides the distribution column value in the rows qualified by this query.
+ * Hence return fn(x, y, z). But for a query "SELECT * FROM tab1 WHERE val =
+ * fn(x, y, z) || val2 = 3", there is no expression which decides the values
+ * distribution column val can take in the qualified rows. So, in such cases
+ * this function returns NULL.
+ */
+static Expr *
+pgxc_find_distcol_expr(Index varno,
+ AttrNumber attrNum,
+ Node *quals)
+{
+ List *lquals;
+ ListCell *qual_cell;
+
+ /* If no quals, no distribution column expression */
+ if (!quals)
+ return NULL;
+
+ /* Convert the qualification into List if it's not already so */
+ if (!IsA(quals, List))
+ lquals = make_ands_implicit((Expr *)quals);
+ else
+ lquals = (List *)quals;
+
+ /*
+ * For every ANDed expression, check if that expression is of the form
+ * <distribution_col> = <expr>. If so return expr.
+ */
+ foreach(qual_cell, lquals)
+ {
+ Expr *qual_expr = (Expr *)lfirst(qual_cell);
+ OpExpr *op;
+ Expr *lexpr;
+ Expr *rexpr;
+ Var *var_expr;
+ Expr *distcol_expr;
+
+ if (!IsA(qual_expr, OpExpr))
+ continue;
+ op = (OpExpr *)qual_expr;
+ /* If not a binary operator, it can not be '='. */
+ if (list_length(op->args) != 2)
+ continue;
+
+ lexpr = linitial(op->args);
+ rexpr = lsecond(op->args);
+
+ /*
+ * If either of the operands is a RelabelType, extract the Var in the RelabelType.
+ * A RelabelType represents a "dummy" type coercion between two binary compatible datatypes.
+ * If we do not handle these then our optimization does not work in case of varchar
+ * For example if col is of type varchar and is the dist key then
+ * select * from vc_tab where col = 'abcdefghijklmnopqrstuvwxyz';
+ * should be shipped to one of the nodes only
+ */
+ if (IsA(lexpr, RelabelType))
+ lexpr = ((RelabelType*)lexpr)->arg;
+ if (IsA(rexpr, RelabelType))
+ rexpr = ((RelabelType*)rexpr)->arg;
+
+ /*
+ * If either of the operands is a Var expression, assume the other
+ * one is distribution column expression. If none is Var check next
+ * qual.
+ */
+ if (IsA(lexpr, Var))
+ {
+ var_expr = (Var *)lexpr;
+ distcol_expr = rexpr;
+ }
+ else if (IsA(rexpr, Var))
+ {
+ var_expr = (Var *)rexpr;
+ distcol_expr = lexpr;
+ }
+ else
+ continue;
+ /*
+ * If Var found is not the distribution column of required relation,
+ * check next qual
+ */
+ if (var_expr->varno != varno || var_expr->varattno != attrNum)
+ continue;
+ /*
+ * If the operator is not an assignment operator, check next
+ * constraint. An operator is an assignment operator if it's
+ * mergejoinable or hashjoinable. Beware that not every assignment
+ * operator is mergejoinable or hashjoinable, so we might leave some
+	 * opportunity. But then we have to rely on the opname which may not
+ * be something we know to be equality operator as well.
+ */
+ if (!op_mergejoinable(op->opno, exprType((Node *)lexpr)) &&
+ !op_hashjoinable(op->opno, exprType((Node *)lexpr)))
+ continue;
+ /* Found the distribution column expression return it */
+ return distcol_expr;
+ }
+ /* Exhausted all quals, but no distribution column expression */
+ return NULL;
+}
--- /dev/null
- /* Do the insertion */
- (void) simple_heap_insert(rel, tup);
-
- CatalogUpdateIndexes(rel, tup);
+/*-------------------------------------------------------------------------
+ *
+ * groupmgr.c
+ * Routines to support manipulation of the pgxc_group catalog
+ * This includes support for DDL on objects NODE GROUP
+ *
+ * Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "miscadmin.h"
+
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "catalog/catalog.h"
+#include "catalog/indexing.h"
+#include "catalog/pg_type.h"
+#include "catalog/pgxc_node.h"
+#include "catalog/pgxc_group.h"
+#include "nodes/parsenodes.h"
+#include "nodes/pg_list.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/array.h"
+#include "pgxc/groupmgr.h"
+
+/*
+ * PgxcGroupCreate
+ *
+ * Create a PGXC node group
+ */
+void
+PgxcGroupCreate(CreateGroupStmt *stmt)
+{
+ const char *group_name = stmt->group_name;
+ List *nodes = stmt->nodes;
+ oidvector *nodes_array;
+ Oid *inTypes;
+ Relation rel;
+ HeapTuple tup;
+ bool nulls[Natts_pgxc_group];
+ Datum values[Natts_pgxc_group];
+ int member_count = list_length(stmt->nodes);
+ ListCell *lc;
+ int i = 0;
+
+ /* Only a DB administrator can add cluster node groups */
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be superuser to create cluster node groups")));
+
+ /* Check if given group already exists */
+ if (OidIsValid(get_pgxc_groupoid(group_name)))
+ ereport(ERROR,
+ (errcode(ERRCODE_DUPLICATE_OBJECT),
+ errmsg("PGXC Group %s: group already defined",
+ group_name)));
+
+ inTypes = (Oid *) palloc(member_count * sizeof(Oid));
+
+ /* Build list of Oids for each node listed */
+ foreach(lc, nodes)
+ {
+ char *node_name = strVal(lfirst(lc));
+ Oid noid = get_pgxc_nodeoid(node_name);
+
+ if (!OidIsValid(noid))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("PGXC Node %s: object not defined",
+ node_name)));
+
+ if (get_pgxc_nodetype(noid) != PGXC_NODE_DATANODE)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("PGXC node %s: only Datanodes can be group members",
+ node_name)));
+
+ /* OK to pick up Oid of this node */
+ inTypes[i] = noid;
+ i++;
+ }
+
+ /* Build array of Oids to be inserted */
+ nodes_array = buildoidvector(inTypes, member_count);
+
+ /* Iterate through all attributes initializing nulls and values */
+ for (i = 0; i < Natts_pgxc_group; i++)
+ {
+ nulls[i] = false;
+ values[i] = (Datum) 0;
+ }
+
+ /* Insert Data correctly */
+ values[Anum_pgxc_group_name - 1] =
+ DirectFunctionCall1(namein, CStringGetDatum(group_name));
+ values[Anum_pgxc_group_members - 1] = PointerGetDatum(nodes_array);
+
+ /* Open the relation for insertion */
+ rel = heap_open(PgxcGroupRelationId, RowExclusiveLock);
+ tup = heap_form_tuple(rel->rd_att, values, nulls);
+
++ CatalogTupleInsert(rel, tup);
+
+ heap_close(rel, RowExclusiveLock);
+}
+
+
+/*
+ * PgxcGroupRemove():
+ *
+ * Remove a PGXC node group
+ */
+void
+PgxcGroupRemove(DropGroupStmt *stmt)
+{
+ Relation relation;
+ HeapTuple tup;
+ const char *group_name = stmt->group_name;
+ Oid group_oid = get_pgxc_groupoid(group_name);
+
+ /* Only a DB administrator can remove cluster node groups */
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be superuser to remove cluster node groups")));
+
+ /* Check if group exists */
+ if (!OidIsValid(group_oid))
+ ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("PGXC Group %s: group not defined",
+ group_name)));
+
+ /* Delete the pgxc_group tuple */
+ relation = heap_open(PgxcGroupRelationId, RowExclusiveLock);
+ tup = SearchSysCache(PGXCGROUPOID, ObjectIdGetDatum(group_oid), 0, 0, 0);
+
+ if (!HeapTupleIsValid(tup)) /* should not happen */
+ elog(ERROR, "PGXC Group %s: group not defined", group_name);
+
+ simple_heap_delete(relation, &tup->t_self);
+
+ ReleaseSysCache(tup);
+
+ heap_close(relation, RowExclusiveLock);
+}
--- /dev/null
- /* Insert tuple in catalog */
- simple_heap_insert(pgxcnodesrel, htup);
-
- CatalogUpdateIndexes(pgxcnodesrel, htup);
+/*-------------------------------------------------------------------------
+ *
+ * nodemgr.c
+ * Routines to support manipulation of the pgxc_node catalog
+ * Support concerns CREATE/ALTER/DROP on NODE object.
+ *
+ * Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "miscadmin.h"
+
+#include "access/hash.h"
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "catalog/catalog.h"
+#include "catalog/indexing.h"
+#include "catalog/pgxc_node.h"
+#include "commands/defrem.h"
+#include "nodes/parsenodes.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/tqual.h"
+#include "pgxc/locator.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/pgxc.h"
++#include "storage/lwlock.h"
++#include "storage/shmem.h"
+
+/*
+ * How many times should we try to find a unique identifier
+ * in case hash of the node name comes out to be duplicate
+ */
+
+#define MAX_TRIES_FOR_NID 200
+
+static Datum generate_node_id(const char *node_name);
+static void count_coords_datanodes(Relation rel, int *num_coord, int *num_dns);
+
+/*
+ * GUC parameters.
+ * Shared memory block can not be resized dynamically, so we should have some
+ * limits set at startup time to calculate amount of shared memory to store
+ * node table. Nodes can be added to running cluster until that limit is reached
+ * if cluster needs grow beyond the configuration value should be changed and
+ * if the cluster needs to grow beyond it, the configuration value should be changed and
+ */
+int MaxCoords = 16;
+int MaxDataNodes = 16;
+
+/* Global number of nodes. Point to a shared memory block */
+static int *shmemNumCoords;
+static int *shmemNumDataNodes;
+
+/* Shared memory tables of node definitions */
+NodeDefinition *coDefs;
+NodeDefinition *dnDefs;
+
+/*
+ * NodeTablesInit
+ * Initializes shared memory tables of Coordinators and Datanodes.
+ */
+void
+NodeTablesShmemInit(void)
+{
+ bool found;
+ int i;
+
+ /*
+ * Initialize the table of Coordinators: first sizeof(int) bytes are to
+ * store actual number of Coordinators, remaining data in the structure is
+ * array of NodeDefinition that can contain up to MaxCoords entries.
+ * That is a bit weird and probably it would be better to have these in
+ * separate structures, but I am unsure about cost of having shmem structure
+ * containing just single integer.
+ */
+ shmemNumCoords = ShmemInitStruct("Coordinator Table",
+ sizeof(int) +
+ sizeof(NodeDefinition) * MaxCoords,
+ &found);
+
+ /* Have coDefs pointing right behind shmemNumCoords */
+ coDefs = (NodeDefinition *) (shmemNumCoords + 1);
+
+ /* Mark it empty upon creation */
+ if (!found)
+ {
+ *shmemNumCoords = 0;
+ /* Mark nodeishealthy true at init time for all */
+ for (i = 0; i < MaxCoords; i++)
+ coDefs[i].nodeishealthy = true;
+ }
+
+ /* Same for Datanodes */
+ shmemNumDataNodes = ShmemInitStruct("Datanode Table",
+ sizeof(int) +
+ sizeof(NodeDefinition) * MaxDataNodes,
+ &found);
+
+ /* Have dnDefs pointing right behind shmemNumDataNodes */
+ dnDefs = (NodeDefinition *) (shmemNumDataNodes + 1);
+
+ /* Mark it empty upon creation */
+ if (!found)
+ {
+ *shmemNumDataNodes = 0;
+ /* Mark nodeishealthy true at init time for all */
+ for (i = 0; i < MaxDataNodes; i++)
+ dnDefs[i].nodeishealthy = true;
+ }
+}
+
+
+/*
+ * NodeTablesShmemSize
+ * Get the size of shared memory dedicated to node definitions
+ */
+Size
+NodeTablesShmemSize(void)
+{
+ Size co_size;
+ Size dn_size;
+
+ co_size = mul_size(sizeof(NodeDefinition), MaxCoords);
+ co_size = add_size(co_size, sizeof(int));
+ dn_size = mul_size(sizeof(NodeDefinition), MaxDataNodes);
+ dn_size = add_size(dn_size, sizeof(int));
+
+ return add_size(co_size, dn_size);
+}
+
+/*
+ * Check list of options and return things filled.
+ * This includes check on option values.
+ */
+static void
+check_node_options(const char *node_name, List *options, char **node_host,
+ int *node_port, char *node_type,
+ bool *is_primary, bool *is_preferred)
+{
+ ListCell *option;
+
+ if (!options)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("No options specified")));
+
+ /* Filter options */
+ foreach(option, options)
+ {
+ DefElem *defel = (DefElem *) lfirst(option);
+
+ if (strcmp(defel->defname, "port") == 0)
+ {
+ *node_port = defGetTypeLength(defel);
+
+ if (*node_port < 1 || *node_port > 65535)
+ ereport(ERROR,
+ (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+ errmsg("port value is out of range")));
+ }
+ else if (strcmp(defel->defname, "host") == 0)
+ {
+ *node_host = defGetString(defel);
+ }
+ else if (strcmp(defel->defname, "type") == 0)
+ {
+ char *type_loc;
+
+ type_loc = defGetString(defel);
+
+ if (strcmp(type_loc, "coordinator") != 0 &&
+ strcmp(type_loc, "datanode") != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("type value is incorrect, specify 'coordinator' or 'datanode'")));
+
+ if (strcmp(type_loc, "coordinator") == 0)
+ *node_type = PGXC_NODE_COORDINATOR;
+ else
+ *node_type = PGXC_NODE_DATANODE;
+ }
+ else if (strcmp(defel->defname, "primary") == 0)
+ {
+ *is_primary = defGetBoolean(defel);
+ }
+ else if (strcmp(defel->defname, "preferred") == 0)
+ {
+ *is_preferred = defGetBoolean(defel);
+ }
+ else
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("incorrect option: %s", defel->defname)));
+ }
+ }
+
+ /* A primary node has to be a Datanode */
+ if (*is_primary && *node_type != PGXC_NODE_DATANODE)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("PGXC node %s: cannot be a primary node, it has to be a Datanode",
+ node_name)));
+
+ /* A preferred node has to be a Datanode */
+ if (*is_preferred && *node_type != PGXC_NODE_DATANODE)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("PGXC node %s: cannot be a preferred node, it has to be a Datanode",
+ node_name)));
+
+ /* Node type check */
+ if (*node_type == PGXC_NODE_NONE)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("PGXC node %s: Node type not specified",
+ node_name)));
+
+#ifdef XCP
+ if (*node_type == PGXC_NODE_DATANODE && NumDataNodes >= MaxDataNodes)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("Too many datanodes, current value of max_datanodes is %d",
+ MaxDataNodes)));
+
+#endif
+}
+
+/*
+ * generate_node_id
+ *
+ * Given a node name, compute its hash to generate the identifier.
+ * If the hash collides with an existing node identifier, probe nearby
+ * values; give up after MAX_TRIES_FOR_NID attempts.
+ */
+static Datum
+generate_node_id(const char *node_name)
+{
+	Datum		node_id;
+	uint32		n;
+	bool		inc;
+	int			i;
+
+	/* Compute node identifier by computing hash of node name */
+	node_id = hash_any((unsigned char *)node_name, strlen(node_name));
+
+	/*
+	 * If the hash is close to the overflow limit probe downwards,
+	 * otherwise probe upwards, so the probe sequence cannot wrap.
+	 */
+	inc = true;
+	n = DatumGetUInt32(node_id);
+	if (n >= UINT_MAX - MAX_TRIES_FOR_NID)
+		inc = false;
+
+	/*
+	 * Check if the identifier clashes with an existing one,
+	 * and if it does, try another value.
+	 */
+	for (i = 0; i < MAX_TRIES_FOR_NID; i++)
+	{
+		HeapTuple	tup;
+
+		tup = SearchSysCache1(PGXCNODEIDENTIFIER, node_id);
+		if (tup == NULL)
+			break;
+
+		ReleaseSysCache(tup);
+
+		n = DatumGetUInt32(node_id);
+		if (inc)
+			n++;
+		else
+			n--;
+
+		node_id = UInt32GetDatum(n);
+	}
+
+	/*
+	 * This has really few chances to happen, but inform backend that node
+	 * has not been registered correctly in this case.
+	 * (The errdetail format previously carried a stray trailing
+	 * "node_name" literal.)
+	 */
+	if (i >= MAX_TRIES_FOR_NID)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+				 errmsg("Please choose different node name."),
+				 errdetail("Name \"%s\" produces a duplicate node identifier",
+						   node_name)));
+
+	return node_id;
+}
+
+/* --------------------------------
+ * cmp_nodes
+ *
+ * qsort comparator ordering XC nodes (referenced by their leading Oid)
+ * in ascending order by node name.  Each node name is looked up only
+ * once per call instead of once per comparison operator, halving the
+ * syscache lookups.
+ * --------------------------------
+ */
+static int
+cmp_nodes(const void *p1, const void *p2)
+{
+	Oid			n1 = *((Oid *)p1);
+	Oid			n2 = *((Oid *)p2);
+	int			res = strcmp(get_pgxc_nodename(n1), get_pgxc_nodename(n2));
+
+	/* Normalize to -1/0/1 as the original coding did */
+	if (res < 0)
+		return -1;
+	if (res == 0)
+		return 0;
+	return 1;
+}
+
+/*
+ * Count the number of coordinators and datanodes configured so far.
+ * Scans pgxc_node with SnapshotSelf so tuples inserted earlier in the
+ * same command are visible; rows of any other node type are ignored.
+ */
+static void
+count_coords_datanodes(Relation rel, int *num_coord, int *num_dns)
+{
+	int			nCoords = 0;
+	int			nDatanodes = 0;
+	HeapScanDesc scan;
+	HeapTuple	tup;
+
+	scan = heap_beginscan(rel, SnapshotSelf, 0, NULL);
+	for (;;)
+	{
+		Form_pgxc_node nodeForm;
+
+		tup = heap_getnext(scan, ForwardScanDirection);
+		if (tup == NULL)
+			break;
+
+		nodeForm = (Form_pgxc_node) GETSTRUCT(tup);
+		if (nodeForm->node_type == PGXC_NODE_COORDINATOR)
+			nCoords++;
+		else if (nodeForm->node_type == PGXC_NODE_DATANODE)
+			nDatanodes++;
+	}
+	heap_endscan(scan);
+
+	*num_coord = nCoords;
+	*num_dns = nDatanodes;
+}
+
+/*
+ * PgxcNodeListAndCount
+ *
+ * Update node definitions in the shared memory tables from the catalog.
+ * Health status values of nodes that exist both before and after the
+ * refresh are carried over; new nodes start out marked healthy.  The
+ * whole refresh runs with NodeTableLock held exclusively.
+ */
+void
+PgxcNodeListAndCount(void)
+{
+	Relation rel;
+	HeapScanDesc scan;
+	HeapTuple tuple;
+	NodeDefinition *nodes = NULL;	/* snapshot of the pre-refresh entries */
+	int	numNodes;
+
+	LWLockAcquire(NodeTableLock, LW_EXCLUSIVE);
+
+	numNodes = *shmemNumCoords + *shmemNumDataNodes;
+
+	Assert((*shmemNumCoords >= 0) && (*shmemNumDataNodes >= 0));
+
+	/*
+	 * Save the existing health status values because nodes
+	 * might get added or deleted here. We will save
+	 * nodeoid, status. No need to differentiate between
+	 * coords and datanodes since oids will be unique anyways.
+	 */
+	if (numNodes > 0)
+	{
+		nodes = (NodeDefinition*)palloc(numNodes * sizeof(NodeDefinition));
+
+		/* Coordinators first, then Datanodes, mirroring the shmem layout */
+		if (*shmemNumCoords > 0)
+			memcpy(nodes, coDefs, *shmemNumCoords * sizeof(NodeDefinition));
+
+		if (*shmemNumDataNodes > 0)
+			memcpy(nodes + *shmemNumCoords, dnDefs,
+				   *shmemNumDataNodes * sizeof(NodeDefinition));
+	}
+
+	/* Reset the counters; the catalog scan below repopulates both tables */
+	*shmemNumCoords = 0;
+	*shmemNumDataNodes = 0;
+
+	/*
+	 * Node information initialization is made in one scan:
+	 * 1) Scan pgxc_node catalog to find the number of nodes for
+	 *	  each node type and make proper allocations
+	 * 2) Then extract the node Oid
+	 * 3) Complete primary/preferred node information
+	 */
+	rel = heap_open(PgxcNodeRelationId, AccessShareLock);
+	scan = heap_beginscan(rel, SnapshotSelf, 0, NULL);
+	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+	{
+		Form_pgxc_node  nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+		NodeDefinition *node;
+		int i;
+
+		/* Take definition for given node type */
+		switch (nodeForm->node_type)
+		{
+			case PGXC_NODE_COORDINATOR:
+				node = &coDefs[(*shmemNumCoords)++];
+				break;
+			case PGXC_NODE_DATANODE:
+			default:
+				/* any non-coordinator entry lands in the datanode table */
+				node = &dnDefs[(*shmemNumDataNodes)++];
+				break;
+		}
+
+		/* Populate the definition */
+		node->nodeoid = HeapTupleGetOid(tuple);
+		memcpy(&node->nodename, &nodeForm->node_name, NAMEDATALEN);
+		memcpy(&node->nodehost, &nodeForm->node_host, NAMEDATALEN);
+		node->nodeport = nodeForm->node_port;
+		node->nodeisprimary = nodeForm->nodeis_primary;
+		node->nodeispreferred = nodeForm->nodeis_preferred;
+		/*
+		 * Copy over the health status from above for nodes that
+		 * existed before and after the refresh. If we do not find
+		 * entry for a nodeoid, we mark it as healthy
+		 */
+		node->nodeishealthy = true;
+		for (i = 0; i < numNodes; i++)
+		{
+			if (nodes[i].nodeoid == node->nodeoid)
+			{
+				node->nodeishealthy = nodes[i].nodeishealthy;
+				break;
+			}
+		}
+	}
+	heap_endscan(scan);
+	heap_close(rel, AccessShareLock);
+
+	elog(DEBUG1, "Done pgxc_nodes scan: %d coordinators and %d datanodes",
+		 *shmemNumCoords, *shmemNumDataNodes);
+
+	/* free the saved snapshot, if any */
+	if (numNodes)
+		pfree(nodes);
+
+	/* Finally sort the lists in ascending order by node name */
+	if (*shmemNumCoords > 1)
+		qsort(coDefs, *shmemNumCoords, sizeof(NodeDefinition), cmp_nodes);
+	if (*shmemNumDataNodes > 1)
+		qsort(dnDefs, *shmemNumDataNodes, sizeof(NodeDefinition), cmp_nodes);
+
+	LWLockRelease(NodeTableLock);
+}
+
+
+/*
+ * PgxcNodeGetOids
+ *
+ * (Header previously said "PgxcNodeGetIds", which does not exist.)
+ * List into palloc'ed arrays Oids of Coordinators and Datanodes currently
+ * presented in the node table, as well as number of Coordinators and Datanodes.
+ * Any parameter may be NULL if caller is not interested in receiving
+ * appropriate results. Preferred and primary node information can be updated
+ * in session if requested.
+ */
+void
+PgxcNodeGetOids(Oid **coOids, Oid **dnOids,
+				int *num_coords, int *num_dns, bool update_preferred)
+{
+	LWLockAcquire(NodeTableLock, LW_SHARED);
+
+	/* The counters live in shared memory and are read under the lock */
+	elog(DEBUG1, "Get OIDs from table: %d coordinators and %d datanodes",
+		 *shmemNumCoords, *shmemNumDataNodes);
+
+	if (num_coords)
+		*num_coords = *shmemNumCoords;
+	if (num_dns)
+		*num_dns = *shmemNumDataNodes;
+
+	if (coOids)
+	{
+		int i;
+
+		*coOids = (Oid *) palloc(*shmemNumCoords * sizeof(Oid));
+		for (i = 0; i < *shmemNumCoords; i++)
+			(*coOids)[i] = coDefs[i].nodeoid;
+	}
+
+	if (dnOids)
+	{
+		int i;
+
+		*dnOids = (Oid *) palloc(*shmemNumDataNodes * sizeof(Oid));
+		for (i = 0; i < *shmemNumDataNodes; i++)
+			(*dnOids)[i] = dnDefs[i].nodeoid;
+	}
+
+	/* Update also preferred and primary node informations if requested */
+	if (update_preferred)
+	{
+		int i;
+
+		/* Initialize primary and preferred node information */
+		primary_data_node = InvalidOid;
+		num_preferred_data_nodes = 0;
+
+		for (i = 0; i < *shmemNumDataNodes; i++)
+		{
+			if (dnDefs[i].nodeisprimary)
+				primary_data_node = dnDefs[i].nodeoid;
+
+			if (dnDefs[i].nodeispreferred)
+			{
+				/* NOTE(review): no bounds check against the capacity of preferred_data_node — confirm array size elsewhere */
+				preferred_data_node[num_preferred_data_nodes] = dnDefs[i].nodeoid;
+				num_preferred_data_nodes++;
+			}
+		}
+	}
+
+	LWLockRelease(NodeTableLock);
+}
+
+/*
+ * PgxcNodeGetHealthMap
+ *
+ * List into caller-provided arrays the Oids of Coordinators and Datanodes
+ * currently presented in the node table, together with their health status,
+ * as well as number of Coordinators and Datanodes.  Any parameter may be
+ * NULL if caller is not interested in receiving appropriate results for
+ * either the Coordinators or Datanodes.
+ */
+void
+PgxcNodeGetHealthMap(Oid *coOids, Oid *dnOids,
+				int *num_coords, int *num_dns, bool *coHealthMap,
+				bool *dnHealthMap)
+{
+	LWLockAcquire(NodeTableLock, LW_SHARED);
+
+	/*
+	 * Read the shared counters only after NodeTableLock is taken; the
+	 * previous coding logged them before acquiring the lock, unlike
+	 * every sibling accessor.
+	 */
+	elog(DEBUG1, "Get HealthMap from table: %d coordinators and %d datanodes",
+		 *shmemNumCoords, *shmemNumDataNodes);
+
+	if (num_coords)
+		*num_coords = *shmemNumCoords;
+	if (num_dns)
+		*num_dns = *shmemNumDataNodes;
+
+	if (coOids)
+	{
+		int i;
+		for (i = 0; i < *shmemNumCoords; i++)
+		{
+			coOids[i] = coDefs[i].nodeoid;
+			if (coHealthMap)
+				coHealthMap[i] = coDefs[i].nodeishealthy;
+		}
+	}
+
+	if (dnOids)
+	{
+		int i;
+
+		for (i = 0; i < *shmemNumDataNodes; i++)
+		{
+			dnOids[i] = dnDefs[i].nodeoid;
+			if (dnHealthMap)
+				dnHealthMap[i] = dnDefs[i].nodeishealthy;
+		}
+	}
+
+	LWLockRelease(NodeTableLock);
+}
+
+/*
+ * Consult the shared memory NodeDefinition structures and
+ * fetch the nodeishealthy value and return it back.
+ *
+ * nodeList carries integer indexes into the shared datanode table
+ * (not Oids, despite the wording of the error message below).
+ *
+ * We will probably need a similar function for coordinators
+ * in the future..
+ */
+void
+PgxcNodeDnListHealth(List *nodeList, bool *healthmap)
+{
+	ListCell   *lc;
+	int			index = 0;
+
+	elog(DEBUG1, "Get healthmap from datanodeList");
+
+	if (!nodeList || !list_length(nodeList))
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("NIL or empty nodeList passed")));
+
+	LWLockAcquire(NodeTableLock, LW_SHARED);
+	foreach(lc, nodeList)
+	{
+		int			node = lfirst_int(lc);
+
+		/*
+		 * Reject negative indexes too: the previous check only guarded
+		 * the upper bound, so a negative entry would read before dnDefs.
+		 */
+		if (node < 0 || node >= *shmemNumDataNodes)
+		{
+			LWLockRelease(NodeTableLock);
+			ereport(ERROR,
+					(errcode(ERRCODE_UNDEFINED_OBJECT),
+					 errmsg("PGXC health status not found for datanode with oid (%d)",
+							node)));
+		}
+		healthmap[index++] = dnDefs[node].nodeishealthy;
+	}
+	LWLockRelease(NodeTableLock);
+}
+
+/*
+ * Find node definition in the shared memory node table.
+ * The structure is a copy palloc'ed in current memory context;
+ * NULL is returned when the Oid is not in the table.
+ */
+NodeDefinition *
+PgxcNodeGetDefinition(Oid node)
+{
+	NodeDefinition *def = NULL;
+	int			idx;
+
+	LWLockAcquire(NodeTableLock, LW_SHARED);
+
+	/* Datanodes are searched first, then Coordinators */
+	for (idx = 0; def == NULL && idx < *shmemNumDataNodes; idx++)
+	{
+		if (dnDefs[idx].nodeoid == node)
+		{
+			def = (NodeDefinition *) palloc(sizeof(NodeDefinition));
+			memcpy(def, dnDefs + idx, sizeof(NodeDefinition));
+		}
+	}
+
+	for (idx = 0; def == NULL && idx < *shmemNumCoords; idx++)
+	{
+		if (coDefs[idx].nodeoid == node)
+		{
+			def = (NodeDefinition *) palloc(sizeof(NodeDefinition));
+			memcpy(def, coDefs + idx, sizeof(NodeDefinition));
+		}
+	}
+
+	/* def stays NULL when nothing matched */
+	LWLockRelease(NodeTableLock);
+	return def;
+}
+
+/*
+ * Update health status of a node in the shared memory node table and
+ * report whether the Oid was found.
+ *
+ * We could try to optimize this by checking if the ishealthy value
+ * is already the same as the passed in one.. but if the cluster is
+ * impaired, dunno how much such optimizations are worth. So keeping
+ * it simple for now
+ */
+bool
+PgxcNodeUpdateHealth(Oid node, bool status)
+{
+	bool		found = false;
+	int			idx;
+
+	LWLockAcquire(NodeTableLock, LW_EXCLUSIVE);
+
+	/* search through the Datanodes first */
+	for (idx = 0; !found && idx < *shmemNumDataNodes; idx++)
+	{
+		if (dnDefs[idx].nodeoid == node)
+		{
+			dnDefs[idx].nodeishealthy = status;
+			found = true;
+		}
+	}
+
+	/* if not found, search through the Coordinators */
+	for (idx = 0; !found && idx < *shmemNumCoords; idx++)
+	{
+		if (coDefs[idx].nodeoid == node)
+		{
+			coDefs[idx].nodeishealthy = status;
+			found = true;
+		}
+	}
+
+	LWLockRelease(NodeTableLock);
+	return found;
+}
+
+/*
+ * PgxcNodeCreate
+ *
+ * Add a PGXC node: validate the statement options and the configured
+ * cluster limits, then insert the new pgxc_node catalog tuple.
+ */
+void
+PgxcNodeCreate(CreateNodeStmt *stmt)
+{
+	Relation	pgxcnodesrel;
+	HeapTuple	htup;
+	bool		nulls[Natts_pgxc_node];
+	Datum		values[Natts_pgxc_node];
+	const char *node_name = stmt->node_name;
+	int			i;
+	/* Options with default values */
+	char	   *node_host = NULL;
+	char		node_type = PGXC_NODE_NONE;
+	int			node_port = 0;
+	bool		is_primary = false;
+	bool		is_preferred = false;
+	Datum		node_id;
+	int			coordCount = 0, dnCount = 0;
+
+	/* Only a DB administrator can add nodes */
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be superuser to create cluster nodes")));
+
+	/* Check that node name is not in use */
+	if (OidIsValid(get_pgxc_nodeoid(node_name)))
+		ereport(ERROR,
+				(errcode(ERRCODE_DUPLICATE_OBJECT),
+				 errmsg("PGXC Node %s: object already defined",
+						node_name)));
+
+	/* Check length of node name */
+	if (strlen(node_name) > PGXC_NODENAME_LENGTH)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+				 errmsg("Node name \"%s\" is too long",
+						node_name)));
+
+	/* Filter options */
+	check_node_options(node_name, stmt->options, &node_host,
+				&node_port, &node_type,
+				&is_primary, &is_preferred);
+
+	/* Compute node identifier */
+	node_id = generate_node_id(node_name);
+
+	/*
+	 * Check that this node is not created as a primary if one already
+	 * exists.
+	 */
+	if (is_primary && OidIsValid(primary_data_node))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("PGXC node %s: two nodes cannot be primary",
+						node_name)));
+
+	/*
+	 * Then assign default values if necessary
+	 * First for port.
+	 */
+	if (node_port == 0)
+	{
+		node_port = 5432;
+		elog(DEBUG1, "PGXC node %s: Applying default port value: %d",
+			 node_name, node_port);
+	}
+
+	/* Then apply default value for host */
+	if (!node_host)
+	{
+		/*
+		 * Use pstrdup, not strdup: the copy is only consumed below within
+		 * this memory context, and a malloc'ed copy would never be freed.
+		 */
+		node_host = pstrdup("localhost");
+		elog(DEBUG1, "PGXC node %s: Applying default host value: %s",
+			 node_name, node_host);
+	}
+
+	/* Iterate through all attributes initializing nulls and values */
+	for (i = 0; i < Natts_pgxc_node; i++)
+	{
+		nulls[i] = false;
+		values[i] = (Datum) 0;
+	}
+
+	/*
+	 * Open the relation for insertion
+	 * This is necessary to generate a unique Oid for the new node
+	 * There could be a relation race here if a similar Oid
+	 * being created before the heap is inserted.
+	 */
+	pgxcnodesrel = heap_open(PgxcNodeRelationId, AccessExclusiveLock);
+
+	/*
+	 * Get the count of datanodes and coordinators added so far and make sure
+	 * we're not exceeding the configured limits
+	 *
+	 * XXX This is not full proof because someone may first set
+	 * max_coordinators or max_datanodes to a high value, add nodes and then
+	 * lower the value again.
+	 */
+	count_coords_datanodes(pgxcnodesrel, &coordCount, &dnCount);
+
+	if ((node_type == PGXC_NODE_DATANODE && dnCount >= MaxDataNodes) ||
+		(node_type == PGXC_NODE_COORDINATOR && coordCount >= MaxCoords))
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("cannot add more than %d %s",
+						node_type == PGXC_NODE_COORDINATOR ?
+						MaxCoords : MaxDataNodes,
+						node_type == PGXC_NODE_COORDINATOR ?
+						"coordinators" : "datanodes"),
+				 errhint("increase the value of %s GUC and restart the cluster",
+						node_type == PGXC_NODE_COORDINATOR ?
+						"max_coordinators" : "max_datanodes"
+						)));
+
+	}
+
+	/* Build entry tuple */
+	values[Anum_pgxc_node_name - 1] = DirectFunctionCall1(namein, CStringGetDatum(node_name));
+	values[Anum_pgxc_node_type - 1] = CharGetDatum(node_type);
+	values[Anum_pgxc_node_port - 1] = Int32GetDatum(node_port);
+	values[Anum_pgxc_node_host - 1] = DirectFunctionCall1(namein, CStringGetDatum(node_host));
+	values[Anum_pgxc_node_is_primary - 1] = BoolGetDatum(is_primary);
+	values[Anum_pgxc_node_is_preferred - 1] = BoolGetDatum(is_preferred);
+	values[Anum_pgxc_node_id - 1] = node_id;
+
+	htup = heap_form_tuple(pgxcnodesrel->rd_att, values, nulls);
+
- simple_heap_update(rel, &oldtup->t_self, newtup);
-
- /* Update indexes */
- CatalogUpdateIndexes(rel, newtup);
++	CatalogTupleInsert(pgxcnodesrel, htup);
+
+	heap_close(pgxcnodesrel, AccessExclusiveLock);
+}
+
+/*
+ * PgxcNodeAlter
+ *
+ * Alter a PGXC node.  The node's current catalog values are loaded first
+ * and act as defaults; options supplied in the ALTER statement then
+ * overwrite them before the tuple is rewritten in place.
+ */
+void
+PgxcNodeAlter(AlterNodeStmt *stmt)
+{
+	const char *node_name = stmt->node_name;
+	char *node_host;
+	char node_type;
+	int node_port;
+	bool is_preferred;
+	bool is_primary;
+	HeapTuple oldtup, newtup;
+	Oid nodeOid = get_pgxc_nodeoid(node_name);
+	Relation rel;
+	Datum new_record[Natts_pgxc_node];
+	bool new_record_nulls[Natts_pgxc_node];
+	bool new_record_repl[Natts_pgxc_node];
+	uint32 node_id;
+	int coordCount = 0, dnCount = 0;
+
+	/* Only a DB administrator can alter cluster nodes */
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be superuser to change cluster nodes")));
+
+	/* Look at the node tuple, and take exclusive lock on it */
+	rel = heap_open(PgxcNodeRelationId, AccessExclusiveLock);
+
+	/* Check that node exists */
+	if (!OidIsValid(nodeOid))
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("PGXC Node %s: object not defined",
+						node_name)));
+
+	/* Open new tuple, checks are performed on it and new values */
+	oldtup = SearchSysCacheCopy1(PGXCNODEOID, ObjectIdGetDatum(nodeOid));
+	if (!HeapTupleIsValid(oldtup))
+		elog(ERROR, "cache lookup failed for object %u", nodeOid);
+
+	/*
+	 * check_options performs some internal checks on option values
+	 * so set up values.  The current catalog values serve as defaults
+	 * for any option absent from the statement.
+	 */
+	node_host = get_pgxc_nodehost(nodeOid);
+	node_port = get_pgxc_nodeport(nodeOid);
+	is_preferred = is_pgxc_nodepreferred(nodeOid);
+	is_primary = is_pgxc_nodeprimary(nodeOid);
+	node_type = get_pgxc_nodetype(nodeOid);
+	node_id = get_pgxc_node_id(nodeOid);
+
+	/* Filter options */
+	check_node_options(node_name, stmt->options, &node_host,
+				&node_port, &node_type,
+				&is_primary, &is_preferred);
+
+	/*
+	 * Two nodes cannot be primary at the same time. If the primary
+	 * node is this node itself, well there is no point in having an
+	 * error.
+	 */
+	if (is_primary &&
+		OidIsValid(primary_data_node) &&
+		nodeOid != primary_data_node)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("PGXC node %s: two nodes cannot be primary",
+						node_name)));
+
+	/*
+	 * Get the count of datanodes and coordinators added so far and make sure
+	 * we're not exceeding the configured limits.
+	 *
+	 * NOTE(review): the counts include the node being altered itself, so an
+	 * ALTER at the limit may error even without adding a node — confirm
+	 * this is intended.
+	 */
+	count_coords_datanodes(rel, &coordCount, &dnCount);
+
+	if ((node_type == PGXC_NODE_DATANODE && dnCount >= MaxDataNodes) ||
+		(node_type == PGXC_NODE_COORDINATOR && coordCount >= MaxCoords))
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("cannot add more than %d %s",
+						node_type == PGXC_NODE_COORDINATOR ?
+						MaxCoords : MaxDataNodes,
+						node_type == PGXC_NODE_COORDINATOR ?
+						"coordinators" : "datanodes"),
+				 errhint("increase the value of %s GUC and restart the cluster",
+						node_type == PGXC_NODE_COORDINATOR ?
+						"max_coordinators" : "max_datanodes"
+						)));
+
+	}
+
+	/* Update values for catalog entry: every mutable column is replaced */
+	MemSet(new_record, 0, sizeof(new_record));
+	MemSet(new_record_nulls, false, sizeof(new_record_nulls));
+	MemSet(new_record_repl, false, sizeof(new_record_repl));
+	new_record[Anum_pgxc_node_port - 1] = Int32GetDatum(node_port);
+	new_record_repl[Anum_pgxc_node_port - 1] = true;
+	new_record[Anum_pgxc_node_host - 1] =
+		DirectFunctionCall1(namein, CStringGetDatum(node_host));
+	new_record_repl[Anum_pgxc_node_host - 1] = true;
+	new_record[Anum_pgxc_node_type - 1] = CharGetDatum(node_type);
+	new_record_repl[Anum_pgxc_node_type - 1] = true;
+	new_record[Anum_pgxc_node_is_primary - 1] = BoolGetDatum(is_primary);
+	new_record_repl[Anum_pgxc_node_is_primary - 1] = true;
+	new_record[Anum_pgxc_node_is_preferred - 1] = BoolGetDatum(is_preferred);
+	new_record_repl[Anum_pgxc_node_is_preferred - 1] = true;
+	new_record[Anum_pgxc_node_id - 1] = UInt32GetDatum(node_id);
+	new_record_repl[Anum_pgxc_node_id - 1] = true;
+
+	/* Update relation */
+	newtup = heap_modify_tuple(oldtup, RelationGetDescr(rel),
+							   new_record,
+							   new_record_nulls, new_record_repl);
++	CatalogTupleUpdate(rel, &oldtup->t_self, newtup);
+
+	/* Release lock at Commit */
+	heap_close(rel, NoLock);
+}
+
+
+/*
+ * PgxcNodeRemove
+ *
+ * Drop a PGXC node: validate the request, then delete the matching
+ * pgxc_node catalog tuple.
+ */
+void
+PgxcNodeRemove(DropNodeStmt *stmt)
+{
+	const char *node_name = stmt->node_name;
+	Oid			noid = get_pgxc_nodeoid(node_name);
+	Relation	rel;
+	HeapTuple	tup;
+
+	/* Only a DB administrator can remove cluster nodes */
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be superuser to remove cluster nodes")));
+
+	/* The node must be known */
+	if (!OidIsValid(noid))
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("PGXC Node %s: object not defined",
+						node_name)));
+
+	/* Refuse to drop the node this backend is running on */
+	if (strcmp(node_name, PGXCNodeName) == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("PGXC Node %s: cannot drop local node",
+						node_name)));
+
+	/* PGXCTODO:
+	 * Is there any group which has this node as member
+	 * XC Tables will also have this as a member in their array
+	 * Do this search in the local data structure.
+	 * If a node is removed, it is necessary to check if there is a distributed
+	 * table on it. If there are only replicated table it is OK.
+	 * However, we have to be sure that there are no pooler agents in the cluster pointing to it.
+	 */
+
+	/* Delete the pgxc_node tuple */
+	rel = heap_open(PgxcNodeRelationId, RowExclusiveLock);
+	tup = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(noid));
+	if (!HeapTupleIsValid(tup)) /* should not happen */
+		ereport(ERROR,
+				(errcode(ERRCODE_UNDEFINED_OBJECT),
+				 errmsg("PGXC Node %s: object not defined",
+						node_name)));
+
+	simple_heap_delete(rel, &tup->t_self);
+
+	ReleaseSysCache(tup);
+
+	heap_close(rel, RowExclusiveLock);
+}
--- /dev/null
- &isnull,
- NULL);
+/*-------------------------------------------------------------------------
+ *
+ * execRemote.c
+ *
+ * Functions to execute commands on remote Datanodes
+ *
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/pgxc/pool/execRemote.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <time.h>
+#include "postgres.h"
+#include "access/twophase.h"
+#include "access/gtm.h"
+#include "access/sysattr.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/relscan.h"
+#include "catalog/pg_type.h"
+#include "catalog/pgxc_node.h"
+#include "commands/prepare.h"
+#include "executor/executor.h"
+#include "gtm/gtm_c.h"
+#include "libpq/libpq.h"
+#include "miscadmin.h"
+#include "pgxc/execRemote.h"
+#include "tcop/tcopprot.h"
+#include "executor/nodeSubplan.h"
+#include "nodes/nodeFuncs.h"
+#include "pgstat.h"
+#include "nodes/nodes.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/var.h"
+#include "pgxc/copyops.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/poolmgr.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/pg_rusage.h"
+#include "utils/tuplesort.h"
+#include "utils/snapmgr.h"
+#include "utils/builtins.h"
+#include "pgxc/locator.h"
+#include "pgxc/pgxc.h"
+#include "parser/parse_type.h"
+#include "parser/parsetree.h"
+#include "pgxc/xc_maintenance_mode.h"
+
+/*
+ * We do not want this timeout to be too long: when a query is terminating
+ * abnormally we just want to read in the data that is already available.
+ * If the datanode connection reaches a consistent state after that, we
+ * follow the normal clean-up procedure (send down ABORT etc.); if the
+ * datanode is not responding, we signal the pooler to drop the connection.
+ * It is better to drop and recreate a datanode connection than to wait
+ * several seconds while it is cleaned up when, for example, cancelling a
+ * query.
+ */
+#define END_QUERY_TIMEOUT 1000
+
+/* Declarations used by guc.c */
+int PGXLRemoteFetchSize;
+
+typedef struct
+{
+ xact_callback function;
+ void *fparams;
+} abort_callback_type;
+
+/*
+ * Buffer size does not affect performance significantly, just do not allow
+ * connection buffer grows infinitely
+ */
+#define COPY_BUFFER_SIZE 8192
+#define PRIMARY_NODE_WRITEAHEAD 1024 * 1024
+
+/*
+ * Flag to track if a temporary object is accessed by the current transaction
+ */
+static bool temp_object_included = false;
+static abort_callback_type dbcleanup_info = { NULL, NULL };
+
+static int pgxc_node_begin(int conn_count, PGXCNodeHandle ** connections,
+ GlobalTransactionId gxid, bool need_tran_block,
+ bool readOnly, char node_type);
+
+static PGXCNodeAllHandles *get_exec_connections(RemoteQueryState *planstate,
+ ExecNodes *exec_nodes,
+ RemoteQueryExecType exec_type,
+ bool is_global_session);
+
+
+static bool pgxc_start_command_on_connection(PGXCNodeHandle *connection,
+ RemoteQueryState *remotestate, Snapshot snapshot);
+
+static void pgxc_node_remote_count(int *dnCount, int dnNodeIds[],
+ int *coordCount, int coordNodeIds[]);
+static char *pgxc_node_remote_prepare(char *prepareGID, bool localNode);
+static bool pgxc_node_remote_finish(char *prepareGID, bool commit,
+ char *nodestring, GlobalTransactionId gxid,
+ GlobalTransactionId prepare_gxid);
+static void pgxc_node_remote_commit(void);
+static void pgxc_node_remote_abort(void);
+static void pgxc_connections_cleanup(ResponseCombiner *combiner);
+
+static void pgxc_node_report_error(ResponseCombiner *combiner);
+
+#define REMOVE_CURR_CONN(combiner) \
+ if ((combiner)->current_conn < --((combiner)->conn_count)) \
+ { \
+ (combiner)->connections[(combiner)->current_conn] = \
+ (combiner)->connections[(combiner)->conn_count]; \
+ } \
+ else \
+ (combiner)->current_conn = 0
+
+#define MAX_STATEMENTS_PER_TRAN 10
+
+/* Variables to collect statistics */
+static int total_transactions = 0;
+static int total_statements = 0;
+static int total_autocommit = 0;
+static int nonautocommit_2pc = 0;
+static int autocommit_2pc = 0;
+static int current_tran_statements = 0;
+static int *statements_per_transaction = NULL;
+static int *nodes_per_transaction = NULL;
+
+/*
+ * statistics collection: account for one executed statement, both in the
+ * global counter and in the counter for the current transaction.
+ */
+static void
+stat_statement()
+{
+	++total_statements;
+	++current_tran_statements;
+}
+
+/*
+ * To collect statistics: count a transaction.
+ *
+ * Histogram arrays are allocated lazily with malloc (they must survive
+ * memory context resets).  Allocation failure is tolerated: statistics
+ * are best-effort, so on OOM the sample is simply skipped instead of
+ * dereferencing a NULL pointer as the previous coding did.
+ */
+static void
+stat_transaction(int node_count)
+{
+	total_transactions++;
+
+	if (!statements_per_transaction)
+	{
+		statements_per_transaction = (int *) malloc((MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int));
+		if (statements_per_transaction)
+			memset(statements_per_transaction, 0, (MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int));
+	}
+	if (statements_per_transaction)
+	{
+		/* transactions with too many statements land in the overflow bucket */
+		if (current_tran_statements > MAX_STATEMENTS_PER_TRAN)
+			statements_per_transaction[MAX_STATEMENTS_PER_TRAN]++;
+		else
+			statements_per_transaction[current_tran_statements]++;
+	}
+	current_tran_statements = 0;
+	if (node_count > 0 && node_count <= NumDataNodes)
+	{
+		if (!nodes_per_transaction)
+		{
+			nodes_per_transaction = (int *) malloc(NumDataNodes * sizeof(int));
+			if (nodes_per_transaction)
+				memset(nodes_per_transaction, 0, NumDataNodes * sizeof(int));
+		}
+		if (nodes_per_transaction)
+			nodes_per_transaction[node_count - 1]++;
+	}
+}
+
+
+/*
+ * Output collected statistics to the log (DEBUG1 level).
+ */
+static void
+stat_log()
+{
+	elog(DEBUG1, "Total Transactions: %d Total Statements: %d", total_transactions, total_statements);
+	elog(DEBUG1, "Autocommit: %d 2PC for Autocommit: %d 2PC for non-Autocommit: %d",
+		 total_autocommit, autocommit_2pc, nonautocommit_2pc);
+	if (total_transactions)
+	{
+		if (statements_per_transaction)
+		{
+			int i;
+
+			for (i = 0; i < MAX_STATEMENTS_PER_TRAN; i++)
+				elog(DEBUG1, "%d Statements per Transaction: %d (%d%%)",
+					 i, statements_per_transaction[i], statements_per_transaction[i] * 100 / total_transactions);
+
+			/*
+			 * Report the overflow bucket here, under the NULL guard: the
+			 * previous coding emitted this line outside the guard and
+			 * would dereference a NULL statements_per_transaction.
+			 */
+			elog(DEBUG1, "%d+ Statements per Transaction: %d (%d%%)",
+				 MAX_STATEMENTS_PER_TRAN, statements_per_transaction[MAX_STATEMENTS_PER_TRAN], statements_per_transaction[MAX_STATEMENTS_PER_TRAN] * 100 / total_transactions);
+		}
+		if (nodes_per_transaction)
+		{
+			int i;
+
+			for (i = 0; i < NumDataNodes; i++)
+				elog(DEBUG1, "%d Nodes per Transaction: %d (%d%%)",
+					 i + 1, nodes_per_transaction[i], nodes_per_transaction[i] * 100 / total_transactions);
+		}
+	}
+}
+
+
+/*
+ * Create a structure to store parameters needed to combine responses from
+ * multiple connections as well as state information.  Every field the
+ * combiner machinery relies on is reset to its idle value; the caller
+ * supplies the expected connection count and the combine policy.
+ */
+void
+InitResponseCombiner(ResponseCombiner *combiner, int node_count,
+					 CombineType combine_type)
+{
+	combiner->node_count = node_count;
+	combiner->connections = NULL;
+	combiner->conn_count = 0;
+	combiner->combine_type = combine_type;
+	/* counters of per-connection protocol messages seen so far */
+	combiner->command_complete_count = 0;
+	combiner->request_type = REQUEST_TYPE_NOT_DEFINED;
+	combiner->description_count = 0;
+	combiner->copy_in_count = 0;
+	combiner->copy_out_count = 0;
+	combiner->copy_file = NULL;
+	/* deferred error state; reported later via pgxc_node_report_error */
+	combiner->errorMessage = NULL;
+	combiner->errorDetail = NULL;
+	combiner->errorHint = NULL;
+	combiner->tuple_desc = NULL;
+	combiner->probing_primary = false;
+	combiner->returning_node = InvalidOid;
+	combiner->currentRow = NULL;
+	combiner->rowBuffer = NIL;
+	/* merge-sort / tuplestore machinery starts disabled */
+	combiner->tapenodes = NULL;
+	combiner->merge_sort = false;
+	combiner->extended_query = false;
+	combiner->tapemarks = NULL;
+	combiner->tuplesortstate = NULL;
+	combiner->cursor = NULL;
+	combiner->update_cursor = NULL;
+	combiner->cursor_count = 0;
+	combiner->cursor_connections = NULL;
+	combiner->remoteCopyType = REMOTE_COPY_NONE;
+}
+
+
+/*
+ * Parse out row count from the command status response and convert it to
+ * integer.  Command tags look like "INSERT 0 5" or "UPDATE 3": a non-digit
+ * resets the accumulator, so only the trailing number is kept.  Returns
+ * the number of digits in that trailing number (0 when there is none).
+ */
+static int
+parse_row_count(const char *message, size_t len, uint64 *rowcount)
+{
+	int			digits = 0;
+	size_t		pos;
+
+	*rowcount = 0;
+
+	/*
+	 * Guard against an empty message: with len == 0 the unsigned
+	 * expression len - 1 below would wrap around and the loop would
+	 * read far past the buffer.
+	 */
+	if (len == 0)
+		return 0;
+
+	/* skip \0 string terminator */
+	for (pos = 0; pos < len - 1; pos++)
+	{
+		if (message[pos] >= '0' && message[pos] <= '9')
+		{
+			*rowcount = *rowcount * 10 + message[pos] - '0';
+			digits++;
+		}
+		else
+		{
+			/* non-digit: discard any number parsed so far */
+			*rowcount = 0;
+			digits = 0;
+		}
+	}
+	return digits;
+}
+
+/*
+ * Convert RowDescription message to a TupleDesc
+ *
+ * The payload follows the frontend/backend protocol 'T' message layout,
+ * extended with a type name per attribute: a 16-bit attribute count,
+ * then per attribute its name, type name, table OID, column number,
+ * type OID, type length, type modifier and format flag.  Only the name,
+ * type name and typmod are consumed here.
+ *
+ * NOTE(review): len is not used for bounds checking — the message is
+ * trusted to be well formed; confirm callers validate the length.
+ */
+static TupleDesc
+create_tuple_desc(char *msg_body, size_t len)
+{
+	TupleDesc 	result;
+	int 			i, nattr;
+	uint16		n16;
+
+	/* get number of attributes (16-bit network byte order) */
+	memcpy(&n16, msg_body, 2);
+	nattr = ntohs(n16);
+	msg_body += 2;
+
+	result = CreateTemplateTupleDesc(nattr, false);
+
+	/* decode attributes */
+	for (i = 1; i <= nattr; i++)
+	{
+		AttrNumber	attnum;
+		char		*attname;
+		char		*typname;
+		Oid 		oidtypeid;
+		int32 		typemode, typmod;
+
+		attnum = (AttrNumber) i;
+
+		/* attribute name (null-terminated) */
+		attname = msg_body;
+		msg_body += strlen(attname) + 1;
+
+		/* type name (null-terminated) */
+		typname = msg_body;
+		msg_body += strlen(typname) + 1;
+
+		/* table OID, ignored */
+		msg_body += 4;
+
+		/* column no, ignored */
+		msg_body += 2;
+
+		/* data type OID, ignored */
+		msg_body += 4;
+
+		/* type len, ignored */
+		msg_body += 2;
+
+		/* type mod (32-bit network byte order) */
+		memcpy(&typemode, msg_body, 4);
+		typmod = ntohl(typemode);
+		msg_body += 4;
+
+		/* PGXCTODO text/binary flag? */
+		msg_body += 2;
+
+		/*
+		 * Resolve the type by its name rather than the transmitted OID —
+		 * presumably because type OIDs can differ between nodes; confirm.
+		 */
+		parseTypeString(typname, &oidtypeid, NULL, false);
+
+		TupleDescInitEntry(result, attnum, attname, oidtypeid, typmod, 0);
+	}
+	return result;
+}
+
+/*
+ * Handle CopyOutCommandComplete ('c') message from a Datanode connection
+ */
+static void
+HandleCopyOutComplete(ResponseCombiner *combiner)
+{
+	/* Once an error is flagged, further messages are ignored */
+	if (combiner->request_type == REQUEST_TYPE_ERROR)
+		return;
+
+	/* The first response fixes the request type */
+	if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+		combiner->request_type = REQUEST_TYPE_COPY_OUT;
+
+	if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
+		/* Inconsistent responses */
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("Unexpected response from the Datanodes for 'c' message, current request type %d", combiner->request_type)));
+
+	/* Just do nothing, close message is managed by the Coordinator */
+	combiner->copy_out_count++;
+}
+
+/*
+ * Handle CommandComplete ('C') message from a Datanode connection
+ *
+ * Folds the per-node row count carried in the command tag into
+ * estate->es_processed according to the configured combine type, and
+ * detects an unexpected ROLLBACK tag when response checking is enabled.
+ */
+static void
+HandleCommandComplete(ResponseCombiner *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn)
+{
+	int 			digits = 0;
+	EState		   *estate = combiner->ss.ps.state;
+
+	/*
+	 * If we did not receive description we are having rowcount or OK response
+	 */
+	if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+		combiner->request_type = REQUEST_TYPE_COMMAND;
+	/* Extract rowcount */
+	if (combiner->combine_type != COMBINE_TYPE_NONE && estate)
+	{
+		uint64	rowcount;
+		digits = parse_row_count(msg_body, len, &rowcount);
+		if (digits > 0)
+		{
+			/* Replicated write, make sure they are the same */
+			if (combiner->combine_type == COMBINE_TYPE_SAME)
+			{
+				if (combiner->command_complete_count)
+				{
+					/*
+					 * Replicated command may succeed on one node and fail on
+					 * another. The example is if distributed table referenced
+					 * by a foreign key constraint defined on a partitioned
+					 * table. If command deletes rows from the replicated table
+					 * they may be referenced on one Datanode but not on other.
+					 * So, replicated command on each Datanode either affects
+					 * proper number of rows, or returns error. Here if
+					 * combiner got an error already, we allow to report it,
+					 * not the scaring data corruption message.
+					 */
+					if (combiner->errorMessage == NULL && rowcount != estate->es_processed)
+						/* There is a consistency issue in the database with the replicated table */
+						ereport(ERROR,
+								(errcode(ERRCODE_DATA_CORRUPTED),
+								 errmsg("Write to replicated table returned different results from the Datanodes")));
+				}
+				else
+					/* first result */
+					estate->es_processed = rowcount;
+			}
+			else
+				estate->es_processed += rowcount;
+		}
+		else
+			combiner->combine_type = COMBINE_TYPE_NONE;
+	}
+
+	/* If response checking is enable only then do further processing */
+	if (conn->ck_resp_rollback)
+	{
+		if (strcmp(msg_body, "ROLLBACK") == 0)
+		{
+			/*
+			 * Subsequent clean up routine will be checking this flag
+			 * to determine nodes where to send ROLLBACK PREPARED.
+			 * On current node PREPARE has failed and the two-phase record
+			 * does not exist, so clean this flag as if PREPARE was not sent
+			 * to that node and avoid erroneous command.
+			 */
+			conn->ck_resp_rollback = false;
+			/*
+			 * Set the error, if none, to force throwing.
+			 * If there is error already, it will be thrown anyway, do not add
+			 * this potentially confusing message
+			 */
+			if (combiner->errorMessage == NULL)
+			{
+				MemoryContext oldcontext = MemoryContextSwitchTo(ErrorContext);
+				combiner->errorMessage =
+								pstrdup("unexpected ROLLBACK from remote node");
+				MemoryContextSwitchTo(oldcontext);
+				/*
+				 * ERRMSG_PRODUCER_ERROR
+				 * Messages with this code are replaced by others, if they are
+				 * received, so if node will send relevant error message that
+				 * one will be replaced.
+				 */
+				combiner->errorCode[0] = 'X';
+				combiner->errorCode[1] = 'X';
+				combiner->errorCode[2] = '0';
+				combiner->errorCode[3] = '1';
+				combiner->errorCode[4] = '0';
+			}
+		}
+	}
+	combiner->command_complete_count++;
+}
+
+/*
+ * Handle RowDescription ('T') message from a Datanode connection
+ */
+static bool
+HandleRowDescription(ResponseCombiner *combiner, char *msg_body, size_t len)
+{
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return false;
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ combiner->request_type = REQUEST_TYPE_QUERY;
+ if (combiner->request_type != REQUEST_TYPE_QUERY)
+ {
+ /* Inconsistent responses */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the Datanodes for 'T' message, current request type %d", combiner->request_type)));
+ }
+ /* Increment counter and check if it was first */
+ if (combiner->description_count++ == 0)
+ {
+ combiner->tuple_desc = create_tuple_desc(msg_body, len);
+ return true;
+ }
+ return false;
+}
+
+
+/*
+ * Handle CopyInResponse ('G') message from a Datanode connection
+ */
+static void
+HandleCopyIn(ResponseCombiner *combiner)
+{
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return;
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ combiner->request_type = REQUEST_TYPE_COPY_IN;
+ if (combiner->request_type != REQUEST_TYPE_COPY_IN)
+ {
+ /* Inconsistent responses */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the Datanodes for 'G' message, current request type %d", combiner->request_type)));
+ }
+ /*
+ * The normal PG code will output an G message when it runs in the
+ * Coordinator, so do not proxy message here, just count it.
+ */
+ combiner->copy_in_count++;
+}
+
+/*
+ * Handle CopyOutResponse ('H') message from a Datanode connection
+ */
+static void
+HandleCopyOut(ResponseCombiner *combiner)
+{
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return;
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ combiner->request_type = REQUEST_TYPE_COPY_OUT;
+ if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
+ {
+ /* Inconsistent responses */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the Datanodes for 'H' message, current request type %d", combiner->request_type)));
+ }
+ /*
+ * The normal PG code will output an H message when it runs in the
+ * Coordinator, so do not proxy message here, just count it.
+ */
+ combiner->copy_out_count++;
+}
+
+/*
+ * Handle CopyOutDataRow ('d') message from a Datanode connection
+ */
+static void
+HandleCopyDataRow(ResponseCombiner *combiner, char *msg_body, size_t len)
+{
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return;
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ combiner->request_type = REQUEST_TYPE_COPY_OUT;
+
+ /* Inconsistent responses */
+ if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the Datanodes for 'd' message, current request type %d", combiner->request_type)));
+
+ /* count the row */
+ combiner->processed++;
+
+ /* Output remote COPY operation to correct location */
+ switch (combiner->remoteCopyType)
+ {
+ case REMOTE_COPY_FILE:
+ /* Write data directly to file */
+ fwrite(msg_body, 1, len, combiner->copy_file);
+ break;
+ case REMOTE_COPY_STDOUT:
+ /* Send back data to client */
+ pq_putmessage('d', msg_body, len);
+ break;
+ case REMOTE_COPY_TUPLESTORE:
+ /*
+ * Do not store trailing \n character.
+ * When tuplestore data are loaded to a table it automatically
+ * inserts line ends.
+ */
+ tuplestore_putmessage(combiner->tuplestorestate, len-1, msg_body);
+ break;
+ case REMOTE_COPY_NONE:
+ default:
+ Assert(0); /* Should not happen */
+ }
+}
+
+/*
+ * Handle DataRow ('D') message from a Datanode connection
+ * The function returns true if data row is accepted and successfully stored
+ * within the combiner.
+ */
+static bool
+HandleDataRow(ResponseCombiner *combiner, char *msg_body, size_t len, Oid node)
+{
+ /* We expect previous message is consumed */
+ Assert(combiner->currentRow == NULL);
+
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return false;
+
+ if (combiner->request_type != REQUEST_TYPE_QUERY)
+ {
+ /* Inconsistent responses */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the data nodes for 'D' message, current request type %d", combiner->request_type)));
+ }
+
+ /*
+ * If we got an error already ignore incoming data rows from other nodes
+ * Still we want to continue reading until get CommandComplete
+ */
+ if (combiner->errorMessage)
+ return false;
+
+ /*
+ * Replicated INSERT/UPDATE/DELETE with RETURNING: receive only tuples
+ * from one node, skip others as duplicates
+ */
+ if (combiner->combine_type == COMBINE_TYPE_SAME)
+ {
+ /* Do not return rows when probing primary, instead return when doing
+ * first normal node. Just save some CPU and traffic in case if
+ * probing fails.
+ */
+ if (combiner->probing_primary)
+ return false;
+ if (OidIsValid(combiner->returning_node))
+ {
+ if (combiner->returning_node != node)
+ return false;
+ }
+ else
+ combiner->returning_node = node;
+ }
+
+ /*
+ * We are copying message because it points into connection buffer, and
+ * will be overwritten on next socket read
+ */
+ combiner->currentRow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + len);
+ memcpy(combiner->currentRow->msg, msg_body, len);
+ combiner->currentRow->msglen = len;
+ combiner->currentRow->msgnode = node;
+
+ return true;
+}
+
+/*
+ * Handle ErrorResponse ('E') message from a Datanode connection
+ */
+static void
+HandleError(ResponseCombiner *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn)
+{
+ /* parse error message */
+ char *code = NULL;
+ char *message = NULL;
+ char *detail = NULL;
+ char *hint = NULL;
+ int offset = 0;
+
+ /*
+ * Scan until point to terminating \0
+ */
+ while (offset + 1 < len)
+ {
+ /* pointer to the field message */
+ char *str = msg_body + offset + 1;
+
+ switch (msg_body[offset])
+ {
+ case 'C': /* code */
+ code = str;
+ break;
+ case 'M': /* message */
+ message = str;
+ break;
+ case 'D': /* details */
+ detail = str;
+ break;
+
+ case 'H': /* hint */
+ hint = str;
+ break;
+
+ /* Fields not yet in use */
+ case 'S': /* severity */
+ case 'R': /* routine */
+ case 'P': /* position string */
+ case 'p': /* position int */
+ case 'q': /* int query */
+ case 'W': /* where */
+ case 'F': /* file */
+ case 'L': /* line */
+ default:
+ break;
+ }
+
+ /* code, message and \0 */
+ offset += strlen(str) + 2;
+ }
+
+ /*
+ * We may have special handling for some errors, default handling is to
+ * throw out error with the same message. We can not ereport immediately
+ * because we should read from this and other connections until
+ * ReadyForQuery is received, so we just store the error message.
+ * If multiple connections return errors only first one is reported.
+ *
+ * The producer error may be hiding primary error, so if previously received
+ * error is a producer error allow it to be overwritten.
+ */
+ if (combiner->errorMessage == NULL ||
+ MAKE_SQLSTATE(combiner->errorCode[0], combiner->errorCode[1],
+ combiner->errorCode[2], combiner->errorCode[3],
+ combiner->errorCode[4]) == ERRCODE_PRODUCER_ERROR)
+ {
+ MemoryContext oldcontext = MemoryContextSwitchTo(ErrorContext);
+ combiner->errorMessage = pstrdup(message);
+ /* Error Code is exactly 5 significant bytes */
+ if (code)
+ memcpy(combiner->errorCode, code, 5);
+ if (detail)
+ combiner->errorDetail = pstrdup(detail);
+ if (hint)
+ combiner->errorHint = pstrdup(hint);
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ /*
+ * If the PREPARE TRANSACTION command fails for whatever reason, we don't
+ * want to send down ROLLBACK PREPARED to this node. Otherwise, it may end
+ * up rolling back an unrelated prepared transaction with the same GID as
+ * used by this transaction
+ */
+ if (conn->ck_resp_rollback)
+ conn->ck_resp_rollback = false;
+
+ /*
+ * If Datanode have sent ErrorResponse it will never send CommandComplete.
+ * Increment the counter to prevent endless waiting for it.
+ */
+ combiner->command_complete_count++;
+}
+
+/*
+ * HandleCmdComplete -
+ * combine deparsed sql statements execution results
+ *
+ * Input parameters:
+ * commandType is dml command type
+ * combineTag is used to combine the completion result
+ * msg_body is execution result needed to combine
+ * len is msg_body size
+ */
+void
+HandleCmdComplete(CmdType commandType, CombineTag *combine,
+ const char *msg_body, size_t len)
+{
+ int digits = 0;
+ uint64 originrowcount = 0;
+ uint64 rowcount = 0;
+ uint64 total = 0;
+
+ if (msg_body == NULL)
+ return;
+
+ /* if there's nothing in combine, just copy the msg_body */
+ if (strlen(combine->data) == 0)
+ {
+ strcpy(combine->data, msg_body);
+ combine->cmdType = commandType;
+ return;
+ }
+ else
+ {
+ /* commandType is conflict */
+ if (combine->cmdType != commandType)
+ return;
+
+ /* get the processed row number from msg_body */
+ digits = parse_row_count(msg_body, len + 1, &rowcount);
+ elog(DEBUG1, "digits is %d\n", digits);
+ Assert(digits >= 0);
+
+ /* no need to combine */
+ if (digits == 0)
+ return;
+
+ /* combine the processed row number */
+ parse_row_count(combine->data, strlen(combine->data) + 1, &originrowcount);
+ elog(DEBUG1, "originrowcount is %lu, rowcount is %lu\n", originrowcount, rowcount);
+ total = originrowcount + rowcount;
+
+ }
+
+ /* output command completion tag */
+ switch (commandType)
+ {
+ case CMD_SELECT:
+ strcpy(combine->data, "SELECT");
+ break;
+ case CMD_INSERT:
+ snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
+ "INSERT %u %lu", 0, total);
+ break;
+ case CMD_UPDATE:
+ snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
+ "UPDATE %lu", total);
+ break;
+ case CMD_DELETE:
+ snprintf(combine->data, COMPLETION_TAG_BUFSIZE,
+ "DELETE %lu", total);
+ break;
+ default:
+ strcpy(combine->data, "");
+ break;
+ }
+
+}
+
+/*
+ * HandleDatanodeCommandId ('M') message from a Datanode connection
+ */
+static void
+HandleDatanodeCommandId(ResponseCombiner *combiner, char *msg_body, size_t len)
+{
+ uint32 n32;
+ CommandId cid;
+
+ Assert(msg_body != NULL);
+ Assert(len >= 2);
+
+ /* Get the command Id */
+ memcpy(&n32, &msg_body[0], 4);
+ cid = ntohl(n32);
+
+ /* If received command Id is higher than current one, set it to a new value */
+ if (cid > GetReceivedCommandId())
+ SetReceivedCommandId(cid);
+}
+
+/*
+ * Record waited-for XIDs received from the remote nodes into the transaction
+ * state
+ */
+static void
+HandleWaitXids(char *msg_body, size_t len)
+{
+ int xid_count;
+ uint32 n32;
+ int cur;
+ int i;
+
+ /* Get the xid count */
+ xid_count = len / sizeof (TransactionId);
+
+ cur = 0;
+ for (i = 0; i < xid_count; i++)
+ {
+ Assert(cur < len);
+ memcpy(&n32, &msg_body[cur], sizeof (TransactionId));
+ cur = cur + sizeof (TransactionId);
+ TransactionRecordXidWait(ntohl(n32));
+ }
+}
+
+static void
+HandleGlobalTransactionId(char *msg_body, size_t len)
+{
+ GlobalTransactionId xid;
+
+ Assert(len == sizeof (GlobalTransactionId));
+ memcpy(&xid, &msg_body[0], sizeof (GlobalTransactionId));
+
+ SetTopTransactionId(xid);
+}
+
+/*
+ * Examine the specified combiner state and determine if command was completed
+ * successfully
+ */
+static bool
+validate_combiner(ResponseCombiner *combiner)
+{
+ /* There was error message while combining */
+ if (combiner->errorMessage)
+ return false;
+ /* Check if state is defined */
+ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
+ return false;
+
+ /* Check all nodes completed */
+ if ((combiner->request_type == REQUEST_TYPE_COMMAND
+ || combiner->request_type == REQUEST_TYPE_QUERY)
+ && combiner->command_complete_count != combiner->node_count)
+ return false;
+
+ /* Check count of description responses */
+ if (combiner->request_type == REQUEST_TYPE_QUERY
+ && combiner->description_count != combiner->node_count)
+ return false;
+
+ /* Check count of copy-in responses */
+ if (combiner->request_type == REQUEST_TYPE_COPY_IN
+ && combiner->copy_in_count != combiner->node_count)
+ return false;
+
+ /* Check count of copy-out responses */
+ if (combiner->request_type == REQUEST_TYPE_COPY_OUT
+ && combiner->copy_out_count != combiner->node_count)
+ return false;
+
+ /* Add other checks here as needed */
+
+ /* All is good if we are here */
+ return true;
+}
+
+/*
+ * Close combiner and free allocated memory, if it is not needed
+ */
+void
+CloseCombiner(ResponseCombiner *combiner)
+{
+ if (combiner->connections)
+ pfree(combiner->connections);
+ if (combiner->tuple_desc)
+ FreeTupleDesc(combiner->tuple_desc);
+ if (combiner->errorMessage)
+ pfree(combiner->errorMessage);
+ if (combiner->errorDetail)
+ pfree(combiner->errorDetail);
+ if (combiner->errorHint)
+ pfree(combiner->errorHint);
+ if (combiner->cursor_connections)
+ pfree(combiner->cursor_connections);
+ if (combiner->tapenodes)
+ pfree(combiner->tapenodes);
+ if (combiner->tapemarks)
+ pfree(combiner->tapemarks);
+}
+
+/*
+ * Validate combiner and release storage freeing allocated memory
+ */
+static bool
+ValidateAndCloseCombiner(ResponseCombiner *combiner)
+{
+ bool valid = validate_combiner(combiner);
+
+ CloseCombiner(combiner);
+
+ return valid;
+}
+
+/*
+ * It is possible if multiple steps share the same Datanode connection, when
+ * executor is running multi-step query or client is running multiple queries
+ * using Extended Query Protocol. After returning next tuple ExecRemoteQuery
+ * function passes execution control to the executor and then it can be given
+ * to the same RemoteQuery or to different one. It is possible that before
+ * returning a tuple the function do not read all Datanode responses. In this
+ * case pending responses should be read in context of original RemoteQueryState
+ * till ReadyForQuery message and data rows should be stored (buffered) to be
+ * available when fetch from that RemoteQueryState is requested again.
+ * BufferConnection function does the job.
+ * If a RemoteQuery is going to use connection it should check connection state.
+ * DN_CONNECTION_STATE_QUERY indicates query has data to read and combiner
+ * points to the original RemoteQueryState. If combiner differs from "this" the
+ * connection should be buffered.
+ */
+void
+BufferConnection(PGXCNodeHandle *conn)
+{
+ ResponseCombiner *combiner = conn->combiner;
+ MemoryContext oldcontext;
+
+ if (combiner == NULL || conn->state != DN_CONNECTION_STATE_QUERY)
+ return;
+
+ elog(DEBUG2, "Buffer connection %u to step %s", conn->nodeoid, combiner->cursor);
+
+ /*
+ * When BufferConnection is invoked CurrentContext is related to other
+ * portal, which is trying to control the connection.
+ * TODO See if we can find better context to switch to
+ */
+ oldcontext = MemoryContextSwitchTo(combiner->ss.ps.ps_ResultTupleSlot->tts_mcxt);
+
+ /* Verify the connection is in use by the combiner */
+ combiner->current_conn = 0;
+ while (combiner->current_conn < combiner->conn_count)
+ {
+ if (combiner->connections[combiner->current_conn] == conn)
+ break;
+ combiner->current_conn++;
+ }
+ Assert(combiner->current_conn < combiner->conn_count);
+
+ if (combiner->tapemarks == NULL)
+ combiner->tapemarks = (ListCell**) palloc0(combiner->conn_count * sizeof(ListCell*));
+
+ /*
+ * If current bookmark for the current tape is not set it means either
+ * first row in the buffer is from the current tape or no rows from
+ * the tape in the buffer, so if first row is not from current
+ * connection bookmark the last cell in the list.
+ */
+ if (combiner->tapemarks[combiner->current_conn] == NULL &&
+ list_length(combiner->rowBuffer) > 0)
+ {
+ RemoteDataRow dataRow = (RemoteDataRow) linitial(combiner->rowBuffer);
+ if (dataRow->msgnode != conn->nodeoid)
+ combiner->tapemarks[combiner->current_conn] = list_tail(combiner->rowBuffer);
+ }
+
+ /*
+ * Buffer data rows until data node return number of rows specified by the
+ * fetch_size parameter of last Execute message (PortalSuspended message)
+ * or end of result set is reached (CommandComplete message)
+ */
+ while (true)
+ {
+ int res;
+
+ /* Move to buffer currentRow (received from the data node) */
+ if (combiner->currentRow)
+ {
+ combiner->rowBuffer = lappend(combiner->rowBuffer,
+ combiner->currentRow);
+ combiner->currentRow = NULL;
+ }
+
+ res = handle_response(conn, combiner);
+ /*
+ * If response message is a DataRow it will be handled on the next
+ * iteration.
+ * PortalSuspended will cause connection state change and break the loop
+ * The same is for CommandComplete, but we need additional handling -
+ * remove connection from the list of active connections.
+ * We may need to add handling error response
+ */
+
+ /* Most often result check first */
+ if (res == RESPONSE_DATAROW)
+ {
+ /*
+ * The row is in the combiner->currentRow, on next iteration it will
+ * be moved to the buffer
+ */
+ continue;
+ }
+
+ /* incomplete message, read more */
+ if (res == RESPONSE_EOF)
+ {
+ if (pgxc_node_receive(1, &conn, NULL))
+ {
+ PGXCNodeSetConnectionState(conn,
+ DN_CONNECTION_STATE_ERROR_FATAL);
+ add_error_message(conn, "Failed to fetch from data node");
+ }
+ }
+
+ /*
+ * End of result set is reached, so either set the pointer to the
+ * connection to NULL (combiner with sort) or remove it from the list
+ * (combiner without sort)
+ */
+ else if (res == RESPONSE_COMPLETE)
+ {
+ /*
+ * If combiner is doing merge sort we should set reference to the
+ * current connection to NULL in the array, indicating the end
+ * of the tape is reached. FetchTuple will try to access the buffer
+ * first anyway.
+ * Since we remove that reference we can not determine what node
+ * number was this connection, but we need this info to find proper
+ * tuple in the buffer if we are doing merge sort. So store node
+ * number in special array.
+ * NB: We can not test if combiner->tuplesortstate is set here:
+ * connection may require buffering inside tuplesort_begin_merge
+ * - while pre-read rows from the tapes, one of the tapes may be
+ * the local connection with RemoteSubplan in the tree. The
+ * combiner->tuplesortstate is set only after tuplesort_begin_merge
+ * returns.
+ */
+ if (combiner->merge_sort)
+ {
+ combiner->connections[combiner->current_conn] = NULL;
+ if (combiner->tapenodes == NULL)
+ combiner->tapenodes = (Oid *)
+ palloc0(combiner->conn_count * sizeof(Oid));
+ combiner->tapenodes[combiner->current_conn] = conn->nodeoid;
+ }
+ else
+ {
+ /* Remove current connection, move last in-place, adjust current_conn */
+ if (combiner->current_conn < --combiner->conn_count)
+ combiner->connections[combiner->current_conn] = combiner->connections[combiner->conn_count];
+ else
+ combiner->current_conn = 0;
+ }
+ /*
+ * If combiner runs Simple Query Protocol we need to read in
+ * ReadyForQuery. In case of Extended Query Protocol it is not
+ * sent and we should quit.
+ */
+ if (combiner->extended_query)
+ break;
+ }
+ else if (res == RESPONSE_ERROR)
+ {
+ if (combiner->extended_query)
+ {
+ /*
+ * Need to sync connection to enable receiving commands
+ * by the datanode
+ */
+ if (pgxc_node_send_sync(conn) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to sync msg to node %u", conn->nodeoid)));
+ }
+ }
+ }
+ else if (res == RESPONSE_SUSPENDED || res == RESPONSE_READY)
+ {
+ /* Now it is OK to quit */
+ break;
+ }
+ }
+ Assert(conn->state != DN_CONNECTION_STATE_QUERY);
+ MemoryContextSwitchTo(oldcontext);
+ conn->combiner = NULL;
+}
+
+/*
+ * copy the datarow from combiner to the given slot, in the slot's memory
+ * context
+ */
+static void
+CopyDataRowTupleToSlot(ResponseCombiner *combiner, TupleTableSlot *slot)
+{
+ RemoteDataRow datarow;
+ MemoryContext oldcontext;
+ oldcontext = MemoryContextSwitchTo(slot->tts_mcxt);
+ datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + combiner->currentRow->msglen);
+ datarow->msgnode = combiner->currentRow->msgnode;
+ datarow->msglen = combiner->currentRow->msglen;
+ memcpy(datarow->msg, combiner->currentRow->msg, datarow->msglen);
+ ExecStoreDataRowTuple(datarow, slot, true);
+ pfree(combiner->currentRow);
+ combiner->currentRow = NULL;
+ MemoryContextSwitchTo(oldcontext);
+}
+
+
+/*
+ * FetchTuple
+ *
+ Get next tuple from one of the datanode connections.
+ * The connections should be in combiner->connections, if "local" dummy
+ * connection presents it should be the last active connection in the array.
+ * If combiner is set up to perform merge sort function returns tuple from
+ * connection defined by combiner->current_conn, or NULL slot if no more tuple
+ * are available from the connection. Otherwise it returns tuple from any
+ * connection or NULL slot if no more available connections.
+ * Function looks into combiner->rowBuffer before accessing connection
+ * and return a tuple from there if found.
+ * Function may wait while more data arrive from the data nodes. If there
+ * is a locally executed subplan function advance it and buffer resulting rows
+ * instead of waiting.
+ */
+TupleTableSlot *
+FetchTuple(ResponseCombiner *combiner)
+{
+ PGXCNodeHandle *conn;
+ TupleTableSlot *slot;
+ Oid nodeOid = -1;
+
+ /*
+ * Case if we run local subplan.
+ * We do not have remote connections, so just get local tuple and return it
+ */
+ if (outerPlanState(combiner))
+ {
+ RemoteSubplanState *planstate = (RemoteSubplanState *) combiner;
+ RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
+ /* Advance subplan in a loop until we have something to return */
+ for (;;)
+ {
+ Datum value = (Datum) 0;
+ bool isnull = false;
+ int numnodes;
+ int i;
+
+ slot = ExecProcNode(outerPlanState(combiner));
+ /* If locator is not defined deliver all the results */
+ if (planstate->locator == NULL)
+ return slot;
+
+ /*
+ * If NULL tuple is returned we done with the subplan, finish it up and
+ * return NULL
+ */
+ if (TupIsNull(slot))
+ return NULL;
+
+ /* Get partitioning value if defined */
+ if (plan->distributionKey != InvalidAttrNumber)
+ value = slot_getattr(slot, plan->distributionKey, &isnull);
+
+ /* Determine target nodes */
+ numnodes = GET_NODES(planstate->locator, value, isnull, NULL);
+ for (i = 0; i < numnodes; i++)
+ {
+ /* Deliver the node */
+ if (planstate->dest_nodes[i] == PGXCNodeId-1)
+ return slot;
+ }
+ }
+ }
+
+ /*
+ * Get current connection
+ */
+ if (combiner->conn_count > combiner->current_conn)
+ conn = combiner->connections[combiner->current_conn];
+ else
+ conn = NULL;
+
+ /*
+ * If doing merge sort determine the node number.
+ * It may be needed to get buffered row.
+ */
+ if (combiner->merge_sort)
+ {
+ Assert(conn || combiner->tapenodes);
+ nodeOid = conn ? conn->nodeoid :
+ combiner->tapenodes[combiner->current_conn];
+ Assert(OidIsValid(nodeOid));
+ }
+
+ /*
+ * First look into the row buffer.
+ * When we are performing merge sort we need to get from the buffer record
+ * from the connection marked as "current". Otherwise get first.
+ */
+ if (list_length(combiner->rowBuffer) > 0)
+ {
+ RemoteDataRow dataRow;
+
+ Assert(combiner->currentRow == NULL);
+
+ if (combiner->merge_sort)
+ {
+ ListCell *lc;
+ ListCell *prev;
+
+ elog(DEBUG1, "Getting buffered tuple from node %x", nodeOid);
+
+ prev = combiner->tapemarks[combiner->current_conn];
+ if (prev)
+ {
+ /*
+ * Start looking through the list from the bookmark.
+ * Probably the first cell we check contains row from the needed
+ * node. Otherwise continue scanning until we encounter one,
+ * advancing prev pointer as well.
+ */
+ while((lc = lnext(prev)) != NULL)
+ {
+ dataRow = (RemoteDataRow) lfirst(lc);
+ if (dataRow->msgnode == nodeOid)
+ {
+ combiner->currentRow = dataRow;
+ break;
+ }
+ prev = lc;
+ }
+ }
+ else
+ {
+ /*
+ * Either needed row is the first in the buffer or no such row
+ */
+ lc = list_head(combiner->rowBuffer);
+ dataRow = (RemoteDataRow) lfirst(lc);
+ if (dataRow->msgnode == nodeOid)
+ combiner->currentRow = dataRow;
+ else
+ lc = NULL;
+ }
+ if (lc)
+ {
+ /*
+ * Delete cell from the buffer. Before we delete we must check
+ * the bookmarks, if the cell is a bookmark for any tape.
+ * If it is the case we are deleting last row of the current
+ * block from the current tape. That tape should have bookmark
+ * like current, and current bookmark will be advanced when we
+ * read the tape once again.
+ */
+ int i;
+ for (i = 0; i < combiner->conn_count; i++)
+ {
+ if (combiner->tapemarks[i] == lc)
+ combiner->tapemarks[i] = prev;
+ }
+ elog(DEBUG1, "Found buffered tuple from node %x", nodeOid);
+ combiner->rowBuffer = list_delete_cell(combiner->rowBuffer,
+ lc, prev);
+ }
+ elog(DEBUG1, "Update tapemark");
+ combiner->tapemarks[combiner->current_conn] = prev;
+ }
+ else
+ {
+ dataRow = (RemoteDataRow) linitial(combiner->rowBuffer);
+ combiner->currentRow = dataRow;
+ combiner->rowBuffer = list_delete_first(combiner->rowBuffer);
+ }
+ }
+
+ /* If we have node message in the currentRow slot, and it is from a proper
+ * node, consume it. */
+ if (combiner->currentRow)
+ {
+ Assert(!combiner->merge_sort ||
+ combiner->currentRow->msgnode == nodeOid);
+ slot = combiner->ss.ps.ps_ResultTupleSlot;
+ CopyDataRowTupleToSlot(combiner, slot);
+ return slot;
+ }
+
+ while (conn)
+ {
+ int res;
+
+ /* Going to use a connection, buffer it if needed */
+ CHECK_OWNERSHIP(conn, combiner);
+
+ /*
+ * If current connection is idle it means portal on the data node is
+ * suspended. Request more and try to get it
+ */
+ if (combiner->extended_query &&
+ conn->state == DN_CONNECTION_STATE_IDLE)
+ {
+ /*
+ * We do not allow to suspend if querying primary node, so that
+ * only may mean the current node is secondary and subplan was not
+ * executed there yet. Return and go on with second phase.
+ */
+ if (combiner->probing_primary)
+ {
+ return NULL;
+ }
+
+ if (pgxc_node_send_execute(conn, combiner->cursor, PGXLRemoteFetchSize) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
+ }
+
+ if (pgxc_node_send_flush(conn) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid)));
+ }
+
+ if (pgxc_node_receive(1, &conn, NULL))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed receive data from node %u cursor '%s'", conn->nodeoid, combiner->cursor)));
+ }
+ }
+
+ /* read messages */
+ res = handle_response(conn, combiner);
+ if (res == RESPONSE_DATAROW)
+ {
+ slot = combiner->ss.ps.ps_ResultTupleSlot;
+ CopyDataRowTupleToSlot(combiner, slot);
+ return slot;
+ }
+ else if (res == RESPONSE_EOF)
+ {
+ /* incomplete message, read more */
+ if (pgxc_node_receive(1, &conn, NULL))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to receive more data from data node %u", conn->nodeoid)));
+ continue;
+ }
+ else if (res == RESPONSE_SUSPENDED)
+ {
+ /*
+ * If we are doing merge sort or probing primary node we should
+ * remain on the same node, so query next portion immediately.
+ * Otherwise leave node suspended and fetch lazily.
+ */
+ if (combiner->merge_sort || combiner->probing_primary)
+ {
+ if (pgxc_node_send_execute(conn, combiner->cursor, PGXLRemoteFetchSize) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
+ if (pgxc_node_send_flush(conn) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid)));
+ if (pgxc_node_receive(1, &conn, NULL))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed receive node from node %u cursor '%s'", conn->nodeoid, combiner->cursor)));
+ continue;
+ }
+
+ /*
+ * Tell the node to fetch data in background, next loop when we
+ * pgxc_node_receive, data is already there, so we can run faster
+ * */
+ if (pgxc_node_send_execute(conn, combiner->cursor, PGXLRemoteFetchSize) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send execute cursor '%s' to node %u", combiner->cursor, conn->nodeoid)));
+ }
+
+ if (pgxc_node_send_flush(conn) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed flush cursor '%s' node %u", combiner->cursor, conn->nodeoid)));
+ }
+
+ if (++combiner->current_conn >= combiner->conn_count)
+ combiner->current_conn = 0;
+ conn = combiner->connections[combiner->current_conn];
+ }
+ else if (res == RESPONSE_COMPLETE)
+ {
+ /*
+ * In case of Simple Query Protocol we should receive ReadyForQuery
+ * before removing connection from the list. In case of Extended
+ * Query Protocol we may remove connection right away.
+ */
+ if (combiner->extended_query)
+ {
+ /* If we are doing merge sort clean current connection and return
+ * NULL, otherwise remove current connection, move last in-place,
+ * adjust current_conn and continue if it is not last connection */
+ if (combiner->merge_sort)
+ {
+ combiner->connections[combiner->current_conn] = NULL;
+ return NULL;
+ }
+ REMOVE_CURR_CONN(combiner);
+ if (combiner->conn_count > 0)
+ conn = combiner->connections[combiner->current_conn];
+ else
+ return NULL;
+ }
+ }
+ else if (res == RESPONSE_ERROR)
+ {
+ /*
+ * If doing Extended Query Protocol we need to sync connection,
+ * otherwise subsequent commands will be ignored.
+ */
+ if (combiner->extended_query)
+ {
+ if (pgxc_node_send_sync(conn) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to sync msg to node %u", conn->nodeoid)));
+ }
+ /*
+ * Do not wait for response from primary, it needs to wait
+ * for other nodes to respond. Instead go ahead and send query to
+ * other nodes. It will fail there, but we can continue with
+ * normal cleanup.
+ */
+ if (combiner->probing_primary)
+ {
+ REMOVE_CURR_CONN(combiner);
+ return NULL;
+ }
+ }
+ else if (res == RESPONSE_READY)
+ {
+ /* If we are doing merge sort clean current connection and return
+ * NULL, otherwise remove current connection, move last in-place,
+ * adjust current_conn and continue if it is not last connection */
+ if (combiner->merge_sort)
+ {
+ combiner->connections[combiner->current_conn] = NULL;
+ return NULL;
+ }
+ REMOVE_CURR_CONN(combiner);
+ if (combiner->conn_count > 0)
+ conn = combiner->connections[combiner->current_conn];
+ else
+ return NULL;
+ }
+ else if (res == RESPONSE_TUPDESC)
+ {
+ ExecSetSlotDescriptor(combiner->ss.ps.ps_ResultTupleSlot,
+ combiner->tuple_desc);
+ /* Now slot is responsible for freeng the descriptor */
+ combiner->tuple_desc = NULL;
+ }
+ else if (res == RESPONSE_ASSIGN_GXID)
+ {
+ /* Do nothing. It must have been handled in handle_response() */
+ }
+ else if (res == RESPONSE_WAITXIDS)
+ {
+ /* Do nothing. It must have been handled in handle_response() */
+ }
+ else
+ {
+ // Can not get here?
+ Assert(false);
+ }
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Handle responses from the Datanode connections
+ *
+ * Reads from every given connection until each has either reported
+ * completion (RESPONSE_READY / RESPONSE_COPY), entered a fatal error state,
+ * or produced an inconsistent response.  Per-message results are folded
+ * into the supplied combiner by handle_response().
+ *
+ * conn_count  - number of entries in connections[]
+ * connections - handles to read from; the caller's array is not modified,
+ *               a local copy is used for tracking outstanding connections
+ * timeout     - passed through to pgxc_node_receive(); NULL blocks
+ * combiner    - accumulates command results and error messages
+ *
+ * Returns 0 on success, EOF if the socket-level receive failed.
+ */
+static int
+pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections,
+ struct timeval * timeout, ResponseCombiner *combiner)
+{
+ int count = conn_count;
+ PGXCNodeHandle *to_receive[conn_count];
+
+ /* make a copy of the pointers to the connections */
+ memcpy(to_receive, connections, conn_count * sizeof(PGXCNodeHandle *));
+
+ /*
+ * Read results.
+ * Note we try and read from Datanode connections even if there is an error on one,
+ * so as to avoid reading incorrect results on the next statement.
+ * Other safeguards exist to avoid this, however.
+ */
+ while (count > 0)
+ {
+ int i = 0;
+
+ if (pgxc_node_receive(count, to_receive, timeout))
+ return EOF;
+ while (i < count)
+ {
+ int result = handle_response(to_receive[i], combiner);
+ elog(DEBUG5, "Received response %d on connection to node %s",
+ result, to_receive[i]->nodename);
+ switch (result)
+ {
+ case RESPONSE_EOF: /* have something to read, keep receiving */
+ i++;
+ break;
+ case RESPONSE_COMPLETE:
+ if (to_receive[i]->state != DN_CONNECTION_STATE_ERROR_FATAL)
+ /* Continue read until ReadyForQuery */
+ break;
+ /* fallthru */
+ case RESPONSE_READY:
+ /* fallthru */
+ case RESPONSE_COPY:
+ /* Handling is done, do not track this connection */
+ count--;
+ /* Move last connection in place */
+ if (i < count)
+ to_receive[i] = to_receive[count];
+ break;
+ case RESPONSE_ERROR:
+ /* no handling needed, just wait for ReadyForQuery */
+ break;
+
+ case RESPONSE_WAITXIDS:
+ case RESPONSE_ASSIGN_GXID:
+ case RESPONSE_TUPDESC:
+ /* handled inside handle_response(), keep reading */
+ break;
+
+ case RESPONSE_DATAROW:
+ /* discard the row; this path only drains responses */
+ combiner->currentRow = NULL;
+ break;
+
+ default:
+ /* Inconsistent responses */
+ add_error_message(to_receive[i], "Unexpected response from the Datanodes");
+ elog(DEBUG1, "Unexpected response from the Datanodes, result = %d, request type %d", result, combiner->request_type);
+ /* Stop tracking and move last connection in place */
+ count--;
+ if (i < count)
+ to_receive[i] = to_receive[count];
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Read next message from the connection and update the combiner
+ * and connection state accordingly
+ * If we are in an error state we just consume the messages, and do not proxy
+ * Long term, we should look into cancelling executing statements
+ * and closing the connections.
+ * It returns if states need to be handled
+ * Return values:
+ * RESPONSE_EOF - need to receive more data for the connection
+ * RESPONSE_READY - got ReadyForQuery
+ * RESPONSE_COMPLETE - done with the connection, but not yet ready for query.
+ * Also this result is output in case of error
+ * RESPONSE_SUSPENDED - got PortalSuspended
+ * RESPONSE_TUPDESC - got tuple description
+ * RESPONSE_DATAROW - got data row
+ * RESPONSE_COPY - got copy response
+ * RESPONSE_BARRIER_OK - barrier command completed successfully
+ * RESPONSE_ERROR - got ErrorResponse (caller may need to send SYNC)
+ * RESPONSE_WAITXIDS / RESPONSE_ASSIGN_GXID - XID bookkeeping messages,
+ * already consumed here
+ */
+int
+handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner)
+{
+ char *msg;
+ int msg_len;
+ char msg_type;
+
+ for (;;)
+ {
+ /*
+ * If we are in the process of shutting down, we
+ * may be rolling back, and the buffer may contain other messages.
+ * We want to avoid a procarray exception
+ * as well as an error stack overflow.
+ */
+ if (proc_exit_inprogress)
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL);
+
+ /*
+ * Don't read from from the connection if there is a fatal error.
+ * We still return RESPONSE_COMPLETE, not RESPONSE_ERROR, since
+ * Handling of RESPONSE_ERROR assumes sending SYNC message, but
+ * State DN_CONNECTION_STATE_ERROR_FATAL indicates connection is
+ * not usable.
+ */
+ if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
+ return RESPONSE_COMPLETE;
+
+ /* No data available, exit */
+ if (!HAS_MESSAGE_BUFFERED(conn))
+ return RESPONSE_EOF;
+
+ Assert(conn->combiner == combiner || conn->combiner == NULL);
+
+ /* TODO handle other possible responses */
+ msg_type = get_message(conn, &msg_len, &msg);
+ elog(DEBUG5, "handle_response - received message %c, node %s, "
+ "current_state %d", msg_type, conn->nodename, conn->state);
+ switch (msg_type)
+ {
+ case '\0': /* Not enough data in the buffer */
+ return RESPONSE_EOF;
+ case 'c': /* CopyToCommandComplete */
+ HandleCopyOutComplete(combiner);
+ break;
+ case 'C': /* CommandComplete */
+ HandleCommandComplete(combiner, msg, msg_len, conn);
+ conn->combiner = NULL;
+ /*
+ * In case of simple query protocol, wait for the ReadyForQuery
+ * before marking connection as Idle
+ */
+ if (combiner->extended_query &&
+ conn->state == DN_CONNECTION_STATE_QUERY)
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
+ return RESPONSE_COMPLETE;
+ case 'T': /* RowDescription */
+#ifdef DN_CONNECTION_DEBUG
+ Assert(!conn->have_row_desc);
+ conn->have_row_desc = true;
+#endif
+ if (HandleRowDescription(combiner, msg, msg_len))
+ return RESPONSE_TUPDESC;
+ break;
+ case 'D': /* DataRow */
+#ifdef DN_CONNECTION_DEBUG
+ Assert(conn->have_row_desc);
+#endif
+ /* Do not return if data row has not been actually handled */
+ if (HandleDataRow(combiner, msg, msg_len, conn->nodeoid))
+ return RESPONSE_DATAROW;
+ break;
+ case 's': /* PortalSuspended */
+ /* No activity is expected on the connection until next query */
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
+ return RESPONSE_SUSPENDED;
+ case '1': /* ParseComplete */
+ case '2': /* BindComplete */
+ case '3': /* CloseComplete */
+ case 'n': /* NoData */
+ /* simple notifications, continue reading */
+ break;
+ case 'G': /* CopyInResponse */
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_COPY_IN);
+ HandleCopyIn(combiner);
+ /* Done, return to caller to let it know the data can be passed in */
+ return RESPONSE_COPY;
+ case 'H': /* CopyOutResponse */
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_COPY_OUT);
+ HandleCopyOut(combiner);
+ return RESPONSE_COPY;
+ case 'd': /* CopyOutDataRow */
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_COPY_OUT);
+ HandleCopyDataRow(combiner, msg, msg_len);
+ break;
+ case 'E': /* ErrorResponse */
+ HandleError(combiner, msg, msg_len, conn);
+ add_error_message(conn, combiner->errorMessage);
+ /*
+ * In case the remote node was running an extended query
+ * protocol and reported an error, it will keep ignoring all
+ * subsequent commands until it sees a SYNC message. So make
+ * sure that we send down SYNC even before sending a ROLLBACK
+ * command
+ */
+ if (conn->in_extended_query)
+ conn->needSync = true;
+ return RESPONSE_ERROR;
+ case 'A': /* NotificationResponse */
+ case 'N': /* NoticeResponse */
+ case 'S': /* SetCommandComplete */
+ /*
+ * Ignore these to prevent multiple messages, one from each
+ * node. Coordinator will send one for DDL anyway
+ */
+ break;
+ case 'Z': /* ReadyForQuery */
+ {
+ /*
+ * Return result depends on previous connection state.
+ * If it was PORTAL_SUSPENDED Coordinator want to send down
+ * another EXECUTE to fetch more rows, otherwise it is done
+ * with the connection
+ */
+ conn->transaction_status = msg[0];
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
+ conn->combiner = NULL;
+#ifdef DN_CONNECTION_DEBUG
+ conn->have_row_desc = false;
+#endif
+ return RESPONSE_READY;
+ }
+ case 'M': /* Command Id */
+ HandleDatanodeCommandId(combiner, msg, msg_len);
+ break;
+ case 'b': /* Barrier response (XL-specific message type) */
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
+ return RESPONSE_BARRIER_OK;
+ case 'I': /* EmptyQuery */
+ return RESPONSE_COMPLETE;
+ case 'W': /* WaitXids (XL-specific message type) */
+ HandleWaitXids(msg, msg_len);
+ return RESPONSE_WAITXIDS;
+ case 'x': /* Assigned GXID (XL-specific message type) */
+ HandleGlobalTransactionId(msg, msg_len);
+ return RESPONSE_ASSIGN_GXID;
+ default:
+ /* sync lost? */
+ elog(WARNING, "Received unsupported message type: %c", msg_type);
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL);
+ /* stop reading */
+ return RESPONSE_COMPLETE;
+ }
+ }
+ /* never happen, but keep compiler quiet */
+ return RESPONSE_EOF;
+}
+
+/*
+ * Has the data node sent Ready For Query
+ *
+ * Drains buffered messages on the connection, discarding everything until a
+ * ReadyForQuery ('Z') message is found.  Returns true if ReadyForQuery was
+ * seen (or the connection is in a fatal error state, in which case no more
+ * input is expected), false if the buffered data ran out first.
+ * Side effects on success: records the reported transaction status and
+ * marks the connection idle.
+ */
+
+bool
+is_data_node_ready(PGXCNodeHandle * conn)
+{
+ char *msg;
+ int msg_len;
+ char msg_type;
+
+ for (;;)
+ {
+ /*
+ * If we are in the process of shutting down, we
+ * may be rolling back, and the buffer may contain other messages.
+ * We want to avoid a procarray exception
+ * as well as an error stack overflow.
+ */
+ if (proc_exit_inprogress)
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL);
+
+ /* don't read from from the connection if there is a fatal error */
+ if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
+ return true;
+
+ /* No data available, exit */
+ if (!HAS_MESSAGE_BUFFERED(conn))
+ return false;
+
+ /* Any message other than 'Z' is silently discarded here */
+ msg_type = get_message(conn, &msg_len, &msg);
+ if (msg_type == 'Z')
+ {
+ /*
+ * Return result depends on previous connection state.
+ * If it was PORTAL_SUSPENDED Coordinator want to send down
+ * another EXECUTE to fetch more rows, otherwise it is done
+ * with the connection
+ */
+ conn->transaction_status = msg[0];
+ PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE);
+ conn->combiner = NULL;
+ return true;
+ }
+ }
+ /* never happen, but keep compiler quiet */
+ return false;
+}
+
+
+/*
+ * Send BEGIN command to the Datanodes or Coordinators and receive responses.
+ * Also send the GXID for the transaction.
+ *
+ * conn_count / connections - remote handles to start the transaction on
+ * gxid            - global transaction id to propagate (if valid)
+ * need_tran_block - caller's hint whether an explicit transaction block is
+ *                   required; may be overridden below based on node role
+ * readOnly        - if false, handles are flagged as writers
+ * node_type       - PGXC_NODE_DATANODE or PGXC_NODE_COORDINATOR
+ *                   (currently unused in the body; kept for the interface)
+ *
+ * Returns 0 on success, EOF on any send/receive/validation failure.
+ */
+static int
+pgxc_node_begin(int conn_count, PGXCNodeHandle **connections,
+ GlobalTransactionId gxid, bool need_tran_block,
+ bool readOnly, char node_type)
+{
+ int i;
+ struct timeval *timeout = NULL;
+ ResponseCombiner combiner;
+ TimestampTz timestamp = GetCurrentGTMStartTimestamp();
+ PGXCNodeHandle *new_connections[conn_count];
+ int new_count = 0;
+ char *init_str;
+ /* buffer for a decimal 32-bit value plus NUL; assumes lxid fits 32 bits */
+ char lxid[13];
+
+ /*
+ * If no remote connections, we don't have anything to do
+ */
+ if (conn_count == 0)
+ return 0;
+
+ for (i = 0; i < conn_count; i++)
+ {
+ if (!readOnly && !IsConnFromDatanode())
+ connections[i]->read_only = false;
+ /*
+ * PGXC TODO - A connection should not be in DN_CONNECTION_STATE_QUERY
+ * state when we are about to send a BEGIN TRANSACTION command to the
+ * node. We should consider changing the following to an assert and fix
+ * any bugs reported
+ */
+ if (connections[i]->state == DN_CONNECTION_STATE_QUERY)
+ BufferConnection(connections[i]);
+
+ /* Send GXID and check for errors */
+ if (GlobalTransactionIdIsValid(gxid) && pgxc_node_send_gxid(connections[i], gxid))
+ return EOF;
+
+ /* Send timestamp and check for errors */
+ if (GlobalTimestampIsValid(timestamp) && pgxc_node_send_timestamp(connections[i], timestamp))
+ return EOF;
+
+ /* Node role overrides the caller's need_tran_block hint */
+ if (IS_PGXC_DATANODE && GlobalTransactionIdIsValid(gxid))
+ need_tran_block = true;
+ else if (IS_PGXC_REMOTE_COORDINATOR)
+ need_tran_block = false;
+
+ elog(DEBUG5, "need_tran_block %d, connections[%d]->transaction_status %c",
+ need_tran_block, i, connections[i]->transaction_status);
+ /* Send BEGIN if not already in transaction */
+ if (need_tran_block && connections[i]->transaction_status == 'I')
+ {
+ /* Send the BEGIN TRANSACTION command and check for errors */
+ if (pgxc_node_send_query(connections[i], "BEGIN"))
+ return EOF;
+
+ new_connections[new_count++] = connections[i];
+ }
+ }
+
+ /*
+ * If we did not send a BEGIN command to any node, we are done. Otherwise,
+ * we need to check for any errors and report them
+ */
+ if (new_count == 0)
+ return 0;
+
+ InitResponseCombiner(&combiner, new_count, COMBINE_TYPE_NONE);
+ /*
+ * Make sure there are zeroes in unused fields
+ *
+ * NOTE(review): this zeroes only the leading sizeof(ScanState) bytes of
+ * the combiner AFTER InitResponseCombiner(); presumably the embedded
+ * ScanState is the first member and InitResponseCombiner does not touch
+ * it -- confirm against the ResponseCombiner definition.
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+
+ /* Receive responses */
+ if (pgxc_node_receive_responses(new_count, new_connections, timeout, &combiner))
+ return EOF;
+
+ /* Verify status */
+ if (!ValidateAndCloseCombiner(&combiner))
+ return EOF;
+
+ /* Send virtualXID to the remote nodes using SET command */
+ sprintf(lxid, "%d", MyProc->lxid);
+ PGXCNodeSetParam(true, "coordinator_lxid", lxid, 0);
+
+ /* after transactions are started send down local set commands */
+ init_str = PGXCNodeGetTransactionParamStr();
+ if (init_str)
+ {
+ for (i = 0; i < new_count; i++)
+ {
+ pgxc_node_set_query(new_connections[i], init_str);
+ }
+ }
+
+ /* No problem, let's get going */
+ return 0;
+}
+
+
+/*
+ * Execute DISCARD ALL command on all allocated nodes to remove all session
+ * specific stuff before releasing them to pool for reuse by other sessions.
+ *
+ * Sends a fixed RESET command string to every coordinator and datanode
+ * handle of the current session.  Connections not in IDLE state, or for
+ * which the send fails, are marked ERROR_FATAL and skipped -- cleanup is
+ * best-effort by design, since the handles are about to be released anyway.
+ */
+static void
+pgxc_node_remote_cleanup_all(void)
+{
+ PGXCNodeAllHandles *handles = get_current_handles();
+ PGXCNodeHandle *new_connections[handles->co_conn_count + handles->dn_conn_count];
+ int new_conn_count = 0;
+ int i;
+ char *resetcmd = "RESET ALL;"
+ "RESET SESSION AUTHORIZATION;"
+ "RESET transaction_isolation;"
+ "RESET global_session";
+
+ elog(DEBUG5, "pgxc_node_remote_cleanup_all - handles->co_conn_count %d,"
+ "handles->dn_conn_count %d", handles->co_conn_count,
+ handles->dn_conn_count);
+ /*
+ * We must handle reader and writer connections both since even a read-only
+ * needs to be cleaned up.
+ */
+ if (handles->co_conn_count + handles->dn_conn_count == 0)
+ return;
+
+ /*
+ * Send down snapshot followed by DISCARD ALL command.
+ */
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *handle = handles->coord_handles[i];
+
+ /* At this point connection should be in IDLE state */
+ if (handle->state != DN_CONNECTION_STATE_IDLE)
+ {
+ PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
+ continue;
+ }
+
+ /*
+ * We must go ahead and release connections anyway, so do not throw
+ * an error if we have a problem here.
+ */
+ if (pgxc_node_send_query(handle, resetcmd))
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to clean up data nodes")));
+ PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
+ continue;
+ }
+ new_connections[new_conn_count++] = handle;
+ handle->combiner = NULL;
+ }
+ /* Same best-effort cleanup, now for the datanode handles */
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *handle = handles->datanode_handles[i];
+
+ /* At this point connection should be in IDLE state */
+ if (handle->state != DN_CONNECTION_STATE_IDLE)
+ {
+ PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
+ continue;
+ }
+
+ /*
+ * We must go ahead and release connections anyway, so do not throw
+ * an error if we have a problem here.
+ */
+ if (pgxc_node_send_query(handle, resetcmd))
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to clean up data nodes")));
+ PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
+ continue;
+ }
+ new_connections[new_conn_count++] = handle;
+ handle->combiner = NULL;
+ }
+
+ if (new_conn_count)
+ {
+ ResponseCombiner combiner;
+ InitResponseCombiner(&combiner, new_conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ pgxc_node_receive_responses(new_conn_count, new_connections, NULL, &combiner);
+ CloseCombiner(&combiner);
+ }
+ pfree_pgxc_all_handles(handles);
+}
+
+/*
+ * Count how many coordinators and datanodes are involved in this transaction
+ * so that we can save that information in the GID
+ *
+ * Only write (non-read-only) connections currently inside a transaction
+ * block (transaction_status == 'T') are counted.
+ *
+ * dnCount / coordCount     - out: number of writer datanodes / coordinators
+ * dnNodeIds / coordNodeIds - out: node ids of those writers; the caller
+ *                            must size these arrays for the worst case
+ */
+static void
+pgxc_node_remote_count(int *dnCount, int dnNodeIds[],
+ int *coordCount, int coordNodeIds[])
+{
+ int i;
+ PGXCNodeAllHandles *handles = get_current_handles();
+
+ *dnCount = *coordCount = 0;
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+ /*
+ * Skip empty slots
+ */
+ if (conn->sock == NO_SOCKET)
+ continue;
+ else if (conn->transaction_status == 'T')
+ {
+ /* only writers matter for the GID bookkeeping */
+ if (!conn->read_only)
+ {
+ dnNodeIds[*dnCount] = conn->nodeid;
+ *dnCount = *dnCount + 1;
+ }
+ }
+ }
+
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+ /*
+ * Skip empty slots
+ */
+ if (conn->sock == NO_SOCKET)
+ continue;
+ else if (conn->transaction_status == 'T')
+ {
+ /* only writers matter for the GID bookkeeping */
+ if (!conn->read_only)
+ {
+ coordNodeIds[*coordCount] = conn->nodeid;
+ *coordCount = *coordCount + 1;
+ }
+ }
+ }
+}
+
+/*
+ * Prepare nodes which ran write operations during the transaction.
+ * Read only remote transactions are committed and connections are released
+ * back to the pool.
+ * Function returns the list of nodes where transaction is prepared, including
+ * local node, if requested, in format expected by the GTM server.
+ * If something went wrong the function tries to abort prepared transactions on
+ * the nodes where it succeeded and throws error. A warning is emitted if abort
+ * prepared fails.
+ * After completion remote connection handles are released.
+ *
+ * prepareGID - global transaction identifier for PREPARE TRANSACTION
+ * localNode  - if true, the local node name is put first in the result list
+ *
+ * Returns a palloc'd comma-separated node name list, or does not return
+ * (elog(ERROR)) via the prepare_err path.
+ */
+static char *
+pgxc_node_remote_prepare(char *prepareGID, bool localNode)
+{
+ bool isOK = true;
+ StringInfoData nodestr;
+ char *prepare_cmd = (char *) palloc (64 + strlen(prepareGID));
+ char *abort_cmd;
+ GlobalTransactionId auxXid;
+ char *commit_cmd = "COMMIT TRANSACTION";
+ int i;
+ ResponseCombiner combiner;
+ PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
+ int conn_count = 0;
+ PGXCNodeAllHandles *handles = get_current_handles();
+
+ initStringInfo(&nodestr);
+ if (localNode)
+ appendStringInfoString(&nodestr, PGXCNodeName);
+
+ sprintf(prepare_cmd, "PREPARE TRANSACTION '%s'", prepareGID);
+
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /*
+ * If something went wrong already we have nothing to do here. The error
+ * will be reported at the end of the function, and we will rollback
+ * remotes as part of the error handling.
+ * Just skip to clean up section and check if we have already prepared
+ * somewhere, we should abort that prepared transaction.
+ */
+ if (!isOK)
+ goto prepare_err;
+
+ /*
+ * Skip empty slots
+ */
+ if (conn->sock == NO_SOCKET)
+ continue;
+ else if (conn->transaction_status == 'T')
+ {
+ /* Read in any pending input */
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ BufferConnection(conn);
+
+ if (conn->read_only)
+ {
+ /* Read-only participant: plain COMMIT, no 2PC needed */
+ if (pgxc_node_send_query(conn, commit_cmd))
+ {
+ /*
+ * not a big deal, it was read only, the connection will be
+ * abandoned later.
+ */
+ ereport(LOG,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ else
+ {
+ /* Send down prepare command */
+ if (pgxc_node_send_query(conn, prepare_cmd))
+ {
+ /*
+ * That is the trouble, we really want to prepare it.
+ * Just emit warning so far and go to clean up.
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send PREPARE TRANSACTION command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ char *nodename = get_pgxc_nodename(conn->nodeoid);
+ if (nodestr.len > 0)
+ appendStringInfoChar(&nodestr, ',');
+ appendStringInfoString(&nodestr, nodename);
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ /*
+ * If it fails on remote node it would just return ROLLBACK.
+ * Set the flag for the message handler so the response is
+ * verified.
+ */
+ conn->ck_resp_rollback = true;
+ }
+ }
+ }
+ else if (conn->transaction_status == 'E')
+ {
+ /*
+ * Probably can not happen, if there was a error the engine would
+ * abort anyway, even in case of explicit PREPARE.
+ * Anyway, just in case...
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("remote node %u is in error state", conn->nodeoid)));
+ }
+ }
+
+ /* Same logic as above, applied to the coordinator handles */
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ /*
+ * If something went wrong already we have nothing to do here. The error
+ * will be reported at the end of the function, and we will rollback
+ * remotes as part of the error handling.
+ * Just skip to clean up section and check if we have already prepared
+ * somewhere, we should abort that prepared transaction.
+ */
+ if (!isOK)
+ goto prepare_err;
+
+ /*
+ * Skip empty slots
+ */
+ if (conn->sock == NO_SOCKET)
+ continue;
+ else if (conn->transaction_status == 'T')
+ {
+ if (conn->read_only)
+ {
+ /* Read-only participant: plain COMMIT, no 2PC needed */
+ if (pgxc_node_send_query(conn, commit_cmd))
+ {
+ /*
+ * not a big deal, it was read only, the connection will be
+ * abandoned later.
+ */
+ ereport(LOG,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ else
+ {
+ /* Send down prepare command */
+ if (pgxc_node_send_query(conn, prepare_cmd))
+ {
+ /*
+ * That is the trouble, we really want to prepare it.
+ * Just emit warning so far and go to clean up.
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send PREPARE TRANSACTION command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ char *nodename = get_pgxc_nodename(conn->nodeoid);
+ if (nodestr.len > 0)
+ appendStringInfoChar(&nodestr, ',');
+ appendStringInfoString(&nodestr, nodename);
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ /*
+ * If it fails on remote node it would just return ROLLBACK.
+ * Set the flag for the message handler so the response is
+ * verified.
+ */
+ conn->ck_resp_rollback = true;
+ }
+ }
+ }
+ else if (conn->transaction_status == 'E')
+ {
+ /*
+ * Probably can not happen, if there was a error the engine would
+ * abort anyway, even in case of explicit PREPARE.
+ * Anyway, just in case...
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("remote node %u is in error state", conn->nodeoid)));
+ }
+ }
+
+ SetSendCommandId(false);
+
+ if (!isOK)
+ goto prepare_err;
+
+ /*
+ * exit if nothing has been prepared
+ *
+ * NOTE(review): when conn_count == 0 the handles list is not released on
+ * this path (pfree_pgxc_all_handles is only reached inside the branch
+ * below) -- confirm whether that is intentional.
+ */
+ if (conn_count > 0)
+ {
+ int result;
+ /*
+ * Receive and check for any errors. In case of errors, we don't bail out
+ * just yet. We first go through the list of connections and look for
+ * errors on each connection. This is important to ensure that we run
+ * an appropriate ROLLBACK command later on (prepared transactions must be
+ * rolled back with ROLLBACK PREPARED commands).
+ *
+ * PGXCTODO - There doesn't seem to be a solid mechanism to track errors on
+ * individual connections. The transaction_status field doesn't get set
+ * every time there is an error on the connection. The combiner mechanism is
+ * good for parallel processing, but I think we should have a leak-proof
+ * mechanism to track connection status
+ */
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
+ if (result || !validate_combiner(&combiner))
+ goto prepare_err;
+ else
+ CloseCombiner(&combiner);
+
+ /* Before exit clean the flag, to avoid unnecessary checks */
+ for (i = 0; i < conn_count; i++)
+ connections[i]->ck_resp_rollback = false;
+
+ pfree_pgxc_all_handles(handles);
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+ }
+
+ pfree(prepare_cmd);
+ return nodestr.data;
+
+prepare_err:
+ /* Roll back the prepared transaction on every node where it succeeded */
+ abort_cmd = (char *) palloc (64 + strlen(prepareGID));
+ sprintf(abort_cmd, "ROLLBACK PREPARED '%s'", prepareGID);
+
+ auxXid = GetAuxilliaryTransactionId();
+ conn_count = 0;
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /*
+ * PREPARE succeeded on that node, roll it back there
+ */
+ if (conn->ck_resp_rollback)
+ {
+ conn->ck_resp_rollback = false;
+
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Error while PREPARING transaction %s on "
+ "node %s. Administrative action may be required "
+ "to abort this transaction on the node",
+ prepareGID, conn->nodename)));
+ continue;
+ }
+
+ /* sanity checks */
+ Assert(conn->sock != NO_SOCKET);
+ /* Send down abort prepared command */
+ if (pgxc_node_send_gxid(conn, auxXid))
+ {
+ /*
+ * Prepared transaction is left on the node, but we can not
+ * do anything with that except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send xid to "
+ "the node %u", conn->nodeoid)));
+ }
+ if (pgxc_node_send_query(conn, abort_cmd))
+ {
+ /*
+ * Prepared transaction is left on the node, but we can not
+ * do anything with that except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send ABORT PREPARED command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+ /* Same rollback handling for the coordinator handles */
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ if (conn->ck_resp_rollback)
+ {
+ conn->ck_resp_rollback = false;
+
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Error while PREPARING transaction %s on "
+ "node %s. Administrative action may be required "
+ "to abort this transaction on the node",
+ prepareGID, conn->nodename)));
+ continue;
+ }
+
+ /* sanity checks */
+ Assert(conn->sock != NO_SOCKET);
+ /* Send down abort prepared command */
+ if (pgxc_node_send_gxid(conn, auxXid))
+ {
+ /*
+ * Prepared transaction is left on the node, but we can not
+ * do anything with that except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send xid to "
+ "the node %u", conn->nodeoid)));
+ }
+ if (pgxc_node_send_query(conn, abort_cmd))
+ {
+ /*
+ * Prepared transaction is left on the node, but we can not
+ * do anything with that except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send ABORT PREPARED command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+ if (conn_count > 0)
+ {
+ /* Just read out responses, throw error from the first combiner */
+ ResponseCombiner combiner2;
+ InitResponseCombiner(&combiner2, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ pgxc_node_receive_responses(conn_count, connections, NULL, &combiner2);
+ CloseCombiner(&combiner2);
+ }
+
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+
+ pfree_pgxc_all_handles(handles);
+ pfree(abort_cmd);
+
+ /*
+ * If the flag is set we are here because combiner carries error message
+ */
+ if (isOK)
+ pgxc_node_report_error(&combiner);
+ else
+ elog(ERROR, "failed to PREPARE transaction on one or more nodes");
+ return NULL;
+}
+
+
+/*
+ * Commit transactions on remote nodes.
+ * If barrier lock is set wait while it is released.
+ * Release remote connection after completion.
+ *
+ * COMMIT is sent to every datanode and coordinator handle that is inside a
+ * transaction (transaction_status != 'I') while BarrierLock is held in
+ * SHARE mode, so that a concurrent barrier cannot cut the commit fan-out
+ * in half.  Raises ERROR if sending or validating any commit fails.
+ */
+static void
+pgxc_node_remote_commit(void)
+{
+ int result = 0;
+ char *commitCmd = "COMMIT TRANSACTION";
+ int i;
+ ResponseCombiner combiner;
+ PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
+ int conn_count = 0;
+ PGXCNodeAllHandles *handles = get_current_handles();
+
+ SetSendCommandId(false);
+
+ /*
+ * Barrier:
+ *
+ * We should acquire the BarrierLock in SHARE mode here to ensure that
+ * there are no in-progress barrier at this point. This mechanism would
+ * work as long as LWLock mechanism does not starve a EXCLUSIVE lock
+ * requester
+ */
+ LWLockAcquire(BarrierLock, LW_SHARED);
+
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ /*
+ * We do not need to commit remote node if it is not in transaction.
+ * If transaction is in error state the commit command will cause
+ * rollback, that is OK
+ */
+ if (conn->transaction_status != 'I')
+ {
+ /* Read in any pending input */
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ BufferConnection(conn);
+
+ if (pgxc_node_send_query(conn, commitCmd))
+ {
+ /*
+ * Do not bother with clean up, just bomb out. The error handler
+ * will invoke RollbackTransaction which will do the work.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to the node %u",
+ conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ /* Same commit fan-out for the coordinator handles */
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ /*
+ * We do not need to commit remote node if it is not in transaction.
+ * If transaction is in error state the commit command will cause
+ * rollback, that is OK
+ */
+ if (conn->transaction_status != 'I')
+ {
+ if (pgxc_node_send_query(conn, commitCmd))
+ {
+ /*
+ * Do not bother with clean up, just bomb out. The error handler
+ * will invoke RollbackTransaction which will do the work.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to the node %u",
+ conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ /*
+ * Release the BarrierLock.
+ */
+ LWLockRelease(BarrierLock);
+
+ if (conn_count)
+ {
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
+ if (result || !validate_combiner(&combiner))
+ result = EOF;
+ else
+ CloseCombiner(&combiner);
+ }
+
+ stat_transaction(conn_count);
+
+ /* result can only be nonzero if the conn_count branch initialized combiner */
+ if (result)
+ {
+ if (combiner.errorMessage)
+ pgxc_node_report_error(&combiner);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to COMMIT the transaction on one or more nodes")));
+ }
+
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+
+ pfree_pgxc_all_handles(handles);
+}
+
+
+/*
+ * Rollback transactions on remote nodes.
+ * Release remote connection after completion.
+ *
+ * Sends ROLLBACK to every datanode and coordinator handle that is inside a
+ * transaction, preceding it with a SYNC message where the remote session
+ * failed under the extended query protocol (needSync) and would otherwise
+ * ignore further commands.  Failures are logged (LOG/WARNING level), not
+ * raised as ERROR, since this runs on the abort path.
+ */
+static void
+pgxc_node_remote_abort(void)
+{
+ int result = 0;
+ char *rollbackCmd = "ROLLBACK TRANSACTION";
+ int i;
+ ResponseCombiner combiner;
+ PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
+ int conn_count = 0;
+ PGXCNodeAllHandles *handles = get_current_handles();
+ struct timeval timeout;
+
+ SetSendCommandId(false);
+
+ elog(DEBUG5, "pgxc_node_remote_abort - dn_conn_count %d, co_conn_count %d",
+ handles->dn_conn_count, handles->co_conn_count);
+
+ /* bounded wait so a dead node cannot hang the abort path forever */
+ timeout.tv_sec = 60;
+ timeout.tv_usec = 0;
+
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ elog(DEBUG5, "node %s, conn->transaction_status %c",
+ conn->nodename,
+ conn->transaction_status);
+
+ if (conn->transaction_status != 'I')
+ {
+ /* Read in any pending input */
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ BufferConnection(conn);
+
+ /*
+ * If the remote session was running extended query protocol when
+ * it failed, it will expect a SYNC message before it accepts any
+ * other command
+ */
+ if (conn->needSync)
+ {
+ pgxc_node_send_sync(conn);
+ pgxc_node_receive(1, &conn, &timeout);
+ }
+ /*
+ * Do not matter, is there committed or failed transaction,
+ * just send down rollback to finish it.
+ */
+ if (pgxc_node_send_rollback(conn, rollbackCmd))
+ {
+ add_error_message(conn,
+ "failed to send ROLLBACK TRANSACTION command");
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ /* Same rollback fan-out for the coordinator handles */
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ if (conn->transaction_status != 'I')
+ {
+ /* Send SYNC if the remote session is expecting one */
+ if (conn->needSync)
+ {
+ pgxc_node_send_sync(conn);
+ pgxc_node_receive(1, &conn, &timeout);
+ }
+ /*
+ * Do not matter, is there committed or failed transaction,
+ * just send down rollback to finish it.
+ */
+ if (pgxc_node_send_rollback(conn, rollbackCmd))
+ {
+ add_error_message(conn,
+ "failed to send ROLLBACK TRANSACTION command");
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ if (conn_count)
+ {
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ result = pgxc_node_receive_responses(conn_count, connections, &timeout, &combiner);
+ if (result || !validate_combiner(&combiner))
+ result = EOF;
+ else
+ CloseCombiner(&combiner);
+ }
+
+ stat_transaction(conn_count);
+
+ /* result can only be nonzero if the conn_count branch initialized combiner */
+ if (result)
+ {
+ if (combiner.errorMessage)
+ pgxc_node_report_error(&combiner);
+ else
+ ereport(LOG,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to ROLLBACK the transaction on one or more nodes")));
+ }
+
+ pfree_pgxc_all_handles(handles);
+}
+
+/*
+ * Begin COPY command
+ * The copy_connections array must have room for NumDataNodes items
+ *
+ * Acquires the needed datanode connections, begins a transaction on them if
+ * required, builds the row locator, and sends the COPY query down every
+ * connection.  On failure the error is recorded on the offending connection
+ * and the function returns with rcstate->locator set to NULL; callers must
+ * check the locator to detect failure.
+ */
+void
+DataNodeCopyBegin(RemoteCopyData *rcstate)
+{
+	int			i;
+	List	   *nodelist = rcstate->rel_loc->rl_nodeList;
+	PGXCNodeHandle **connections;
+	bool		need_tran_block;
+	GlobalTransactionId gxid;
+	ResponseCombiner combiner;
+	Snapshot	snapshot = GetActiveSnapshot();
+	int			conn_count = list_length(nodelist);
+
+	/* Get needed datanode connections */
+	if (!rcstate->is_from && IsLocatorReplicated(rcstate->rel_loc->locatorType))
+	{
+		/*
+		 * COPY TO from a replicated table: any single node has the full
+		 * data, so connections is a single handle to read from.
+		 */
+		connections = (PGXCNodeHandle **) palloc(sizeof(PGXCNodeHandle *));
+		connections[0] = get_any_handle(nodelist);
+		conn_count = 1;
+	}
+	else
+	{
+		PGXCNodeAllHandles *pgxc_handles;
+		pgxc_handles = get_handles(nodelist, NULL, false, true);
+		connections = pgxc_handles->datanode_handles;
+		Assert(pgxc_handles->dn_conn_count == conn_count);
+		/* only the wrapper is freed; the handles array stays alive */
+		pfree(pgxc_handles);
+	}
+
+	/*
+	 * If more than one node is involved or if we are already in a
+	 * transaction block, we must run the remote statements in a transaction
+	 * block.
+	 */
+	need_tran_block = (conn_count > 1) || (TransactionBlockStatusCode() == 'T');
+
+	elog(DEBUG1, "conn_count = %d, need_tran_block = %s", conn_count,
+			need_tran_block ? "true" : "false");
+
+	/* Gather statistics */
+	stat_statement();
+	stat_transaction(conn_count);
+
+	gxid = GetCurrentTransactionId();
+
+	/* Start transaction on connections where it is not started */
+	if (pgxc_node_begin(conn_count, connections, gxid, need_tran_block, false, PGXC_NODE_DATANODE))
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("Could not begin transaction on data nodes.")));
+	}
+
+	/*
+	 * COPY TO do not use locator, it just takes connections from it, and
+	 * we do not look up distribution data type in this case.
+	 * So always use LOCATOR_TYPE_RROBIN to avoid errors because of not
+	 * defined partType if real locator type is HASH or MODULO.
+	 * Create locator before sending down query, because createLocator may
+	 * fail and we leave with dirty connections.
+	 * If we get an error now datanode connection will be clean and error
+	 * handler will issue transaction abort.
+	 */
+	rcstate->locator = createLocator(
+			rcstate->is_from ? rcstate->rel_loc->locatorType
+							 : LOCATOR_TYPE_RROBIN,
+			rcstate->is_from ? RELATION_ACCESS_INSERT : RELATION_ACCESS_READ,
+			rcstate->dist_type,
+			LOCATOR_LIST_POINTER,
+			conn_count,
+			(void *) connections,
+			NULL,
+			false);
+
+	/* Send query to nodes */
+	for (i = 0; i < conn_count; i++)
+	{
+		CHECK_OWNERSHIP(connections[i], NULL);
+
+		if (snapshot && pgxc_node_send_snapshot(connections[i], snapshot))
+		{
+			/* record failure on the handle and report via NULL locator */
+			add_error_message(connections[i], "Can not send request");
+			pfree(connections);
+			freeLocator(rcstate->locator);
+			rcstate->locator = NULL;
+			return;
+		}
+		if (pgxc_node_send_query(connections[i], rcstate->query_buf.data) != 0)
+		{
+			add_error_message(connections[i], "Can not send request");
+			pfree(connections);
+			freeLocator(rcstate->locator);
+			rcstate->locator = NULL;
+			return;
+		}
+	}
+
+	/*
+	 * We are expecting CopyIn response, but do not want to send it to client,
+	 * caller should take care about this, because here we do not know if
+	 * client runs console or file copy
+	 */
+	InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+	/*
+	 * Make sure there are zeroes in unused fields
+	 *
+	 * NOTE(review): this memset runs AFTER InitResponseCombiner and zeroes
+	 * only the leading ScanState portion of the combiner — confirm
+	 * InitResponseCombiner sets no fields inside ScanState.
+	 */
+	memset(&combiner, 0, sizeof(ScanState));
+
+	/* Receive responses */
+	if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner)
+			|| !ValidateAndCloseCombiner(&combiner))
+	{
+		/* NOTE(review): 'connections' is not pfree'd on this path — leak? */
+		DataNodeCopyFinish(conn_count, connections);
+		freeLocator(rcstate->locator);
+		rcstate->locator = NULL;
+		return;
+	}
+	pfree(connections);
+}
+
+
+/*
+ * Send a data row to the specified nodes
+ *
+ * Appends one CopyData ('d') protocol message containing data_row (len bytes,
+ * plus a trailing '\n' in text/CSV mode) to the output buffer of each handle
+ * in copy_connections.  If a buffer would exceed COPY_BUFFER_SIZE it is
+ * flushed first, after draining and validating any pending input from the
+ * datanode so that remote errors are noticed promptly.
+ *
+ * Returns 0 on success, EOF on any error (the error text is recorded on the
+ * offending handle via add_error_message).
+ */
+int
+DataNodeCopyIn(char *data_row, int len,
+			   int conn_count, PGXCNodeHandle** copy_connections,
+			   bool binary)
+{
+	/* size + data row + \n in CSV mode */
+	int msgLen = 4 + len + (binary ? 0 : 1);
+	int nLen = htonl(msgLen);
+	int i;
+
+	for(i = 0; i < conn_count; i++)
+	{
+		PGXCNodeHandle *handle = copy_connections[i];
+		if (handle->state == DN_CONNECTION_STATE_COPY_IN)
+		{
+			/* precalculate to speed up access */
+			int bytes_needed = handle->outEnd + 1 + msgLen;
+
+			/* flush buffer if it is almost full */
+			if (bytes_needed > COPY_BUFFER_SIZE)
+			{
+				int to_send = handle->outEnd;
+
+				/* First look if data node has sent a error message */
+				int read_status = pgxc_node_read_data(handle, true);
+				if (read_status == EOF || read_status < 0)
+				{
+					add_error_message(handle, "failed to read data from data node");
+					return EOF;
+				}
+
+				if (handle->inStart < handle->inEnd)
+				{
+					ResponseCombiner combiner;
+					InitResponseCombiner(&combiner, 1, COMBINE_TYPE_NONE);
+					/*
+					 * Make sure there are zeroes in unused fields
+					 *
+					 * NOTE(review): memset after InitResponseCombiner zeroes
+					 * the leading ScanState portion only — confirm intended.
+					 */
+					memset(&combiner, 0, sizeof(ScanState));
+
+					/*
+					 * Validate the combiner but only if we see a proper
+					 * response for our COPY message. The problem is that
+					 * sometimes we might receive async messages such as
+					 * 'M' which is used to send back command ID generated and
+					 * consumed by the datanode. While the message gets handled
+					 * in handle_response(), we don't want to declare receipt
+					 * of an invalid message below.
+					 *
+					 * If there is an actual error of some sort then the
+					 * connection state will be set appropriately and we
+					 * shall catch that subsequently.
+					 */
+					if (handle_response(handle, &combiner) == RESPONSE_COPY &&
+						!ValidateAndCloseCombiner(&combiner))
+						return EOF;
+				}
+
+				if (DN_CONNECTION_STATE_ERROR(handle))
+					return EOF;
+
+				/*
+				 * Try to send down buffered data if we have
+				 */
+				if (to_send && send_some(handle, to_send) < 0)
+				{
+					add_error_message(handle, "failed to send data to data node");
+					return EOF;
+				}
+			}
+
+			if (ensure_out_buffer_capacity(bytes_needed, handle) != 0)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			}
+
+			/* CopyData message: 'd', int32 length (self-inclusive), payload */
+			handle->outBuffer[handle->outEnd++] = 'd';
+			memcpy(handle->outBuffer + handle->outEnd, &nLen, 4);
+			handle->outEnd += 4;
+			memcpy(handle->outBuffer + handle->outEnd, data_row, len);
+			handle->outEnd += len;
+			if (!binary)
+				handle->outBuffer[handle->outEnd++] = '\n';
+
+			handle->in_extended_query = false;
+		}
+		else
+		{
+			add_error_message(handle, "Invalid data node connection");
+			return EOF;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Receive COPY OUT data from the given connections and write it either to
+ * copy_file (if non-NULL) or to the client's stdout.  Returns the combined
+ * number of rows processed.  Raises ERROR on an unexpected response.
+ */
+uint64
+DataNodeCopyOut(PGXCNodeHandle** copy_connections,
+				int conn_count, FILE* copy_file)
+{
+	ResponseCombiner combiner;
+	uint64		processed;
+	bool		error;
+
+	InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_SUM);
+	/*
+	 * Make sure there are zeroes in unused fields
+	 *
+	 * NOTE(review): zeroes only the leading ScanState part, after Init —
+	 * confirm InitResponseCombiner sets no ScanState fields.
+	 */
+	memset(&combiner, 0, sizeof(ScanState));
+	combiner.processed = 0;
+	/* If there is an existing file where to copy data, pass it to combiner */
+	if (copy_file)
+	{
+		combiner.copy_file = copy_file;
+		combiner.remoteCopyType = REMOTE_COPY_FILE;
+	}
+	else
+	{
+		combiner.copy_file = NULL;
+		combiner.remoteCopyType = REMOTE_COPY_STDOUT;
+	}
+	error = (pgxc_node_receive_responses(conn_count, copy_connections, NULL, &combiner) != 0);
+
+	processed = combiner.processed;
+
+	if (!ValidateAndCloseCombiner(&combiner) || error)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("Unexpected response from the data nodes when combining, request type %d", combiner.request_type)));
+	}
+
+	return processed;
+}
+
+
+/*
+ * Receive COPY OUT data from the given connections and append the rows into
+ * the supplied tuplestore.  Returns the combined number of rows processed.
+ * Raises ERROR on an unexpected response.
+ */
+uint64
+DataNodeCopyStore(PGXCNodeHandle** copy_connections,
+				  int conn_count, Tuplestorestate* store)
+{
+	ResponseCombiner combiner;
+	uint64		processed;
+	bool		error;
+
+	InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_SUM);
+	/*
+	 * Make sure there are zeroes in unused fields
+	 *
+	 * NOTE(review): zeroes only the leading ScanState part, after Init —
+	 * confirm InitResponseCombiner sets no ScanState fields.
+	 */
+	memset(&combiner, 0, sizeof(ScanState));
+	combiner.processed = 0;
+	combiner.remoteCopyType = REMOTE_COPY_TUPLESTORE;
+	combiner.tuplestorestate = store;
+
+	error = (pgxc_node_receive_responses(conn_count, copy_connections, NULL, &combiner) != 0);
+
+	processed = combiner.processed;
+
+	if (!ValidateAndCloseCombiner(&combiner) || error)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("Unexpected response from the data nodes when combining, request type %d", combiner.request_type)));
+	}
+
+	return processed;
+}
+
+
+/*
+ * Finish copy process on all connections
+ *
+ * Sends CopyDone to every connection still in COPY IN/OUT state, then
+ * collects responses from all of them.  Raises ERROR if any connection
+ * failed to terminate COPY cleanly.
+ */
+void
+DataNodeCopyFinish(int conn_count, PGXCNodeHandle** connections)
+{
+	int			i;
+	ResponseCombiner combiner;
+	bool		error = false;
+	for (i = 0; i < conn_count; i++)
+	{
+		PGXCNodeHandle *handle = connections[i];
+
+		/*
+		 * NOTE(review): 'error' is reset every iteration, so only the last
+		 * connection's result survives this loop; a handle that is not in a
+		 * COPY state leaves error == true.  Confirm this is intended.
+		 */
+		error = true;
+		if (handle->state == DN_CONNECTION_STATE_COPY_IN || handle->state == DN_CONNECTION_STATE_COPY_OUT)
+			error = DataNodeCopyEnd(handle, false);
+	}
+
+	InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+	/*
+	 * Make sure there are zeroes in unused fields
+	 */
+	memset(&combiner, 0, sizeof(ScanState));
+	error = (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) != 0) || error;
+
+	if (!validate_combiner(&combiner) || error)
+	{
+		if (combiner.errorMessage)
+			pgxc_node_report_error(&combiner);
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Error while running COPY")));
+	}
+	else
+		CloseCombiner(&combiner);
+}
+
+/*
+ * End copy process on a connection
+ *
+ * Queues a CopyDone ('c') — or CopyFail ('f') when is_error is set — message
+ * and flushes it immediately.  Returns false on success, true on failure
+ * (NULL handle, buffer allocation failure, or flush error).
+ */
+bool
+DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error)
+{
+	/* message length field: 4 bytes, counting itself, no payload */
+	int nLen = htonl(4);
+
+	if (handle == NULL)
+		return true;
+
+	/* msgType + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + 4, handle) != 0)
+		return true;
+
+	if (is_error)
+		handle->outBuffer[handle->outEnd++] = 'f';
+	else
+		handle->outBuffer[handle->outEnd++] = 'c';
+
+	memcpy(handle->outBuffer + handle->outEnd, &nLen, 4);
+	handle->outEnd += 4;
+
+	handle->in_extended_query = false;
+	/* We need response right away, so send immediately */
+	if (pgxc_node_flush(handle) < 0)
+		return true;
+
+	return false;
+}
+
+
+/*
+ * Get Node connections depending on the connection type:
+ * Datanodes Only, Coordinators only or both types
+ *
+ * Determines the target node lists (possibly evaluating a distribution-key
+ * expression at execution time), acquires the corresponding handles from the
+ * pool, and returns them with dn_conn_count/co_conn_count filled in.
+ * Raises ERROR if connections cannot be obtained.
+ */
+static PGXCNodeAllHandles *
+get_exec_connections(RemoteQueryState *planstate,
+					 ExecNodes *exec_nodes,
+					 RemoteQueryExecType exec_type,
+					 bool is_global_session)
+{
+	List	   *nodelist = NIL;
+	List	   *primarynode = NIL;
+	List	   *coordlist = NIL;
+	PGXCNodeHandle *primaryconnection;
+	int			co_conn_count, dn_conn_count;
+	bool		is_query_coord_only = false;
+	PGXCNodeAllHandles *pgxc_handles = NULL;
+
+	/*
+	 * If query is launched only on Coordinators, we have to inform get_handles
+	 * not to ask for Datanode connections even if list of Datanodes is NIL.
+	 */
+	if (exec_type == EXEC_ON_COORDS)
+		is_query_coord_only = true;
+
+	if (exec_type == EXEC_ON_CURRENT)
+		return get_current_handles();
+
+	if (exec_nodes)
+	{
+		if (exec_nodes->en_expr)
+		{
+			/* execution time determining of target Datanodes */
+			bool		isnull;
+			ExprState  *estate = ExecInitExpr(exec_nodes->en_expr,
+											  (PlanState *) planstate);
+			/*
+			 * NOTE(review): the three lines below contain leftover merge
+			 * conflict markers ('-' / '++') — this hunk appears garbled and
+			 * must be resolved against the merge base before it can compile.
+			 */
+			Datum		partvalue = ExecEvalExpr(estate,
+												 planstate->combiner.ss.ps.ps_ExprContext,
-		if (IS_PGXC_LOCAL_COORDINATOR && MyXactAccessedTempRel)
++												 &isnull);
+			RelationLocInfo *rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid);
+			/* PGXCTODO what is the type of partvalue here */
+			ExecNodes  *nodes = GetRelationNodes(rel_loc_info,
+												 partvalue,
+												 isnull,
+												 exec_nodes->accesstype);
+			/*
+			 * en_expr is set by pgxc_set_en_expr only for distributed
+			 * relations while planning DMLs, hence a select for update
+			 * on a replicated table here is an assertion
+			 */
+			Assert(!(exec_nodes->accesstype == RELATION_ACCESS_READ_FOR_UPDATE &&
+						IsRelationReplicated(rel_loc_info)));
+
+			if (nodes)
+			{
+				nodelist = nodes->nodeList;
+				primarynode = nodes->primarynodelist;
+				pfree(nodes);
+			}
+			FreeRelationLocInfo(rel_loc_info);
+		}
+		else if (OidIsValid(exec_nodes->en_relid))
+		{
+			RelationLocInfo *rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid);
+			ExecNodes  *nodes = GetRelationNodes(rel_loc_info, 0, true, exec_nodes->accesstype);
+
+			/*
+			 * en_relid is set only for DMLs, hence a select for update on a
+			 * replicated table here is an assertion
+			 */
+			Assert(!(exec_nodes->accesstype == RELATION_ACCESS_READ_FOR_UPDATE &&
+						IsRelationReplicated(rel_loc_info)));
+
+			/* Use the obtained list for given table */
+			if (nodes)
+				nodelist = nodes->nodeList;
+
+			/*
+			 * Special handling for ROUND ROBIN distributed tables. The target
+			 * node must be determined at the execution time
+			 */
+			if (rel_loc_info->locatorType == LOCATOR_TYPE_RROBIN && nodes)
+			{
+				nodelist = nodes->nodeList;
+				primarynode = nodes->primarynodelist;
+			}
+			else if (nodes)
+			{
+				if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES)
+				{
+					/* non-RR case: fall back to the planner-provided lists */
+					nodelist = exec_nodes->nodeList;
+					primarynode = exec_nodes->primarynodelist;
+				}
+			}
+
+			if (nodes)
+				pfree(nodes);
+			FreeRelationLocInfo(rel_loc_info);
+		}
+		else
+		{
+			if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES)
+				nodelist = exec_nodes->nodeList;
+			else if (exec_type == EXEC_ON_COORDS)
+				coordlist = exec_nodes->nodeList;
+
+			primarynode = exec_nodes->primarynodelist;
+		}
+	}
+
+	/* Set node list and DN number */
+	if (list_length(nodelist) == 0 &&
+		(exec_type == EXEC_ON_ALL_NODES ||
+		 exec_type == EXEC_ON_DATANODES))
+	{
+		/* Primary connection is included in this number of connections if it exists */
+		dn_conn_count = NumDataNodes;
+	}
+	else
+	{
+		if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES)
+		{
+			if (primarynode)
+				dn_conn_count = list_length(nodelist) + 1;
+			else
+				dn_conn_count = list_length(nodelist);
+		}
+		else
+			dn_conn_count = 0;
+	}
+
+	/* Set Coordinator list and Coordinator number */
+	if ((list_length(nodelist) == 0 && exec_type == EXEC_ON_ALL_NODES) ||
+		(list_length(coordlist) == 0 && exec_type == EXEC_ON_COORDS))
+	{
+		coordlist = GetAllCoordNodes();
+		co_conn_count = list_length(coordlist);
+	}
+	else
+	{
+		if (exec_type == EXEC_ON_COORDS)
+			co_conn_count = list_length(coordlist);
+		else
+			co_conn_count = 0;
+	}
+
+	/* Get other connections (non-primary) */
+	pgxc_handles = get_handles(nodelist, coordlist, is_query_coord_only, is_global_session);
+	if (!pgxc_handles)
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("Could not obtain connection from pool")));
+
+	/* Get connection for primary node, if used */
+	if (primarynode)
+	{
+		/* Let's assume primary connection is always a Datanode connection for the moment */
+		PGXCNodeAllHandles *pgxc_conn_res;
+		pgxc_conn_res = get_handles(primarynode, NULL, false, is_global_session);
+
+		/* primary connection is unique */
+		primaryconnection = pgxc_conn_res->datanode_handles[0];
+
+		pfree(pgxc_conn_res);
+
+		if (!primaryconnection)
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Could not obtain connection from pool")));
+		pgxc_handles->primary_handle = primaryconnection;
+	}
+
+	/* Depending on the execution type, we still need to save the initial node counts */
+	pgxc_handles->dn_conn_count = dn_conn_count;
+	pgxc_handles->co_conn_count = co_conn_count;
+
+	return pgxc_handles;
+}
+
+
+/*
+ * Send the command ID, snapshot and query of the given RemoteQuery step down
+ * one connection, using the extended query protocol when a prepared
+ * statement, cursor or parameters are involved, and the simple protocol
+ * otherwise.  Returns true on success, false if any send failed.
+ */
+static bool
+pgxc_start_command_on_connection(PGXCNodeHandle *connection,
+								 RemoteQueryState *remotestate,
+								 Snapshot snapshot)
+{
+	CommandId	cid;
+	ResponseCombiner *combiner = (ResponseCombiner *) remotestate;
+	RemoteQuery *step = (RemoteQuery *) combiner->ss.ps.plan;
+	CHECK_OWNERSHIP(connection, combiner);
+
+	elog(DEBUG5, "pgxc_start_command_on_connection - node %s, state %d",
+			connection->nodename, connection->state);
+
+	/*
+	 * Scan descriptor would be valid and would contain a valid snapshot
+	 * in cases when we need to send out of order command id to data node
+	 * e.g. in case of a fetch
+	 */
+	cid = GetCurrentCommandId(false);
+
+	if (pgxc_node_send_cmd_id(connection, cid) < 0 )
+		return false;
+
+	if (snapshot && pgxc_node_send_snapshot(connection, snapshot))
+		return false;
+	if (step->statement || step->cursor || remotestate->rqs_num_params)
+	{
+		/* need to use Extended Query Protocol */
+		int			fetch = 0;
+		bool		prepared = false;
+		char		nodetype = PGXC_NODE_DATANODE;
+
+		/* if prepared statement is referenced see if it is already
+		 * exist */
+		if (step->statement)
+			prepared =
+				ActivateDatanodeStatementOnNode(step->statement,
+												PGXCNodeGetNodeId(connection->nodeoid,
+																  &nodetype));
+
+		/*
+		 * execute and fetch rows only if they will be consumed
+		 * immediately by the sorter
+		 */
+		if (step->cursor)
+			fetch = 1;
+
+		combiner->extended_query = true;
+
+		/* pass NULL as query text when the statement is already prepared */
+		if (pgxc_node_send_query_extended(connection,
+										  prepared ? NULL : step->sql_statement,
+										  step->statement,
+										  step->cursor,
+										  remotestate->rqs_num_params,
+										  remotestate->rqs_param_types,
+										  remotestate->paramval_len,
+										  remotestate->paramval_data,
+										  step->has_row_marks ? true : step->read_only,
+										  fetch) != 0)
+			return false;
+	}
+	else
+	{
+		combiner->extended_query = false;
+		if (pgxc_node_send_query(connection, step->sql_statement) != 0)
+			return false;
+	}
+	return true;
+}
+
+/*
+ * Execute utility statement on multiple Datanodes
+ * It does approximately the same as
+ *
+ * RemoteQueryState *state = ExecInitRemoteQuery(plan, estate, flags);
+ * Assert(TupIsNull(ExecRemoteQuery(state));
+ * ExecEndRemoteQuery(state)
+ *
+ * But does not need an Estate instance and does not do some unnecessary work,
+ * like allocating tuple slots.
+ *
+ * The statement is sent to the datanodes first, then to the coordinators,
+ * and responses are consumed from both sets until every node reports
+ * ReadyForQuery; any buffered remote error is reported at the end.
+ */
+void
+ExecRemoteUtility(RemoteQuery *node)
+{
+	RemoteQueryState *remotestate;
+	ResponseCombiner *combiner;
+	bool		force_autocommit = node->force_autocommit;
+	RemoteQueryExecType exec_type = node->exec_type;
+	GlobalTransactionId gxid = InvalidGlobalTransactionId;
+	Snapshot	snapshot = NULL;
+	PGXCNodeAllHandles *pgxc_connections;
+	int			co_conn_count;
+	int			dn_conn_count;
+	bool		need_tran_block;
+	ExecDirectType exec_direct_type = node->exec_direct_type;
+	int			i;
+	CommandId	cid = GetCurrentCommandId(true);
+
+	if (!force_autocommit)
+		RegisterTransactionLocalNode(true);
+
+	remotestate = makeNode(RemoteQueryState);
+	combiner = (ResponseCombiner *)remotestate;
+	InitResponseCombiner(combiner, 0, node->combine_type);
+
+	/*
+	 * Do not set global_session if it is a utility statement.
+	 * Avoids CREATE NODE error on cluster configuration.
+	 */
+	pgxc_connections = get_exec_connections(NULL, node->exec_nodes, exec_type,
+											exec_direct_type != EXEC_DIRECT_UTILITY);
+
+	dn_conn_count = pgxc_connections->dn_conn_count;
+	co_conn_count = pgxc_connections->co_conn_count;
+	/* exit right away if no nodes to run command on */
+	if (dn_conn_count == 0 && co_conn_count == 0)
+	{
+		pfree_pgxc_all_handles(pgxc_connections);
+		return;
+	}
+
+	if (force_autocommit)
+		need_tran_block = false;
+	else
+		need_tran_block = true;
+
+	/* Commands launched through EXECUTE DIRECT do not need start a transaction */
+	if (exec_direct_type == EXEC_DIRECT_UTILITY)
+	{
+		need_tran_block = false;
+
+		/* This check is not done when analyzing to limit dependencies */
+		if (IsTransactionBlock())
+			ereport(ERROR,
+					(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
+					 errmsg("cannot run EXECUTE DIRECT with utility inside a transaction block")));
+	}
+
+	gxid = GetCurrentTransactionId();
+	if (ActiveSnapshotSet())
+		snapshot = GetActiveSnapshot();
+	if (!GlobalTransactionIdIsValid(gxid))
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("Failed to get next transaction ID")));
+
+	/* Send the command to the datanodes */
+	{
+		if (pgxc_node_begin(dn_conn_count, pgxc_connections->datanode_handles,
+					gxid, need_tran_block, false, PGXC_NODE_DATANODE))
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Could not begin transaction on Datanodes")));
+		for (i = 0; i < dn_conn_count; i++)
+		{
+			PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i];
+
+			if (conn->state == DN_CONNECTION_STATE_QUERY)
+				BufferConnection(conn);
+			if (snapshot && pgxc_node_send_snapshot(conn, snapshot))
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to send snapshot to Datanodes")));
+			}
+			if (pgxc_node_send_cmd_id(conn, cid) < 0)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to send command ID to Datanodes")));
+			}
+
+			if (pgxc_node_send_query(conn, node->sql_statement) != 0)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to send command to Datanodes")));
+			}
+		}
+	}
+
+	/* Send the command to the coordinators, if any */
+	{
+		if (pgxc_node_begin(co_conn_count, pgxc_connections->coord_handles,
+					gxid, need_tran_block, false, PGXC_NODE_COORDINATOR))
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Could not begin transaction on coordinators")));
+		/* Now send it to Coordinators if necessary */
+		for (i = 0; i < co_conn_count; i++)
+		{
+			/* Fixed: these messages previously misreported the node type */
+			if (snapshot && pgxc_node_send_snapshot(pgxc_connections->coord_handles[i], snapshot))
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to send snapshot to coordinators")));
+			}
+			if (pgxc_node_send_cmd_id(pgxc_connections->coord_handles[i], cid) < 0)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to send command ID to coordinators")));
+			}
+
+			if (pgxc_node_send_query(pgxc_connections->coord_handles[i], node->sql_statement) != 0)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("Failed to send command to coordinators")));
+			}
+		}
+	}
+
+	/*
+	 * Stop if all commands are completed or we got a data row and
+	 * initialized state node for subsequent invocations
+	 */
+	{
+		while (dn_conn_count > 0)
+		{
+			int i = 0;
+
+			if (pgxc_node_receive(dn_conn_count, pgxc_connections->datanode_handles, NULL))
+				break;
+			/*
+			 * Handle input from the Datanodes.
+			 * We do not expect Datanodes returning tuples when running utility
+			 * command.
+			 * If we got EOF, move to the next connection, will receive more
+			 * data on the next iteration.
+			 */
+			while (i < dn_conn_count)
+			{
+				PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i];
+				int res = handle_response(conn, combiner);
+				if (res == RESPONSE_EOF)
+				{
+					i++;
+				}
+				else if (res == RESPONSE_COMPLETE)
+				{
+					/* Ignore, wait for ReadyForQuery */
+				}
+				else if (res == RESPONSE_ERROR)
+				{
+					/* Ignore, wait for ReadyForQuery */
+				}
+				else if (res == RESPONSE_READY)
+				{
+					/* done with this connection: swap the last one into its slot */
+					if (i < --dn_conn_count)
+						pgxc_connections->datanode_handles[i] =
+							pgxc_connections->datanode_handles[dn_conn_count];
+				}
+				else if (res == RESPONSE_TUPDESC)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Unexpected response from Datanode")));
+				}
+				else if (res == RESPONSE_DATAROW)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Unexpected response from Datanode")));
+				}
+			}
+		}
+	}
+
+	/* Make the same for Coordinators */
+	{
+		while (co_conn_count > 0)
+		{
+			int i = 0;
+
+			if (pgxc_node_receive(co_conn_count, pgxc_connections->coord_handles, NULL))
+				break;
+
+			while (i < co_conn_count)
+			{
+				int res = handle_response(pgxc_connections->coord_handles[i], combiner);
+				if (res == RESPONSE_EOF)
+				{
+					i++;
+				}
+				else if (res == RESPONSE_COMPLETE)
+				{
+					/* Ignore, wait for ReadyForQuery */
+				}
+				else if (res == RESPONSE_ERROR)
+				{
+					/* Ignore, wait for ReadyForQuery */
+				}
+				else if (res == RESPONSE_READY)
+				{
+					if (i < --co_conn_count)
+						pgxc_connections->coord_handles[i] =
+							pgxc_connections->coord_handles[co_conn_count];
+				}
+				else if (res == RESPONSE_TUPDESC)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Unexpected response from coordinator")));
+				}
+				else if (res == RESPONSE_DATAROW)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Unexpected response from coordinator")));
+				}
+			}
+		}
+	}
+
+	/*
+	 * We have processed all responses from nodes and if we have
+	 * error message pending we can report it. All connections should be in
+	 * consistent state now and so they can be released to the pool after ROLLBACK.
+	 */
+	pfree_pgxc_all_handles(pgxc_connections);
+	pgxc_node_report_error(combiner);
+}
+
+
+/*
+ * Called when the backend is ending.
+ *
+ * on_proc_exit callback: releases pooled connections, closes the GTM
+ * connection and dumps collected statistics.  'code' and 'arg' follow the
+ * standard proc-exit callback signature and are unused here.
+ */
+void
+PGXCNodeCleanAndRelease(int code, Datum arg)
+{
+
+	/* Disconnect from Pooler, if any connection is still held Pooler close it */
+	PoolManagerDisconnect();
+
+	/* Close connection with GTM */
+	CloseGTM();
+
+	/* Dump collected statistics to the log */
+	stat_log();
+}
+
+/*
+ * Close the named prepared statement on the given list of datanodes.
+ *
+ * Sends Close + Sync on each connection and consumes responses until every
+ * node reports ReadyForQuery.  A connection that fails to send or respond is
+ * marked DN_CONNECTION_STATE_ERROR_FATAL so it gets discarded rather than
+ * returned to the pool with an unclosed statement.
+ */
+void
+ExecCloseRemoteStatement(const char *stmt_name, List *nodelist)
+{
+	PGXCNodeAllHandles *all_handles;
+	PGXCNodeHandle **connections;
+	ResponseCombiner combiner;
+	int			conn_count;
+	int			i;
+
+	/* Exit if nodelist is empty */
+	if (list_length(nodelist) == 0)
+		return;
+
+	/* get needed Datanode connections */
+	all_handles = get_handles(nodelist, NIL, false, true);
+	conn_count = all_handles->dn_conn_count;
+	connections = all_handles->datanode_handles;
+
+	for (i = 0; i < conn_count; i++)
+	{
+		if (connections[i]->state == DN_CONNECTION_STATE_QUERY)
+			BufferConnection(connections[i]);
+		if (pgxc_node_send_close(connections[i], true, stmt_name) != 0)
+		{
+			/*
+			 * statements are not affected by statement end, so consider
+			 * unclosed statement on the Datanode as a fatal issue and
+			 * force connection is discarded
+			 */
+			PGXCNodeSetConnectionState(connections[i],
+					DN_CONNECTION_STATE_ERROR_FATAL);
+			/* Fixed typo: "statemrnt" -> "statement" */
+			ereport(WARNING,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Failed to close Datanode statement")));
+		}
+		if (pgxc_node_send_sync(connections[i]) != 0)
+		{
+			PGXCNodeSetConnectionState(connections[i],
+					DN_CONNECTION_STATE_ERROR_FATAL);
+			ereport(WARNING,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Failed to close Datanode statement")));
+		}
+		PGXCNodeSetConnectionState(connections[i], DN_CONNECTION_STATE_CLOSE);
+	}
+
+	InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+	/*
+	 * Make sure there are zeroes in unused fields
+	 */
+	memset(&combiner, 0, sizeof(ScanState));
+
+	while (conn_count > 0)
+	{
+		if (pgxc_node_receive(conn_count, connections, NULL))
+		{
+			for (i = 0; i < conn_count; i++)
+				PGXCNodeSetConnectionState(connections[i],
+						DN_CONNECTION_STATE_ERROR_FATAL);
+
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Failed to close Datanode statement")));
+		}
+		i = 0;
+		while (i < conn_count)
+		{
+			int res = handle_response(connections[i], &combiner);
+			if (res == RESPONSE_EOF)
+			{
+				i++;
+			}
+			else if (res == RESPONSE_READY ||
+					connections[i]->state == DN_CONNECTION_STATE_ERROR_FATAL)
+			{
+				/* finished connection: swap the last one into its slot */
+				if (--conn_count > i)
+					connections[i] = connections[conn_count];
+			}
+		}
+	}
+
+	ValidateAndCloseCombiner(&combiner);
+	pfree_pgxc_all_handles(all_handles);
+}
+
+/*
+ * DataNodeCopyInBinaryForAll
+ *
+ * In a COPY TO, send to all Datanodes PG_HEADER for a COPY TO in binary mode.
+ *
+ * Queues one CopyData ('d') message containing msg_buf (len bytes) on every
+ * connection; all handles must be in COPY IN state.  Returns 0 on success,
+ * EOF if any connection is not in COPY IN state.
+ */
+int
+DataNodeCopyInBinaryForAll(char *msg_buf, int len, int conn_count,
+						   PGXCNodeHandle** connections)
+{
+	int			i;
+	int			msgLen = 4 + len;
+	int			nLen = htonl(msgLen);
+
+	for (i = 0; i < conn_count; i++)
+	{
+		PGXCNodeHandle *handle = connections[i];
+		if (handle->state == DN_CONNECTION_STATE_COPY_IN)
+		{
+			/* msgType + msgLen */
+			if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			}
+
+			handle->outBuffer[handle->outEnd++] = 'd';
+			memcpy(handle->outBuffer + handle->outEnd, &nLen, 4);
+			handle->outEnd += 4;
+			memcpy(handle->outBuffer + handle->outEnd, msg_buf, len);
+			handle->outEnd += len;
+		}
+		else
+		{
+			add_error_message(handle, "Invalid Datanode connection");
+			return EOF;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Encode parameter values to format of DataRow message (the same format is
+ * used in Bind) to prepare for sending down to Datanodes.
+ * The data row is copied to RemoteQueryState.paramval_data.
+ *
+ * Parameters with no known type are transmitted as NULLs so that the count
+ * of parameters stays consistent on the remote side.  Also fills in
+ * rqs_num_params/rqs_param_types from the plan node or, failing that, from
+ * the supplied ParamListInfo.
+ */
+void
+SetDataRowForExtParams(ParamListInfo paraminfo, RemoteQueryState *rq_state)
+{
+	StringInfoData buf;
+	uint16		n16;
+	int			i;
+	int			real_num_params = 0;
+	RemoteQuery *node = (RemoteQuery*) rq_state->combiner.ss.ps.plan;
+
+	/* If there are no parameters, there is no data to BIND. */
+	if (!paraminfo)
+		return;
+
+	Assert(!rq_state->paramval_data);
+
+	/*
+	 * It is necessary to fetch parameters
+	 * before looking at the output value.
+	 */
+	for (i = 0; i < paraminfo->numParams; i++)
+	{
+		ParamExternData *param;
+
+		/* Fixed encoding corruption: '&param' had been mangled into the
+		 * pilcrow character sequence '¶m' by HTML-entity decoding. */
+		param = &paraminfo->params[i];
+
+		if (!OidIsValid(param->ptype) && paraminfo->paramFetch != NULL)
+			(*paraminfo->paramFetch) (paraminfo, i + 1);
+
+		/*
+		 * This is the last parameter found as useful, so we need
+		 * to include all the previous ones to keep silent the remote
+		 * nodes. All the parameters prior to the last usable having no
+		 * type available will be considered as NULL entries.
+		 */
+		if (OidIsValid(param->ptype))
+			real_num_params = i + 1;
+	}
+
+	/*
+	 * If there are no parameters available, simply leave.
+	 * This is possible in the case of a query called through SPI
+	 * and using no parameters.
+	 */
+	if (real_num_params == 0)
+	{
+		rq_state->paramval_data = NULL;
+		rq_state->paramval_len = 0;
+		return;
+	}
+
+	initStringInfo(&buf);
+
+	/* Number of parameter values */
+	n16 = htons(real_num_params);
+	appendBinaryStringInfo(&buf, (char *) &n16, 2);
+
+	/* Parameter values */
+	for (i = 0; i < real_num_params; i++)
+	{
+		/* Fixed encoding corruption here as well ('¶minfo'). */
+		ParamExternData *param = &paraminfo->params[i];
+		uint32		n32;
+
+		/*
+		 * Parameters with no types are considered as NULL and treated as integer
+		 * The same trick is used for dropped columns for remote DML generation.
+		 */
+		if (param->isnull || !OidIsValid(param->ptype))
+		{
+			/* -1 length marks a NULL value in the DataRow format */
+			n32 = htonl(-1);
+			appendBinaryStringInfo(&buf, (char *) &n32, 4);
+		}
+		else
+		{
+			Oid			typOutput;
+			bool		typIsVarlena;
+			Datum		pval;
+			char	   *pstring;
+			int			len;
+
+			/* Get info needed to output the value */
+			getTypeOutputInfo(param->ptype, &typOutput, &typIsVarlena);
+
+			/*
+			 * If we have a toasted datum, forcibly detoast it here to avoid
+			 * memory leakage inside the type's output routine.
+			 */
+			if (typIsVarlena)
+				pval = PointerGetDatum(PG_DETOAST_DATUM(param->value));
+			else
+				pval = param->value;
+
+			/* Convert Datum to string */
+			pstring = OidOutputFunctionCall(typOutput, pval);
+
+			/* copy data to the buffer */
+			len = strlen(pstring);
+			n32 = htonl(len);
+			appendBinaryStringInfo(&buf, (char *) &n32, 4);
+			appendBinaryStringInfo(&buf, pstring, len);
+		}
+	}
+
+
+	/*
+	 * If parameter types are not already set, infer them from
+	 * the paraminfo.
+	 */
+	if (node->rq_num_params > 0)
+	{
+		/*
+		 * Use the already known param types for BIND. Parameter types
+		 * can be already known when the same plan is executed multiple
+		 * times.
+		 */
+		if (node->rq_num_params != real_num_params)
+			elog(ERROR, "Number of user-supplied parameters do not match "
+					"the number of remote parameters");
+		rq_state->rqs_num_params = node->rq_num_params;
+		rq_state->rqs_param_types = node->rq_param_types;
+	}
+	else
+	{
+		rq_state->rqs_num_params = real_num_params;
+		rq_state->rqs_param_types = (Oid *) palloc(sizeof(Oid) * real_num_params);
+		for (i = 0; i < real_num_params; i++)
+			rq_state->rqs_param_types[i] = paraminfo->params[i].ptype;
+	}
+
+	/* Assign the newly allocated data row to paramval */
+	rq_state->paramval_data = buf.data;
+	rq_state->paramval_len = buf.len;
+}
+
+/*
+ * Clear per transaction remote information
+ *
+ * End-of-transaction hook: resets any session parameters queued for the
+ * remote nodes during this transaction.
+ */
+void
+AtEOXact_Remote(void)
+{
+	PGXCNodeResetParams(true);
+}
+
+/*
+ * Invoked when local transaction is about to be committed.
+ * If nodestring is specified commit specified prepared transaction on remote
+ * nodes, otherwise commit remote nodes which are in transaction.
+ */
+void
+PreCommit_Remote(char *prepareGID, char *nodestring, bool preparedLocalNode)
+{
+	struct rusage start_r;
+	struct timeval start_t;
+
+	if (log_gtm_stats)
+		ResetUsageCommon(&start_r, &start_t);
+
+	/*
+	 * Make node connections persistent if we are committing a transaction
+	 * that touched temporary tables. We never drop that flag, so after some
+	 * transaction has created a temp table the session's remote connections
+	 * become persistent.
+	 * We do not need to set that flag if transaction that has created a temp
+	 * table finally aborts - remote connections are not holding temporary
+	 * objects in this case.
+	 *
+	 * NOTE(review): the three lines below contain leftover merge conflict
+	 * markers ('-' / '++') — resolve against the merge base; the '++' form
+	 * is the newer MyXactFlags API.
+	 */
-	if (MyXactAccessedTempRel)
++	if (IS_PGXC_LOCAL_COORDINATOR &&
++		(MyXactFlags & XACT_FLAGS_ACCESSEDTEMPREL))
+		temp_object_included = true;
+
+
+	/*
+	 * OK, everything went fine. At least one remote node is in PREPARED state
+	 * and the transaction is successfully prepared on all the involved nodes.
+	 * Now we are ready to commit the transaction. We need a new GXID to send
+	 * down the remote nodes to execute the forthcoming COMMIT PREPARED
+	 * command. So grab one from the GTM and track it. It will be closed along
+	 * with the main transaction at the end.
+	 */
+	if (nodestring)
+	{
+		Assert(preparedLocalNode);
+		pgxc_node_remote_finish(prepareGID, true, nodestring,
+								GetAuxilliaryTransactionId(),
+								GetTopGlobalTransactionId());
+
+	}
+	else
+		pgxc_node_remote_commit();
+
+	if (log_gtm_stats)
+		ShowUsageCommon("PreCommit_Remote", &start_r, &start_t);
+}
+
+/*
+ * Do abort processing for the transaction. We must abort the transaction on
+ * all the involved nodes. If a node has already prepared a transaction, we run
+ * ROLLBACK PREPARED command on the node. Otherwise, a simple ROLLBACK command
+ * is sufficient.
+ *
+ * We must guard against the case when a transaction is prepared successfully on
+ * all the nodes and some error occurs after we send a COMMIT PREPARED message
+ * to at least one node. Such a transaction must not be aborted to preserve
+ * global consistency. We handle this case by recording the nodes involved in
+ * the transaction at the GTM and keep the transaction open at the GTM so that
+ * it's reported as "in-progress" on all the nodes until resolved.
+ *
+ * Always returns true (callers apparently ignore failure here).
+ */
+bool
+PreAbort_Remote(void)
+{
+ /*
+ * We are about to abort current transaction, and there could be an
+ * unexpected error leaving the node connection in some state requiring
+ * clean up, like COPY or pending query results.
+ * If we are running copy we should send down CopyFail message and read
+ * all possible incoming messages, there could be copy rows (if running
+ * COPY TO) ErrorResponse, ReadyForQuery.
+ * If there are pending results (connection state is DN_CONNECTION_STATE_QUERY)
+ * we just need to read them in and discard, all necessary commands are
+ * already sent. The end of input could be CommandComplete or
+ * PortalSuspended, in either case subsequent ROLLBACK closes the portal.
+ */
+ PGXCNodeAllHandles *all_handles;
+ /* NOTE: C99 variable-length arrays sized by runtime node counts */
+ PGXCNodeHandle *clean_nodes[NumCoords + NumDataNodes];
+ int node_count = 0;
+ int cancel_dn_count = 0, cancel_co_count = 0;
+ int cancel_dn_list[NumDataNodes];
+ int cancel_co_list[NumCoords];
+ int i;
+ struct rusage start_r;
+ struct timeval start_t;
+
+ if (log_gtm_stats)
+ ResetUsageCommon(&start_r, &start_t);
+
+ all_handles = get_current_handles();
+ /*
+ * Find "dirty" coordinator connections.
+ * COPY is never running on a coordinator connections, we just check for
+ * pending data.
+ */
+ for (i = 0; i < all_handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *handle = all_handles->coord_handles[i];
+
+ if (handle->state == DN_CONNECTION_STATE_QUERY)
+ {
+ /*
+ * Forget previous combiner if any since input will be handled by
+ * different one.
+ */
+ handle->combiner = NULL;
+ clean_nodes[node_count++] = handle;
+ cancel_co_list[cancel_co_count++] = i;
+ }
+ }
+
+ /*
+ * The same for data nodes, but cancel COPY if it is running.
+ */
+ for (i = 0; i < all_handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *handle = all_handles->datanode_handles[i];
+
+ if (handle->state == DN_CONNECTION_STATE_QUERY)
+ {
+ /*
+ * Forget previous combiner if any since input will be handled by
+ * different one.
+ */
+ handle->combiner = NULL;
+ clean_nodes[node_count++] = handle;
+ cancel_dn_list[cancel_dn_count++] = i;
+ }
+ else if (handle->state == DN_CONNECTION_STATE_COPY_IN ||
+ handle->state == DN_CONNECTION_STATE_COPY_OUT)
+ {
+ /* Send CopyFail / end-of-copy before draining the connection */
+ DataNodeCopyEnd(handle, true);
+ /*
+ * Forget previous combiner if any since input will be handled by
+ * different one.
+ */
+ handle->combiner = NULL;
+ clean_nodes[node_count++] = handle;
+ cancel_dn_list[cancel_dn_count++] = i;
+ }
+ }
+
+ /*
+ * Cancel running queries on the datanodes and the coordinators.
+ */
+ PoolManagerCancelQuery(cancel_dn_count, cancel_dn_list, cancel_co_count,
+ cancel_co_list);
+
+ /*
+ * Now read and discard any data from the connections found "dirty"
+ */
+ if (node_count > 0)
+ {
+ ResponseCombiner combiner;
+
+ InitResponseCombiner(&combiner, node_count, COMBINE_TYPE_NONE);
+ /*
+ * Make sure there are zeroes in unused fields
+ *
+ * NOTE(review): this memset runs AFTER InitResponseCombiner and zeroes
+ * the leading sizeof(ScanState) bytes of the struct — presumably
+ * InitResponseCombiner does not touch the ScanState prefix, otherwise
+ * its work would be wiped here. Confirm against InitResponseCombiner.
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+ combiner.connections = clean_nodes;
+ combiner.conn_count = node_count;
+ combiner.request_type = REQUEST_TYPE_ERROR;
+
+ pgxc_connections_cleanup(&combiner);
+
+ /* prevent pfree'ing local variable */
+ combiner.connections = NULL;
+
+ CloseCombiner(&combiner);
+ }
+
+ pgxc_node_remote_abort();
+
+ /*
+ * Drop the connections to ensure aborts are handled properly.
+ *
+ * XXX We should really be consulting PersistentConnections parameter and
+ * keep the connections if its set. But as a short term measure, to address
+ * certain issues for aborted transactions, we drop the connections.
+ * Revisit and fix the issue
+ */
+ elog(DEBUG5, "temp_object_included %d", temp_object_included);
+ if (!temp_object_included)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+
+ pfree_pgxc_all_handles(all_handles);
+
+ if (log_gtm_stats)
+ ShowUsageCommon("PreAbort_Remote", &start_r, &start_t);
+
+ return true;
+}
+
+
+/*
+ * Invoked when local transaction is about to be prepared.
+ * If invoked on a Datanode just commit transaction on remote connections,
+ * since secondary sessions are read only and never need to be prepared.
+ * Otherwise run PREPARE on remote connections, where writable commands were
+ * sent (connections marked as not read-only).
+ * If that is explicit PREPARE (issued by client) notify GTM.
+ * In case of implicit PREPARE not involving local node (ex. caused by
+ * INSERT, UPDATE or DELETE) commit prepared transaction immediately.
+ * Return list of node names where transaction was actually prepared, include
+ * the name of the local node if localNode is true.
+ *
+ * Returns NULL on a Datanode, or when the implicit prepare was already
+ * committed here (nodestring is pfree'd and NULLed in that path).
+ */
+char *
+PrePrepare_Remote(char *prepareGID, bool localNode, bool implicit)
+{
+ /* Always include local node if running explicit prepare */
+ char *nodestring;
+ struct rusage start_r;
+ struct timeval start_t;
+
+ if (log_gtm_stats)
+ ResetUsageCommon(&start_r, &start_t);
+
+ /*
+ * Primary session is doing 2PC, just commit secondary processes and exit
+ */
+ if (IS_PGXC_DATANODE)
+ {
+ pgxc_node_remote_commit();
+ return NULL;
+ }
+
+ nodestring = pgxc_node_remote_prepare(prepareGID,
+ !implicit || localNode);
+
+ if (!implicit && IS_PGXC_LOCAL_COORDINATOR)
+ /* Save the node list and gid on GTM. */
+ StartPreparedTranGTM(GetTopGlobalTransactionId(), prepareGID,
+ nodestring);
+
+ /*
+ * If no need to commit on local node go ahead and commit prepared
+ * transaction right away.
+ */
+ if (implicit && !localNode && nodestring)
+ {
+ pgxc_node_remote_finish(prepareGID, true, nodestring,
+ GetAuxilliaryTransactionId(),
+ GetTopGlobalTransactionId());
+ pfree(nodestring);
+ nodestring = NULL;
+ }
+
+ if (log_gtm_stats)
+ ShowUsageCommon("PrePrepare_Remote", &start_r, &start_t);
+
+ return nodestring;
+}
+
+/*
+ * Invoked immediately after local node is prepared.
+ * Notify GTM about completed prepare.
+ *
+ * Only an explicit (client-issued) PREPARE is reported to GTM; implicit
+ * prepares are handled entirely in PrePrepare_Remote/PreCommit_Remote.
+ * prepareGID is accepted for interface symmetry but not used here.
+ */
+void
+PostPrepare_Remote(char *prepareGID, bool implicit)
+{
+ struct rusage start_r;
+ struct timeval start_t;
+
+ if (log_gtm_stats)
+ ResetUsageCommon(&start_r, &start_t);
+
+ if (!implicit)
+ PrepareTranGTM(GetTopGlobalTransactionId());
+
+ if (log_gtm_stats)
+ ShowUsageCommon("PostPrepare_Remote", &start_r, &start_t);
+}
+
+/*
+ * Returns true if 2PC is required for consistent commit: if there was write
+ * activity on two or more nodes within current transaction.
+ *
+ * localWrite - true if the local node has written; it counts as the first
+ *              writer, so one writing remote connection is then enough to
+ *              require 2PC.
+ */
+bool
+IsTwoPhaseCommitRequired(bool localWrite)
+{
+ PGXCNodeAllHandles *handles;
+ bool found = localWrite;
+ int i;
+
+ /* Never run 2PC on Datanode-to-Datanode connection */
+ if (IS_PGXC_DATANODE)
+ return false;
+
- combiner->ss.ps.qual = NIL;
++ if (MyXactFlags & XACT_FLAGS_ACCESSEDTEMPREL)
+ {
+ /* Temp objects cannot be prepared, so 2PC is deliberately skipped */
+ elog(DEBUG1, "Transaction accessed temporary objects - "
+ "2PC will not be used and that can lead to data inconsistencies "
+ "in case of failures");
+ return false;
+ }
+
+ /*
+ * If no XID assigned, no need to run 2PC since neither coordinator nor any
+ * remote nodes did write operation
+ */
+ if (!TransactionIdIsValid(GetTopTransactionIdIfAny()))
+ return false;
+
+ /* 'T' transaction status = in-progress transaction block on that node */
+ handles = get_current_handles();
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+ if (conn->sock != NO_SOCKET && !conn->read_only &&
+ conn->transaction_status == 'T')
+ {
+ if (found)
+ return true; /* second found */
+ else
+ found = true; /* first found */
+ }
+ }
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+ if (conn->sock != NO_SOCKET && !conn->read_only &&
+ conn->transaction_status == 'T')
+ {
+ if (found)
+ return true; /* second found */
+ else
+ found = true; /* first found */
+ }
+ }
+ return false;
+}
+
+/*
+ * Execute COMMIT/ABORT PREPARED issued by the remote client on remote nodes.
+ * Contacts GTM for the list of involved nodes and for work complete
+ * notification. Returns true if prepared transaction on local node needs to be
+ * finished too.
+ *
+ * prepareGID - global identifier of the prepared transaction
+ * commit     - true for COMMIT PREPARED, false for ROLLBACK PREPARED
+ */
+bool
+FinishRemotePreparedTransaction(char *prepareGID, bool commit)
+{
+ char *nodestring;
+ GlobalTransactionId gxid, prepare_gxid;
+ bool prepared_local = false;
+
+ /*
+ * Get the list of nodes involved in this transaction.
+ *
+ * This function returns the GXID of the prepared transaction. It also
+ * returns a fresh GXID which can be used for running COMMIT PREPARED
+ * commands on the remote nodes. Both these GXIDs can then be either
+ * committed or aborted together.
+ *
+ * XXX While I understand that we get the prepared and a new GXID with a
+ * single call, it doesn't look nicer and create confusion. We should
+ * probably split them into two parts. This is used only for explicit 2PC
+ * which should not be very common in XC
+ *
+ * In xc_maintenance_mode mode, we don't fail if the GTM does not have
+ * knowledge about the prepared transaction. That may happen for various
+ * reasons such that an earlier attempt cleaned up it from GTM or GTM was
+ * restarted in between. The xc_maintenance_mode is a kludge to come out of
+ * such situations. So it seems alright to not be too strict about the
+ * state
+ */
+ if ((GetGIDDataGTM(prepareGID, &gxid, &prepare_gxid, &nodestring) < 0) &&
+ !xc_maintenance_mode)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("prepared transaction with identifier \"%s\" does not exist",
+ prepareGID)));
+
+ /*
+ * Please note that with xc_maintenance_mode = on, COMMIT/ROLLBACK PREPARED will not
+ * propagate to remote nodes. Only GTM status is cleaned up.
+ */
+ if (xc_maintenance_mode)
+ {
+ if (commit)
+ {
+ pgxc_node_remote_commit();
+ CommitPreparedTranGTM(prepare_gxid, gxid, 0, NULL);
+ }
+ else
+ {
+ pgxc_node_remote_abort();
+ /* Both the prepared GXID and the fresh one must be rolled back */
+ RollbackTranGTM(prepare_gxid);
+ RollbackTranGTM(gxid);
+ }
+ return false;
+ }
+
+ prepared_local = pgxc_node_remote_finish(prepareGID, commit, nodestring,
+ gxid, prepare_gxid);
+
+ if (commit)
+ {
+ /*
+ * XXX For explicit 2PC, there will be enough delay for any
+ * waited-committed transactions to send a final COMMIT message to the
+ * GTM.
+ */
+ CommitPreparedTranGTM(prepare_gxid, gxid, 0, NULL);
+ }
+ else
+ {
+ RollbackTranGTM(prepare_gxid);
+ RollbackTranGTM(gxid);
+ }
+
+ return prepared_local;
+}
+
+
+/*
+ * Complete previously prepared transactions on remote nodes.
+ * Release remote connection after completion.
+ *
+ * prepareGID   - global identifier of the prepared transaction
+ * commit       - true -> COMMIT PREPARED, false -> ROLLBACK PREPARED
+ * nodestring   - comma-separated node names; MODIFIED in place by strtok()
+ * gxid         - fresh GXID used to run the finish command remotely
+ * prepare_gxid - GXID of the prepared transaction (unused here; kept for
+ *                interface symmetry with callers)
+ *
+ * Returns true if the local node itself appears in nodestring, i.e. the
+ * caller still has to finish the prepared transaction locally.
+ */
+static bool
+pgxc_node_remote_finish(char *prepareGID, bool commit,
+ char *nodestring, GlobalTransactionId gxid,
+ GlobalTransactionId prepare_gxid)
+{
+ char *finish_cmd;
+ PGXCNodeHandle *connections[MaxCoords + MaxDataNodes];
+ int conn_count = 0;
+ ResponseCombiner combiner;
+ PGXCNodeAllHandles *pgxc_handles;
+ bool prepared_local = false;
+ char *nodename;
+ List *nodelist = NIL;
+ List *coordlist = NIL;
+ int i;
+ /*
+ * Now based on the nodestring, run COMMIT/ROLLBACK PREPARED command on the
+ * remote nodes and also finish the transaction locally if required
+ *
+ * NOTE(review): strtok() is not reentrant and destructively writes NULs
+ * into nodestring; callers must pass a writable, private copy.
+ */
+ nodename = strtok(nodestring, ",");
+ while (nodename != NULL)
+ {
+ int nodeIndex;
+ char nodetype;
+
+ /* Get node type and index */
+ nodetype = PGXC_NODE_NONE;
+ nodeIndex = PGXCNodeGetNodeIdFromName(nodename, &nodetype);
+ if (nodetype == PGXC_NODE_NONE)
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("PGXC Node %s: object not defined",
+ nodename)));
+
+ /* Check if the requested node is the self-node or not */
+ if (nodetype == PGXC_NODE_COORDINATOR)
+ {
+ if (nodeIndex == PGXCNodeId - 1)
+ prepared_local = true;
+ else
+ coordlist = lappend_int(coordlist, nodeIndex);
+ }
+ else
+ nodelist = lappend_int(nodelist, nodeIndex);
+
+ nodename = strtok(NULL, ",");
+ }
+
+ /* Nothing to do remotely - only the local node was involved */
+ if (nodelist == NIL && coordlist == NIL)
+ return prepared_local;
+
+ pgxc_handles = get_handles(nodelist, coordlist, false, true);
+
+ /* 64 bytes comfortably covers the command text around the GID */
+ finish_cmd = (char *) palloc(64 + strlen(prepareGID));
+
+ if (commit)
+ sprintf(finish_cmd, "COMMIT PREPARED '%s'", prepareGID);
+ else
+ sprintf(finish_cmd, "ROLLBACK PREPARED '%s'", prepareGID);
+
+ for (i = 0; i < pgxc_handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = pgxc_handles->datanode_handles[i];
+
+ if (pgxc_node_send_gxid(conn, gxid))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send GXID for %s PREPARED command",
+ commit ? "COMMIT" : "ROLLBACK")));
+ }
+
+ if (pgxc_node_send_query(conn, finish_cmd))
+ {
+ /*
+ * Do not bother with clean up, just bomb out. The error handler
+ * will invoke RollbackTransaction which will do the work.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send %s PREPARED command to the node %u",
+ commit ? "COMMIT" : "ROLLBACK", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+
+ for (i = 0; i < pgxc_handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = pgxc_handles->coord_handles[i];
+
+ if (pgxc_node_send_gxid(conn, gxid))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send GXID for %s PREPARED command",
+ commit ? "COMMIT" : "ROLLBACK")));
+ }
+
+ if (pgxc_node_send_query(conn, finish_cmd))
+ {
+ /*
+ * Do not bother with clean up, just bomb out. The error handler
+ * will invoke RollbackTransaction which will do the work.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send %s PREPARED command to the node %u",
+ commit ? "COMMIT" : "ROLLBACK", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+
+ if (conn_count)
+ {
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) ||
+ !validate_combiner(&combiner))
+ {
+ if (combiner.errorMessage)
+ pgxc_node_report_error(&combiner);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to COMMIT the transaction on one or more nodes")));
+ }
+ else
+ CloseCombiner(&combiner);
+ }
+
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+
+ pfree_pgxc_all_handles(pgxc_handles);
+ pfree(finish_cmd);
+
+ return prepared_local;
+}
+
+/*****************************************************************************
+ *
+ * Simplified versions of ExecInitRemoteQuery, ExecRemoteQuery and
+ * ExecEndRemoteQuery: in XCP they are only used to execute simple queries.
+ *
+ *****************************************************************************/
+/*
+ * Initialize executor state for a RemoteQuery plan node.
+ * Sets up the embedded ResponseCombiner, result tuple slot, externally
+ * supplied parameters, and (when needed) an expression context for
+ * evaluating the node-distribution expression. 'eflags' is currently unused.
+ */
+RemoteQueryState *
+ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags)
+{
+ RemoteQueryState *remotestate;
+ ResponseCombiner *combiner;
+
+ /* RemoteQueryState embeds ResponseCombiner as its first member */
+ remotestate = makeNode(RemoteQueryState);
+ combiner = (ResponseCombiner *) remotestate;
+ InitResponseCombiner(combiner, 0, node->combine_type);
+ combiner->ss.ps.plan = (Plan *) node;
+ combiner->ss.ps.state = estate;
+
- true, resultslot, NULL))
++ combiner->ss.ps.qual = NULL;
+
+ combiner->request_type = REQUEST_TYPE_QUERY;
+
+ ExecInitResultTupleSlot(estate, &combiner->ss.ps);
+ ExecAssignResultTypeFromTL((PlanState *) remotestate);
+
+ /*
+ * If there are parameters supplied, get them into a form to be sent to the
+ * Datanodes with bind message. We should not have had done this before.
+ */
+ SetDataRowForExtParams(estate->es_param_list_info, remotestate);
+
+ /* We need expression context to evaluate */
+ if (node->exec_nodes && node->exec_nodes->en_expr)
+ {
+ Expr *expr = node->exec_nodes->en_expr;
+
+ if (IsA(expr, Var) && ((Var *) expr)->vartype == TIDOID)
+ {
+ /* Special case if expression does not need to be evaluated */
+ }
+ else
+ {
+ /* prepare expression evaluation */
+ ExecAssignExprContext(estate, &combiner->ss.ps);
+ }
+ }
+
+ return remotestate;
+}
+
+
+/*
+ * Execute step of PGXC plan.
+ * The step specifies a command to be executed on specified nodes.
+ * On first invocation connections to the data nodes are initialized and
+ * command is executed. Further, as well as within subsequent invocations,
+ * responses are received until step is completed or there is a tuple to emit.
+ * If there is a tuple it is returned, otherwise returned NULL. The NULL result
+ * from the function indicates completed step.
+ * The function returns at most one tuple per invocation.
+ */
+TupleTableSlot *
+ExecRemoteQuery(RemoteQueryState *node)
+{
+ ResponseCombiner *combiner = (ResponseCombiner *) node;
+ RemoteQuery *step = (RemoteQuery *) combiner->ss.ps.plan;
+ TupleTableSlot *resultslot = combiner->ss.ps.ps_ResultTupleSlot;
+
+ /* One-time setup: acquire connections and send the command */
+ if (!node->query_Done)
+ {
+ GlobalTransactionId gxid = InvalidGlobalTransactionId;
+ Snapshot snapshot = GetActiveSnapshot();
+ PGXCNodeHandle **connections = NULL;
+ PGXCNodeHandle *primaryconnection = NULL;
+ int i;
+ int regular_conn_count = 0;
+ int total_conn_count = 0;
+ bool need_tran_block;
+ PGXCNodeAllHandles *pgxc_connections;
+
+ /*
+ * Get connections for Datanodes only, utilities and DDLs
+ * are launched in ExecRemoteUtility
+ */
+ pgxc_connections = get_exec_connections(node, step->exec_nodes,
+ step->exec_type,
+ true);
+
+ if (step->exec_type == EXEC_ON_DATANODES)
+ {
+ connections = pgxc_connections->datanode_handles;
+ total_conn_count = regular_conn_count = pgxc_connections->dn_conn_count;
+ }
+ else if (step->exec_type == EXEC_ON_COORDS)
+ {
+ connections = pgxc_connections->coord_handles;
+ total_conn_count = regular_conn_count = pgxc_connections->co_conn_count;
+ }
+
+ primaryconnection = pgxc_connections->primary_handle;
+
+ /*
+ * Primary connection is counted separately but is included in total_conn_count if used.
+ */
+ if (primaryconnection)
+ regular_conn_count--;
+
+ /*
+ * We save only regular connections, at the time we exit the function
+ * we finish with the primary connection and deal only with regular
+ * connections on subsequent invocations
+ */
+ combiner->node_count = regular_conn_count;
+
+ /*
+ * Start transaction on data nodes if we are in explicit transaction
+ * or going to use extended query protocol or write to multiple nodes
+ */
+ if (step->force_autocommit)
+ need_tran_block = false;
+ else
+ need_tran_block = step->cursor ||
+ (!step->read_only && total_conn_count > 1) ||
+ (TransactionBlockStatusCode() == 'T');
+
+ stat_statement();
+ stat_transaction(total_conn_count);
+
+ gxid = GetCurrentTransactionIdIfAny();
+ /* See if we have a primary node, execute on it first before the others */
+ if (primaryconnection)
+ {
+ if (pgxc_node_begin(1, &primaryconnection, gxid, need_tran_block,
+ step->read_only, PGXC_NODE_DATANODE))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Could not begin transaction on data node.")));
+
+ /* If explicit transaction is needed gxid is already sent */
+ if (!pgxc_start_command_on_connection(primaryconnection, node, snapshot))
+ {
+ pgxc_node_remote_abort();
+ pfree_pgxc_all_handles(pgxc_connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+ Assert(combiner->combine_type == COMBINE_TYPE_SAME);
+
+ pgxc_node_receive(1, &primaryconnection, NULL);
+ /* Make sure the command is completed on the primary node */
+ while (true)
+ {
+ int res = handle_response(primaryconnection, combiner);
+ if (res == RESPONSE_READY)
+ break;
+ else if (res == RESPONSE_EOF)
+ pgxc_node_receive(1, &primaryconnection, NULL);
+ else if (res == RESPONSE_COMPLETE || res == RESPONSE_ERROR)
+ /* Get ReadyForQuery */
+ continue;
+ else if (res == RESPONSE_ASSIGN_GXID)
+ continue;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Unexpected response from data node")));
+ }
+ if (combiner->errorMessage)
+ pgxc_node_report_error(combiner);
+ }
+
+ /* Begin transaction and send the command on each regular connection */
+ for (i = 0; i < regular_conn_count; i++)
+ {
+ if (pgxc_node_begin(1, &connections[i], gxid, need_tran_block,
+ step->read_only, PGXC_NODE_DATANODE))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Could not begin transaction on data node.")));
+
+ /* If explicit transaction is needed gxid is already sent */
+ if (!pgxc_start_command_on_connection(connections[i], node, snapshot))
+ {
+ pgxc_node_remote_abort();
+ pfree_pgxc_all_handles(pgxc_connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+ connections[i]->combiner = combiner;
+ }
+
+ if (step->cursor)
+ {
+ /* Remember connections so the cursor can be fetched/closed later */
+ combiner->cursor = step->cursor;
+ combiner->cursor_count = regular_conn_count;
+ combiner->cursor_connections = (PGXCNodeHandle **) palloc(regular_conn_count * sizeof(PGXCNodeHandle *));
+ memcpy(combiner->cursor_connections, connections, regular_conn_count * sizeof(PGXCNodeHandle *));
+ }
+
+ combiner->connections = connections;
+ combiner->conn_count = regular_conn_count;
+ combiner->current_conn = 0;
+
+ if (combiner->cursor_count)
+ {
+ combiner->conn_count = combiner->cursor_count;
+ memcpy(connections, combiner->cursor_connections,
+ combiner->cursor_count * sizeof(PGXCNodeHandle *));
+ combiner->connections = connections;
+ }
+
+ node->query_Done = true;
+
+ if (step->sort)
+ {
+ SimpleSort *sort = step->sort;
+
+ /*
+ * First message is already in the buffer
+ * Further fetch will be under tuplesort control
+ * If query does not produce rows tuplesort will not
+ * be initialized
+ */
+ combiner->tuplesortstate = tuplesort_begin_merge(
+ resultslot->tts_tupleDescriptor,
+ sort->numCols,
+ sort->sortColIdx,
+ sort->sortOperators,
+ sort->sortCollations,
+ sort->nullsFirst,
+ combiner,
+ work_mem);
+ }
+ }
+
+ /* Emit next tuple: via tuplesort merge if sorting, else straight fetch */
+ if (combiner->tuplesortstate)
+ {
+ if (tuplesort_gettupleslot((Tuplesortstate *) combiner->tuplesortstate,
- combiner->ss.ps.qual = NIL;
++ true, true, resultslot, NULL))
+ return resultslot;
+ else
+ ExecClearTuple(resultslot);
+ }
+ else
+ {
+ TupleTableSlot *slot = FetchTuple(combiner);
+ if (!TupIsNull(slot))
+ return slot;
+ }
+
+ if (combiner->errorMessage)
+ pgxc_node_report_error(combiner);
+
+ return NULL;
+}
+
+
+/*
+ * Clean up and discard any data on the data node connections that might not
+ * handled yet, including pending on the remote connection.
+ *
+ * Frees the combiner's row buffer, drains every remaining connection, and
+ * releases tuplesort resources if a merge sort was in progress.
+ */
+static void
+pgxc_connections_cleanup(ResponseCombiner *combiner)
+{
+ /* clean up the buffer */
+ list_free_deep(combiner->rowBuffer);
+ combiner->rowBuffer = NIL;
+
+ /*
+ * Read in and discard remaining data from the connections, if any
+ */
+ combiner->current_conn = 0;
+ while (combiner->conn_count > 0)
+ {
+ int res;
+ PGXCNodeHandle *conn = combiner->connections[combiner->current_conn];
+
+ /*
+ * Possible if we are doing merge sort.
+ * We can do usual procedure and move connections around since we are
+ * cleaning up and do not care what connection at what position
+ */
+ if (conn == NULL)
+ {
+ REMOVE_CURR_CONN(combiner);
+ continue;
+ }
+
+ /* throw away current message that may be in the buffer */
+ if (combiner->currentRow)
+ {
+ pfree(combiner->currentRow);
+ combiner->currentRow = NULL;
+ }
+
+ /* no data is expected */
+ if (conn->state == DN_CONNECTION_STATE_IDLE ||
+ conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
+ {
+ REMOVE_CURR_CONN(combiner);
+ continue;
+ }
+
+ /*
+ * Connection owner is different, so none of our data is pending at
+ * the connection, nothing to read in.
+ */
+ if (conn->combiner && conn->combiner != combiner)
+ {
+ REMOVE_CURR_CONN(combiner);
+ continue;
+ }
+
+ res = handle_response(conn, combiner);
+ if (res == RESPONSE_EOF)
+ {
+ /* Bounded wait so a dead node cannot hang the cleanup forever */
+ struct timeval timeout;
+ timeout.tv_sec = END_QUERY_TIMEOUT / 1000;
+ timeout.tv_usec = (END_QUERY_TIMEOUT % 1000) * 1000;
+
+ if (pgxc_node_receive(1, &conn, &timeout))
+ elog(LOG, "Failed to read response from data nodes when ending query");
+ }
+ }
+
+ /*
+ * Release tuplesort resources
+ */
+ if (combiner->tuplesortstate)
+ {
+ /*
+ * Free these before tuplesort_end, because these arrays may appear
+ * in the tuplesort's memory context, tuplesort_end deletes this
+ * context and may invalidate the memory.
+ * We still want to free them here, because these may be in different
+ * context.
+ */
+ if (combiner->tapenodes)
+ {
+ pfree(combiner->tapenodes);
+ combiner->tapenodes = NULL;
+ }
+ if (combiner->tapemarks)
+ {
+ pfree(combiner->tapemarks);
+ combiner->tapemarks = NULL;
+ }
+ /*
+ * tuplesort_end invalidates minimal tuple if it is in the slot because
+ * deletes the TupleSort memory context, causing seg fault later when
+ * releasing tuple table
+ */
+ ExecClearTuple(combiner->ss.ps.ps_ResultTupleSlot);
+ tuplesort_end((Tuplesortstate *) combiner->tuplesortstate);
+ combiner->tuplesortstate = NULL;
+ }
+}
+
+
+/*
+ * End the remote query
+ *
+ * Drains/cleans remote connections, releases bound-parameter buffers (the
+ * plan may be re-executed with different params), and frees the node state.
+ */
+void
+ExecEndRemoteQuery(RemoteQueryState *node)
+{
+ ResponseCombiner *combiner = (ResponseCombiner *) node;
+
+ /*
+ * Clean up remote connections
+ */
+ pgxc_connections_cleanup(combiner);
+
+ /*
+ * Clean up parameters if they were set, since plan may be reused
+ */
+ if (node->paramval_data)
+ {
+ pfree(node->paramval_data);
+ node->paramval_data = NULL;
+ node->paramval_len = 0;
+ }
+
+ CloseCombiner(combiner);
+ pfree(node);
+}
+
+
+/**********************************************
+ *
+ * Routines to support RemoteSubplan plan node
+ *
+ **********************************************/
+
+
+/*
+ * The routine walks recursively over the plan tree and changes cursor names of
+ * RemoteSubplan nodes to make them different from launched from the other
+ * datanodes. The routine changes cursor names in place, so caller should
+ * take writable copy of the plan tree.
+ *
+ * plan   - root of the (sub)tree to transform; may be a Plan node or a List
+ * unique - per-datanode discriminator stored into each RemoteSubplan
+ */
+void
+RemoteSubplanMakeUnique(Node *plan, int unique)
+{
+ if (plan == NULL)
+ return;
+
+ /* A List is handled by recursing into each member */
+ if (IsA(plan, List))
+ {
+ ListCell *lc;
+ foreach(lc, (List *) plan)
+ {
+ RemoteSubplanMakeUnique(lfirst(lc), unique);
+ }
+ return;
+ }
+
+ /*
+ * Transform SharedQueue name
+ */
+ if (IsA(plan, RemoteSubplan))
+ {
+ ((RemoteSubplan *)plan)->unique = unique;
+ }
+ /* Otherwise it is a Plan descendant */
+ RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->lefttree, unique);
+ RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->righttree, unique);
+ /* Transform special cases that hold child plans outside lefttree/righttree */
+ switch (nodeTag(plan))
+ {
+ case T_Append:
+ RemoteSubplanMakeUnique((Node *) ((Append *) plan)->appendplans,
+ unique);
+ break;
+ case T_MergeAppend:
+ RemoteSubplanMakeUnique((Node *) ((MergeAppend *) plan)->mergeplans,
+ unique);
+ break;
+ case T_BitmapAnd:
+ RemoteSubplanMakeUnique((Node *) ((BitmapAnd *) plan)->bitmapplans,
+ unique);
+ break;
+ case T_BitmapOr:
+ RemoteSubplanMakeUnique((Node *) ((BitmapOr *) plan)->bitmapplans,
+ unique);
+ break;
+ case T_SubqueryScan:
+ RemoteSubplanMakeUnique((Node *) ((SubqueryScan *) plan)->subplan,
+ unique);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * Walker context for determine_param_types(): rparams is the array of
+ * RemoteParam descriptors being filled in; defineParams is the set of
+ * PARAM_EXEC param ids whose types are still unknown.
+ */
+struct find_params_context
+{
+ RemoteParam *rparams;
+ Bitmapset *defineParams;
+};
+
+/*
+ * Expression-tree walker: when a PARAM_EXEC Param still listed in
+ * context->defineParams is found, copy its type into the matching
+ * RemoteParam entry and remove it from the pending set.
+ * Returns true (stopping the walk) once every pending param is resolved.
+ */
+static bool
+determine_param_types_walker(Node *node, struct find_params_context *context)
+{
+ if (node == NULL)
+ return false;
+
+ if (IsA(node, Param))
+ {
+ Param *param = (Param *) node;
+ int paramno = param->paramid;
+
+ if (param->paramkind == PARAM_EXEC &&
+ bms_is_member(paramno, context->defineParams))
+ {
+ /* Linear scan; assumes a matching entry always exists in rparams */
+ RemoteParam *cur = context->rparams;
+ while (cur->paramkind != PARAM_EXEC || cur->paramid != paramno)
+ cur++;
+ cur->paramtype = param->paramtype;
+ context->defineParams = bms_del_member(context->defineParams,
+ paramno);
+ return bms_is_empty(context->defineParams);
+ }
+ }
+ return expression_tree_walker(node, determine_param_types_walker,
+ (void *) context);
+
+}
+
+/*
+ * Scan expressions in the plan tree to find Param nodes and get data types
+ * from them
+ */
+static bool
+determine_param_types(Plan *plan, struct find_params_context *context)
+{
+ Bitmapset *intersect;
+
+ if (plan == NULL)
+ return false;
+
+ intersect = bms_intersect(plan->allParam, context->defineParams);
+ if (bms_is_empty(intersect))
+ {
+ /* the subplan does not depend on params we are interested in */
+ bms_free(intersect);
+ return false;
+ }
+ bms_free(intersect);
+
+ /* scan target list */
+ if (expression_tree_walker((Node *) plan->targetlist,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ /* scan qual */
+ if (expression_tree_walker((Node *) plan->qual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+
+ /* Check additional node-type-specific fields */
+ switch (nodeTag(plan))
+ {
+ case T_Result:
+ if (expression_tree_walker((Node *) ((Result *) plan)->resconstantqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_SeqScan:
+ case T_SampleScan:
+ case T_CteScan:
+ break;
+
+ case T_IndexScan:
+ if (expression_tree_walker((Node *) ((IndexScan *) plan)->indexqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_IndexOnlyScan:
+ if (expression_tree_walker((Node *) ((IndexOnlyScan *) plan)->indexqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_BitmapIndexScan:
+ if (expression_tree_walker((Node *) ((BitmapIndexScan *) plan)->indexqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_BitmapHeapScan:
+ if (expression_tree_walker((Node *) ((BitmapHeapScan *) plan)->bitmapqualorig,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_TidScan:
+ if (expression_tree_walker((Node *) ((TidScan *) plan)->tidquals,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_SubqueryScan:
+ if (determine_param_types(((SubqueryScan *) plan)->subplan, context))
+ return true;
+ break;
+
+ case T_FunctionScan:
+ if (expression_tree_walker((Node *) ((FunctionScan *) plan)->functions,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_ValuesScan:
+ if (expression_tree_walker((Node *) ((ValuesScan *) plan)->values_lists,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_ModifyTable:
+ {
+ ListCell *l;
+
+ foreach(l, ((ModifyTable *) plan)->plans)
+ {
+ if (determine_param_types((Plan *) lfirst(l), context))
+ return true;
+ }
+ }
+ break;
+
+ case T_RemoteSubplan:
+ break;
+
+ case T_Append:
+ {
+ ListCell *l;
+
+ foreach(l, ((Append *) plan)->appendplans)
+ {
+ if (determine_param_types((Plan *) lfirst(l), context))
+ return true;
+ }
+ }
+ break;
+
+ case T_MergeAppend:
+ {
+ ListCell *l;
+
+ foreach(l, ((MergeAppend *) plan)->mergeplans)
+ {
+ if (determine_param_types((Plan *) lfirst(l), context))
+ return true;
+ }
+ }
+ break;
+
+ case T_BitmapAnd:
+ {
+ ListCell *l;
+
+ foreach(l, ((BitmapAnd *) plan)->bitmapplans)
+ {
+ if (determine_param_types((Plan *) lfirst(l), context))
+ return true;
+ }
+ }
+ break;
+
+ case T_BitmapOr:
+ {
+ ListCell *l;
+
+ foreach(l, ((BitmapOr *) plan)->bitmapplans)
+ {
+ if (determine_param_types((Plan *) lfirst(l), context))
+ return true;
+ }
+ }
+ break;
+
+ case T_NestLoop:
+ if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_MergeJoin:
+ if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ if (expression_tree_walker((Node *) ((MergeJoin *) plan)->mergeclauses,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_HashJoin:
+ if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ if (expression_tree_walker((Node *) ((HashJoin *) plan)->hashclauses,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_Limit:
+ if (expression_tree_walker((Node *) ((Limit *) plan)->limitOffset,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ if (expression_tree_walker((Node *) ((Limit *) plan)->limitCount,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_RecursiveUnion:
+ break;
+
+ case T_LockRows:
+ break;
+
+ case T_WindowAgg:
+ if (expression_tree_walker((Node *) ((WindowAgg *) plan)->startOffset,
+ determine_param_types_walker,
+ (void *) context))
+ if (expression_tree_walker((Node *) ((WindowAgg *) plan)->endOffset,
+ determine_param_types_walker,
+ (void *) context))
+ break;
+
+ case T_Hash:
+ case T_Agg:
+ case T_Material:
+ case T_Sort:
+ case T_Unique:
+ case T_SetOp:
+ case T_Group:
+ break;
+
+ default:
+ elog(ERROR, "unrecognized node type: %d",
+ (int) nodeTag(plan));
+ }
+
+
+ /* recurse into subplans */
+ return determine_param_types(plan->lefttree, context) ||
+ determine_param_types(plan->righttree, context);
+}
+
+
+RemoteSubplanState *
+ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags)
+{
+ RemoteStmt rstmt;
+ RemoteSubplanState *remotestate;
+ ResponseCombiner *combiner;
+ CombineType combineType;
+ struct rusage start_r;
+ struct timeval start_t;
+
+ if (log_remotesubplan_stats)
+ ResetUsageCommon(&start_r, &start_t);
+
+ remotestate = makeNode(RemoteSubplanState);
+ combiner = (ResponseCombiner *) remotestate;
+ /*
+ * We do not need to combine row counts if we will receive intermediate
+ * results or if we won't return row count.
+ */
+ if (IS_PGXC_DATANODE || estate->es_plannedstmt->commandType == CMD_SELECT)
+ {
+ combineType = COMBINE_TYPE_NONE;
+ remotestate->execOnAll = node->execOnAll;
+ }
+ else
+ {
+ if (node->execOnAll)
+ combineType = COMBINE_TYPE_SUM;
+ else
+ combineType = COMBINE_TYPE_SAME;
+ /*
+ * If we are updating replicated table we should run plan on all nodes.
+ * We are choosing single node only to read
+ */
+ remotestate->execOnAll = true;
+ }
+ remotestate->execNodes = list_copy(node->nodeList);
+ InitResponseCombiner(combiner, 0, combineType);
+ combiner->ss.ps.plan = (Plan *) node;
+ combiner->ss.ps.state = estate;
+
- true, resultslot, NULL))
++ combiner->ss.ps.qual = NULL;
+
+ combiner->request_type = REQUEST_TYPE_QUERY;
+
+ ExecInitResultTupleSlot(estate, &combiner->ss.ps);
+ ExecAssignResultTypeFromTL((PlanState *) remotestate);
+
+ /*
+ * We optimize execution if we going to send down query to next level
+ */
+ remotestate->local_exec = false;
+ if (IS_PGXC_DATANODE)
+ {
+ if (remotestate->execNodes == NIL)
+ {
+ /*
+ * Special case, if subplan is not distributed, like Result, or
+ * query against catalog tables only.
+ * We are only interested in filtering out the subplan results and
+ * get only those we are interested in.
+ * XXX we may want to prevent multiple executions in this case
+ * either, to achieve this we will set single execNode on planning
+ * time and this case would never happen, this code branch could
+ * be removed.
+ */
+ remotestate->local_exec = true;
+ }
+ else if (!remotestate->execOnAll)
+ {
+ /*
+ * XXX We should change planner and remove this flag.
+ * We want only one node is producing the replicated result set,
+ * and planner should choose that node - it is too hard to determine
+ * right node at execution time, because it should be guaranteed
+ * that all consumers make the same decision.
+ * For now always execute replicated plan on local node to save
+ * resources.
+ */
+
+ /*
+ * Make sure local node is in execution list
+ */
+ if (list_member_int(remotestate->execNodes, PGXCNodeId-1))
+ {
+ list_free(remotestate->execNodes);
+ remotestate->execNodes = NIL;
+ remotestate->local_exec = true;
+ }
+ else
+ {
+ /*
+ * To support, we need to connect to some producer, so
+ * each producer should be prepared to serve rows for random
+ * number of consumers. It is hard, because new consumer may
+ * connect after producing is started, on the other hand,
+ * absence of expected consumer is a problem too.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Getting replicated results from remote node is not supported")));
+ }
+ }
+ }
+
+ /*
+ * If we are going to execute subplan locally or doing explain initialize
+ * the subplan. Otherwise have remote node doing that.
+ */
+ if (remotestate->local_exec || (eflags & EXEC_FLAG_EXPLAIN_ONLY))
+ {
+ outerPlanState(remotestate) = ExecInitNode(outerPlan(node), estate,
+ eflags);
+ if (node->distributionNodes)
+ {
+ Oid distributionType = InvalidOid;
+ TupleDesc typeInfo;
+
+ typeInfo = combiner->ss.ps.ps_ResultTupleSlot->tts_tupleDescriptor;
+ if (node->distributionKey != InvalidAttrNumber)
+ {
+ Form_pg_attribute attr;
+ attr = typeInfo->attrs[node->distributionKey - 1];
+ distributionType = attr->atttypid;
+ }
+ /* Set up locator */
+ remotestate->locator = createLocator(node->distributionType,
+ RELATION_ACCESS_INSERT,
+ distributionType,
+ LOCATOR_LIST_LIST,
+ 0,
+ (void *) node->distributionNodes,
+ (void **) &remotestate->dest_nodes,
+ false);
+ }
+ else
+ remotestate->locator = NULL;
+ }
+
+ /*
+ * Encode subplan if it will be sent to remote nodes
+ */
+ if (remotestate->execNodes && !(eflags & EXEC_FLAG_EXPLAIN_ONLY))
+ {
+ ParamListInfo ext_params;
+ /* Encode plan if we are going to execute it on other nodes */
+ rstmt.type = T_RemoteStmt;
+ if (node->distributionType == LOCATOR_TYPE_NONE && IS_PGXC_DATANODE)
+ {
+ /*
+ * There are cases when planner can not determine distribution of a
+ * subplan, in particular it does not determine distribution of
+ * subquery nodes. Such subplans executed from current location
+ * (node) and combine all results, like from coordinator nodes.
+ * However, if there are multiple locations where distributed
+ * executor is running this node, and there are more of
+ * RemoteSubplan plan nodes in the subtree there will be a problem -
+ * Instances of the inner RemoteSubplan nodes will be using the same
+ * SharedQueue, causing error. To avoid this problem we should
+ * traverse the subtree and change SharedQueue name to make it
+ * unique.
+ */
+ RemoteSubplanMakeUnique((Node *) outerPlan(node), PGXCNodeId);
+ }
+ rstmt.planTree = outerPlan(node);
+ /*
+ * If datanode launch further execution of a command it should tell
+ * it is a SELECT, otherwise secondary data nodes won't return tuples
+ * expecting there will be nothing to return.
+ */
+ if (IsA(outerPlan(node), ModifyTable))
+ {
+ rstmt.commandType = estate->es_plannedstmt->commandType;
+ rstmt.hasReturning = estate->es_plannedstmt->hasReturning;
+ rstmt.resultRelations = estate->es_plannedstmt->resultRelations;
+ }
+ else
+ {
+ rstmt.commandType = CMD_SELECT;
+ rstmt.hasReturning = false;
+ rstmt.resultRelations = NIL;
+ }
+ rstmt.rtable = estate->es_range_table;
+ rstmt.subplans = estate->es_plannedstmt->subplans;
+ rstmt.nParamExec = estate->es_plannedstmt->nParamExec;
+ ext_params = estate->es_param_list_info;
+ rstmt.nParamRemote = (ext_params ? ext_params->numParams : 0) +
+ bms_num_members(node->scan.plan.allParam);
+ if (rstmt.nParamRemote > 0)
+ {
+ Bitmapset *tmpset;
+ int i;
+ int paramno;
+
+ /* Allocate enough space */
+ rstmt.remoteparams = (RemoteParam *) palloc(rstmt.nParamRemote *
+ sizeof(RemoteParam));
+ paramno = 0;
+ if (ext_params)
+ {
+ for (i = 0; i < ext_params->numParams; i++)
+ {
+ ParamExternData *param = &ext_params->params[i];
+ /*
+ * If parameter type is not yet defined but can be defined
+ * do that
+ */
+ if (!OidIsValid(param->ptype) && ext_params->paramFetch)
+ (*ext_params->paramFetch) (ext_params, i + 1);
+
+ /*
+ * If the parameter type is still not defined, assume that
+ * it is unused. But we put a default INT4OID type for such
+ * unused parameters to keep the parameter pushdown code
+ * happy.
+ *
+ * These unused parameters are never accessed during
+ * execution and we will just a null value for these
+ * "dummy" parameters. But including them here ensures that
+ * we send down the parameters in the correct order and at
+ * the position that the datanode needs
+ */
+ if (OidIsValid(param->ptype))
+ {
+ rstmt.remoteparams[paramno].paramused = 1;
+ rstmt.remoteparams[paramno].paramtype = param->ptype;
+ }
+ else
+ {
+ rstmt.remoteparams[paramno].paramused = 0;
+ rstmt.remoteparams[paramno].paramtype = INT4OID;
+ }
+
+ rstmt.remoteparams[paramno].paramkind = PARAM_EXTERN;
+ rstmt.remoteparams[paramno].paramid = i + 1;
+ paramno++;
+ }
+ /* store actual number of parameters */
+ rstmt.nParamRemote = paramno;
+ }
+
+ if (!bms_is_empty(node->scan.plan.allParam))
+ {
+ Bitmapset *defineParams = NULL;
+ tmpset = bms_copy(node->scan.plan.allParam);
+ while ((i = bms_first_member(tmpset)) >= 0)
+ {
+ ParamExecData *prmdata;
+
+ prmdata = &(estate->es_param_exec_vals[i]);
+ rstmt.remoteparams[paramno].paramkind = PARAM_EXEC;
+ rstmt.remoteparams[paramno].paramid = i;
+ rstmt.remoteparams[paramno].paramtype = prmdata->ptype;
+ rstmt.remoteparams[paramno].paramused = 1;
+ /* Will scan plan tree to find out data type of the param */
+ if (prmdata->ptype == InvalidOid)
+ defineParams = bms_add_member(defineParams, i);
+ paramno++;
+ }
+ /* store actual number of parameters */
+ rstmt.nParamRemote = paramno;
+ bms_free(tmpset);
+ if (!bms_is_empty(defineParams))
+ {
+ struct find_params_context context;
+ bool all_found;
+
+ context.rparams = rstmt.remoteparams;
+ context.defineParams = defineParams;
+
+ all_found = determine_param_types(node->scan.plan.lefttree,
+ &context);
+ /*
+ * Remove not defined params from the list of remote params.
+ * If they are not referenced no need to send them down
+ */
+ if (!all_found)
+ {
+ for (i = 0; i < rstmt.nParamRemote; i++)
+ {
+ if (rstmt.remoteparams[i].paramkind == PARAM_EXEC &&
+ bms_is_member(rstmt.remoteparams[i].paramid,
+ context.defineParams))
+ {
+ /* Copy last parameter inplace */
+ rstmt.nParamRemote--;
+ if (i < rstmt.nParamRemote)
+ rstmt.remoteparams[i] =
+ rstmt.remoteparams[rstmt.nParamRemote];
+ /* keep current in the same position */
+ i--;
+ }
+ }
+ }
+ bms_free(context.defineParams);
+ }
+ }
+ remotestate->nParamRemote = rstmt.nParamRemote;
+ remotestate->remoteparams = rstmt.remoteparams;
+ }
+ else
+ rstmt.remoteparams = NULL;
+ rstmt.rowMarks = estate->es_plannedstmt->rowMarks;
+ rstmt.distributionKey = node->distributionKey;
+ rstmt.distributionType = node->distributionType;
+ rstmt.distributionNodes = node->distributionNodes;
+ rstmt.distributionRestrict = node->distributionRestrict;
+
+ set_portable_output(true);
+ remotestate->subplanstr = nodeToString(&rstmt);
+ set_portable_output(false);
+
+ /*
+ * Connect to remote nodes and send down subplan
+ */
+ if (!(eflags & EXEC_FLAG_SUBPLAN))
+ ExecFinishInitRemoteSubplan(remotestate);
+ }
+ remotestate->bound = false;
+ /*
+ * It does not makes sense to merge sort if there is only one tuple source.
+ * By the contract it is already sorted
+ */
+ if (node->sort && remotestate->execOnAll &&
+ list_length(remotestate->execNodes) > 1)
+ combiner->merge_sort = true;
+
+ if (log_remotesubplan_stats)
+ ShowUsageCommon("ExecInitRemoteSubplan", &start_r, &start_t);
+
+ return remotestate;
+}
+
+
/*
 * ExecFinishInitRemoteSubplan
 *		Second phase of RemoteSubplan initialization: acquire Datanode
 *		connections and send down the serialized subplan so it is stored
 *		there as a prepared statement under the plan's cursor name.
 *
 * Does nothing if connections are already set up, if there is no encoded
 * subplan (local-only or EXPLAIN-only execution), or if this node is not
 * in the plan's distributionRestrict list and hence expects no results.
 *
 * NOTE(review): errors during the send loop reset conn_count and free the
 * connection array before reporting, presumably to keep the combiner from
 * touching half-initialized connections during cleanup — confirm against
 * ValidateAndCloseCombiner.
 */
void
ExecFinishInitRemoteSubplan(RemoteSubplanState *node)
{
	ResponseCombiner *combiner = (ResponseCombiner *) node;
	RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
	EState	   *estate = combiner->ss.ps.state;
	Oid		   *paramtypes = NULL;
	GlobalTransactionId gxid = InvalidGlobalTransactionId;
	Snapshot	snapshot;
	TimestampTz timestamp;
	int			i;
	bool		is_read_only;
	char		cursor[NAMEDATALEN];

	/*
	 * Name is required to store plan as a statement
	 */
	Assert(plan->cursor);

	/* Append the uniquifying suffix if the plan was made unique */
	if (plan->unique)
		snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique);
	else
		strncpy(cursor, plan->cursor, NAMEDATALEN);

	/* If it is already fully initialized nothing to do */
	if (combiner->connections)
		return;

	/* local only or explain only execution */
	if (node->subplanstr == NULL)
		return;

	/*
	 * Check if any results are planned to be received here.
	 * Otherwise it does not make sense to send out the subplan.
	 */
	if (IS_PGXC_DATANODE && plan->distributionRestrict &&
			!list_member_int(plan->distributionRestrict, PGXCNodeId - 1))
		return;

	/*
	 * Acquire connections and send down subplan where it will be stored
	 * as a prepared statement.
	 * That does not require transaction id or snapshot, so does not send them
	 * here, postpone till bind.
	 */
	if (node->execOnAll)
	{
		/* Distributed plan: need a connection to every execution node */
		PGXCNodeAllHandles *pgxc_connections;

		pgxc_connections = get_handles(node->execNodes, NIL, false, true);
		combiner->conn_count = pgxc_connections->dn_conn_count;
		combiner->connections = pgxc_connections->datanode_handles;
		combiner->current_conn = 0;
		pfree(pgxc_connections);
	}
	else
	{
		/* Replicated plan: any single node will do */
		combiner->connections = (PGXCNodeHandle **) palloc(sizeof(PGXCNodeHandle *));
		combiner->connections[0] = get_any_handle(node->execNodes);
		combiner->conn_count = 1;
		combiner->current_conn = 0;
	}

	gxid = GetCurrentTransactionIdIfAny();

	/* extract parameter data types */
	if (node->nParamRemote > 0)
	{
		paramtypes = (Oid *) palloc(node->nParamRemote * sizeof(Oid));
		for (i = 0; i < node->nParamRemote; i++)
			paramtypes[i] = node->remoteparams[i].paramtype;
	}

	/* send down subplan */
	snapshot = GetActiveSnapshot();
	timestamp = GetCurrentGTMStartTimestamp();

	/*
	 * Datanode should not send down statements that may modify
	 * the database. Postgres assumes that all sessions under the same
	 * postmaster have different xids. That may cause a locking problem.
	 * Shared locks acquired for reading still work fine.
	 */
	is_read_only = IS_PGXC_DATANODE ||
			!IsA(outerPlan(plan), ModifyTable);

	/*
	 * For each connection: begin the transaction, send the GTM timestamp,
	 * snapshot and command id, then the plan itself, and flush.
	 */
	for (i = 0; i < combiner->conn_count; i++)
	{
		PGXCNodeHandle *connection = combiner->connections[i];

		if (pgxc_node_begin(1, &connection, gxid, true,
							is_read_only, PGXC_NODE_DATANODE))
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Could not begin transaction on data node.")));

		if (pgxc_node_send_timestamp(connection, timestamp))
		{
			combiner->conn_count = 0;
			pfree(combiner->connections);
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to send command to data nodes")));
		}
		if (snapshot && pgxc_node_send_snapshot(connection, snapshot))
		{
			combiner->conn_count = 0;
			pfree(combiner->connections);
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to send snapshot to data nodes")));
		}
		if (pgxc_node_send_cmd_id(connection, estate->es_snapshot->curcid) < 0 )
		{
			combiner->conn_count = 0;
			pfree(combiner->connections);
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to send command ID to data nodes")));
		}
		pgxc_node_send_plan(connection, cursor, "Remote Subplan",
							node->subplanstr, node->nParamRemote, paramtypes);
		if (pgxc_node_flush(connection))
		{
			combiner->conn_count = 0;
			pfree(combiner->connections);
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to send subplan to data nodes")));
		}
	}
}
+
+
+static void
+append_param_data(StringInfo buf, Oid ptype, int pused, Datum value, bool isnull)
+{
+ uint32 n32;
+
+ /* Assume unused parameters to have null values */
+ if (!pused)
+ ptype = INT4OID;
+
+ if (isnull)
+ {
+ n32 = htonl(-1);
+ appendBinaryStringInfo(buf, (char *) &n32, 4);
+ }
+ else
+ {
+ Oid typOutput;
+ bool typIsVarlena;
+ Datum pval;
+ char *pstring;
+ int len;
+
+ /* Get info needed to output the value */
+ getTypeOutputInfo(ptype, &typOutput, &typIsVarlena);
+
+ /*
+ * If we have a toasted datum, forcibly detoast it here to avoid
+ * memory leakage inside the type's output routine.
+ */
+ if (typIsVarlena)
+ pval = PointerGetDatum(PG_DETOAST_DATUM(value));
+ else
+ pval = value;
+
+ /* Convert Datum to string */
+ pstring = OidOutputFunctionCall(typOutput, pval);
+
+ /* copy data to the buffer */
+ len = strlen(pstring);
+ n32 = htonl(len);
+ appendBinaryStringInfo(buf, (char *) &n32, 4);
+ appendBinaryStringInfo(buf, pstring, len);
+ }
+}
+
+
/*
 * encode_parameters
 *		Pack 'nparams' parameter values into a wire-format buffer: a 16-bit
 *		network-order parameter count followed by one length-prefixed text
 *		value per parameter (see append_param_data).
 *
 * The encoded buffer is palloc'd in planstate's per-tuple expression
 * context (which is reset first) and returned via *result; the return
 * value is the buffer length in bytes.  PARAM_EXEC parameters backed by an
 * unevaluated init-plan are evaluated here via ExecSetParamPlan.
 */
static int encode_parameters(int nparams, RemoteParam *remoteparams,
							 PlanState *planstate, char** result)
{
	EState	   *estate = planstate->state;
	StringInfoData buf;
	uint16		n16;
	int			i;
	ExprContext *econtext;
	MemoryContext oldcontext;

	/* Make sure there is an expression context to evaluate parameters in */
	if (planstate->ps_ExprContext == NULL)
		ExecAssignExprContext(estate, planstate);

	econtext = planstate->ps_ExprContext;
	oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
	MemoryContextReset(econtext->ecxt_per_tuple_memory);

	initStringInfo(&buf);

	/* Number of parameter values */
	n16 = htons(nparams);
	appendBinaryStringInfo(&buf, (char *) &n16, 2);

	/* Parameter values */
	for (i = 0; i < nparams; i++)
	{
		RemoteParam *rparam = &remoteparams[i];
		int			ptype = rparam->paramtype;
		int			pused = rparam->paramused;

		if (rparam->paramkind == PARAM_EXTERN)
		{
			/* Externally supplied parameter: paramid is 1-based */
			ParamExternData *param;
			param = &(estate->es_param_list_info->params[rparam->paramid - 1]);
			append_param_data(&buf, ptype, pused, param->value, param->isnull);
		}
		else
		{
			/* Executor-internal parameter: paramid indexes es_param_exec_vals */
			ParamExecData *param;
			param = &(estate->es_param_exec_vals[rparam->paramid]);
			if (param->execPlan)
			{
				/* Parameter not evaluated yet, so go do it */
				ExecSetParamPlan((SubPlanState *) param->execPlan,
								 planstate->ps_ExprContext);
				/* ExecSetParamPlan should have processed this param... */
				Assert(param->execPlan == NULL);
			}
			/* A parameter never marked 'done' is sent as NULL */
			if (!param->done)
				param->isnull = true;
			append_param_data(&buf, ptype, pused, param->value, param->isnull);
		}
	}

	/* Take data from the buffer */
	*result = palloc(buf.len);
	memcpy(*result, buf.data, buf.len);
	MemoryContextSwitchTo(oldcontext);
	return buf.len;
}
+
+
+TupleTableSlot *
+ExecRemoteSubplan(RemoteSubplanState *node)
+{
+ ResponseCombiner *combiner = (ResponseCombiner *) node;
+ RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
+ EState *estate = combiner->ss.ps.state;
+ TupleTableSlot *resultslot = combiner->ss.ps.ps_ResultTupleSlot;
+ struct rusage start_r;
+ struct timeval start_t;
+
+ /*
+ * We allow combiner->conn_count == 0 after node initialization
+ * if we figured out that current node won't receive any result
+ * because of distributionRestrict is set by planner.
+ * But we should distinguish this case from others, when conn_count is 0.
+ * That is possible if local execution is chosen or data are buffered
+ * at the coordinator or data are exhausted and node was reset.
+ * in last two cases connections are saved to cursor_connections and we
+ * can check their presence.
+ */
+ if (!node->local_exec && combiner->conn_count == 0 &&
+ combiner->cursor_count == 0)
+ return NULL;
+
+ if (log_remotesubplan_stats)
+ ResetUsageCommon(&start_r, &start_t);
+
+primary_mode_phase_two:
+ if (!node->bound)
+ {
+ int fetch = 0;
+ int paramlen = 0;
+ char *paramdata = NULL;
+ /*
+ * Conditions when we want to execute query on the primary node first:
+ * Coordinator running replicated ModifyTable on multiple nodes
+ */
+ bool primary_mode = combiner->probing_primary ||
+ (IS_PGXC_COORDINATOR &&
+ combiner->combine_type == COMBINE_TYPE_SAME &&
+ OidIsValid(primary_data_node) &&
+ combiner->conn_count > 1);
+ char cursor[NAMEDATALEN];
+
+ if (plan->cursor)
+ {
+ fetch = PGXLRemoteFetchSize;
+ if (plan->unique)
+ snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique);
+ else
+ strncpy(cursor, plan->cursor, NAMEDATALEN);
+ }
+ else
+ cursor[0] = '\0';
+
+ /*
+ * Send down all available parameters, if any is used by the plan
+ */
+ if (estate->es_param_list_info ||
+ !bms_is_empty(plan->scan.plan.allParam))
+ paramlen = encode_parameters(node->nParamRemote,
+ node->remoteparams,
+ &combiner->ss.ps,
+ ¶mdata);
+
+ /*
+ * The subplan being rescanned, need to restore connections and
+ * re-bind the portal
+ */
+ if (combiner->cursor)
+ {
+ int i;
+
+ /*
+ * On second phase of primary mode connections are properly set,
+ * so do not copy.
+ */
+ if (!combiner->probing_primary)
+ {
+ combiner->conn_count = combiner->cursor_count;
+ memcpy(combiner->connections, combiner->cursor_connections,
+ combiner->cursor_count * sizeof(PGXCNodeHandle *));
+ }
+
+ for (i = 0; i < combiner->conn_count; i++)
+ {
+ PGXCNodeHandle *conn = combiner->connections[i];
+
+ CHECK_OWNERSHIP(conn, combiner);
+
+ /* close previous cursor only on phase 1 */
+ if (!primary_mode || !combiner->probing_primary)
+ pgxc_node_send_close(conn, false, combiner->cursor);
+
+ /*
+ * If we now should probe primary, skip execution on non-primary
+ * nodes
+ */
+ if (primary_mode && !combiner->probing_primary &&
+ conn->nodeoid != primary_data_node)
+ continue;
+
+ /* rebind */
+ pgxc_node_send_bind(conn, combiner->cursor, combiner->cursor,
+ paramlen, paramdata);
+ /* execute */
+ pgxc_node_send_execute(conn, combiner->cursor, fetch);
+ /* submit */
+ if (pgxc_node_send_flush(conn))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+
+ /*
+ * There could be only one primary node, but can not leave the
+ * loop now, because we need to close cursors.
+ */
+ if (primary_mode && !combiner->probing_primary)
+ {
+ combiner->current_conn = i;
+ }
+ }
+ }
+ else if (node->execNodes)
+ {
+ CommandId cid;
+ int i;
+
+ /*
+ * There are prepared statement, connections should be already here
+ */
+ Assert(combiner->conn_count > 0);
+
+ combiner->extended_query = true;
+ cid = estate->es_snapshot->curcid;
+
+ for (i = 0; i < combiner->conn_count; i++)
+ {
+ PGXCNodeHandle *conn = combiner->connections[i];
+
+ CHECK_OWNERSHIP(conn, combiner);
+
+ /*
+ * If we now should probe primary, skip execution on non-primary
+ * nodes
+ */
+ if (primary_mode && !combiner->probing_primary &&
+ conn->nodeoid != primary_data_node)
+ continue;
+
+ /*
+ * Update Command Id. Other command may be executed after we
+ * prepare and advanced Command Id. We should use one that
+ * was active at the moment when command started.
+ */
+ if (pgxc_node_send_cmd_id(conn, cid))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command ID to data nodes")));
+ }
+
+ /*
+ * Resend the snapshot as well since the connection may have
+ * been buffered and use by other commands, with different
+ * snapshot. Set the snapshot back to what it was
+ */
+ if (pgxc_node_send_snapshot(conn, estate->es_snapshot))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send snapshot to data nodes")));
+ }
+
+ /* bind */
+ pgxc_node_send_bind(conn, cursor, cursor, paramlen, paramdata);
+ /* execute */
+ pgxc_node_send_execute(conn, cursor, fetch);
+ /* submit */
+ if (pgxc_node_send_flush(conn))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+
+ /*
+ * There could be only one primary node, so if we executed
+ * subquery on the phase one of primary mode we can leave the
+ * loop now.
+ */
+ if (primary_mode && !combiner->probing_primary)
+ {
+ combiner->current_conn = i;
+ break;
+ }
+ }
+
+ /*
+ * On second phase of primary mode connections are backed up
+ * already, so do not copy.
+ */
+ if (primary_mode)
+ {
+ if (combiner->probing_primary)
+ {
+ combiner->cursor = pstrdup(cursor);
+ }
+ else
+ {
+ combiner->cursor = pstrdup(cursor);
+ combiner->cursor_count = combiner->conn_count;
+ combiner->cursor_connections = (PGXCNodeHandle **) palloc(
+ combiner->conn_count * sizeof(PGXCNodeHandle *));
+ memcpy(combiner->cursor_connections, combiner->connections,
+ combiner->conn_count * sizeof(PGXCNodeHandle *));
+ }
+ }
+ else
+ {
+ combiner->cursor = pstrdup(cursor);
+ combiner->cursor_count = combiner->conn_count;
+ combiner->cursor_connections = (PGXCNodeHandle **) palloc(
+ combiner->conn_count * sizeof(PGXCNodeHandle *));
+ memcpy(combiner->cursor_connections, combiner->connections,
+ combiner->conn_count * sizeof(PGXCNodeHandle *));
+ }
+ }
+
+ if (combiner->merge_sort)
+ {
+ /*
+ * Requests are already made and sorter can fetch tuples to populate
+ * sort buffer.
+ */
+ combiner->tuplesortstate = tuplesort_begin_merge(
+ resultslot->tts_tupleDescriptor,
+ plan->sort->numCols,
+ plan->sort->sortColIdx,
+ plan->sort->sortOperators,
+ plan->sort->sortCollations,
+ plan->sort->nullsFirst,
+ combiner,
+ work_mem);
+ }
+ if (primary_mode)
+ {
+ if (combiner->probing_primary)
+ {
+ combiner->probing_primary = false;
+ node->bound = true;
+ }
+ else
+ combiner->probing_primary = true;
+ }
+ else
+ node->bound = true;
+ }
+
+ if (combiner->tuplesortstate)
+ {
+ if (tuplesort_gettupleslot((Tuplesortstate *) combiner->tuplesortstate,
++ true, true, resultslot, NULL))
+ {
+ if (log_remotesubplan_stats)
+ ShowUsageCommon("ExecRemoteSubplan", &start_r, &start_t);
+ return resultslot;
+ }
+ }
+ else
+ {
+ TupleTableSlot *slot = FetchTuple(combiner);
+ if (!TupIsNull(slot))
+ {
+ if (log_remotesubplan_stats)
+ ShowUsageCommon("ExecRemoteSubplan", &start_r, &start_t);
+ return slot;
+ }
+ else if (combiner->probing_primary)
+ /* phase1 is successfully completed, run on other nodes */
+ goto primary_mode_phase_two;
+ }
+ if (combiner->errorMessage)
+ pgxc_node_report_error(combiner);
+
+ if (log_remotesubplan_stats)
+ ShowUsageCommon("ExecRemoteSubplan", &start_r, &start_t);
+
+ return NULL;
+}
+
+
+void
+ExecReScanRemoteSubplan(RemoteSubplanState *node)
+{
+ ResponseCombiner *combiner = (ResponseCombiner *)node;
+
+ /*
+ * If we haven't queried remote nodes yet, just return. If outerplan'
+ * chgParam is not NULL then it will be re-scanned by ExecProcNode,
+ * else - no reason to re-scan it at all.
+ */
+ if (!node->bound)
+ return;
+
+ /*
+ * If we execute locally rescan local copy of the plan
+ */
+ if (outerPlanState(node))
+ ExecReScan(outerPlanState(node));
+
+ /*
+ * Consume any possible pending input
+ */
+ pgxc_connections_cleanup(combiner);
+
+ /* misc cleanup */
+ combiner->command_complete_count = 0;
+ combiner->description_count = 0;
+
+ /*
+ * Force query is re-bound with new parameters
+ */
+ node->bound = false;
+}
+
+
/*
 * ExecEndRemoteSubplan
 *		Shut down a RemoteSubplan node: end any local subplan, drain pending
 *		input, update coordinator statistics, close remote portals and
 *		prepared statements, and release the combiner.
 *
 * The teardown protocol is order-sensitive: Close messages are sent for
 * open portals first (cursor_connections), then Close+Sync for the prepared
 * statements, and finally responses are consumed until every connection
 * reports ReadyForQuery.
 */
void
ExecEndRemoteSubplan(RemoteSubplanState *node)
{
	ResponseCombiner *combiner = (ResponseCombiner *)node;
	RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
	int			i;
	struct rusage start_r;
	struct timeval start_t;

	if (log_remotesubplan_stats)
		ResetUsageCommon(&start_r, &start_t);

	/* End the locally initialized copy of the subplan, if any */
	if (outerPlanState(node))
		ExecEndNode(outerPlanState(node));
	if (node->locator)
		freeLocator(node->locator);

	/*
	 * Consume any possible pending input
	 */
	if (node->bound)
		pgxc_connections_cleanup(combiner);

	/*
	 * Update coordinator statistics
	 */
	if (IS_PGXC_COORDINATOR)
	{
		EState	   *estate = combiner->ss.ps.state;

		if (estate->es_num_result_relations > 0 && estate->es_processed > 0)
		{
			switch (estate->es_plannedstmt->commandType)
			{
				case CMD_INSERT:
					/* One statement can insert into only one relation */
					pgstat_count_remote_insert(
							estate->es_result_relations[0].ri_RelationDesc,
							estate->es_processed);
					break;
				case CMD_UPDATE:
				case CMD_DELETE:
					{
						/*
						 * We can not determine here how many rows were updated
						 * or deleted in each table, so assume the same number
						 * of affected rows in each table.
						 * If the resulting number of rows is 0 because of
						 * rounding, increment each counter at least by 1.
						 */
						int			i;
						int			n;
						bool		update;

						update = (estate->es_plannedstmt->commandType == CMD_UPDATE);
						n = estate->es_processed / estate->es_num_result_relations;
						if (n == 0)
							n = 1;
						for (i = 0; i < estate->es_num_result_relations; i++)
						{
							Relation	r;

							r = estate->es_result_relations[i].ri_RelationDesc;
							if (update)
								pgstat_count_remote_update(r, n);
							else
								pgstat_count_remote_delete(r, n);
						}
					}
					break;
				default:
					/* nothing to count */
					break;
			}
		}
	}

	/*
	 * Close portals. While cursor_connections exist there are open portals.
	 */
	if (combiner->cursor)
	{
		/* Restore connections where there are active statements */
		combiner->conn_count = combiner->cursor_count;
		memcpy(combiner->connections, combiner->cursor_connections,
			   combiner->cursor_count * sizeof(PGXCNodeHandle *));
		for (i = 0; i < combiner->cursor_count; i++)
		{
			PGXCNodeHandle *conn;

			conn = combiner->cursor_connections[i];

			CHECK_OWNERSHIP(conn, combiner);

			/* 'false' closes a portal rather than a statement */
			if (pgxc_node_send_close(conn, false, combiner->cursor) != 0)
				ereport(ERROR,
						(errcode(ERRCODE_INTERNAL_ERROR),
						 errmsg("Failed to close data node cursor")));
		}
		/* The cursor stuff is not needed */
		combiner->cursor = NULL;
		combiner->cursor_count = 0;
		pfree(combiner->cursor_connections);
		combiner->cursor_connections = NULL;
	}

	/* Close statements, even if they never were bound */
	for (i = 0; i < combiner->conn_count; i++)
	{
		PGXCNodeHandle *conn;
		char		cursor[NAMEDATALEN];

		if (plan->cursor)
		{
			if (plan->unique)
				snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique);
			else
				strncpy(cursor, plan->cursor, NAMEDATALEN);
		}
		else
			cursor[0] = '\0';

		conn = combiner->connections[i];

		CHECK_OWNERSHIP(conn, combiner);

		/* 'true' closes the prepared statement itself */
		if (pgxc_node_send_close(conn, true, cursor) != 0)
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to close data node statement")));
		/* Send SYNC and wait for ReadyForQuery */
		if (pgxc_node_send_sync(conn) != 0)
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to synchronize data node")));
		/*
		 * Formally connection is not in QUERY state, we set the state to read
		 * CloseDone and ReadyForQuery responses. Upon receiving ReadyForQuery
		 * state will be changed back to IDLE and conn->coordinator will be
		 * cleared.
		 */
		PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_CLOSE);
	}

	/* Consume responses until every connection reports ReadyForQuery */
	while (combiner->conn_count > 0)
	{
		if (pgxc_node_receive(combiner->conn_count,
							  combiner->connections, NULL))
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to close remote subplan")));
		i = 0;
		while (i < combiner->conn_count)
		{
			int			res = handle_response(combiner->connections[i], combiner);

			if (res == RESPONSE_EOF)
			{
				i++;
			}
			else if (res == RESPONSE_READY)
			{
				/* Done, connection is ready for query; compact the array */
				if (--combiner->conn_count > i)
					combiner->connections[i] =
							combiner->connections[combiner->conn_count];
			}
			else if (res == RESPONSE_DATAROW)
			{
				/*
				 * If we are finishing slowly running remote subplan while it
				 * is still working (because of Limit, for example) it may
				 * produce one or more tuples between connection cleanup and
				 * handling Close command. One tuple does not cause any problem,
				 * but if it will not be read the next tuple will trigger
				 * assertion failure. So if we got a tuple, just read and
				 * discard it here.
				 */
				pfree(combiner->currentRow);
				combiner->currentRow = NULL;
			}
			/* Ignore other possible responses */
		}
	}

	ValidateAndCloseCombiner(combiner);
	pfree(node);

	if (log_remotesubplan_stats)
		ShowUsageCommon("ExecEndRemoteSubplan", &start_r, &start_t);
}
+
+/*
+ * pgxc_node_report_error
+ * Throw error from Datanode if any.
+ *
+ * Re-raises on this node the error collected from a remote node by the
+ * combiner.  The five-character SQLSTATE received from the remote node is
+ * preserved via MAKE_SQLSTATE; errdetail/errhint are forwarded only when
+ * they were actually received (four combinations below).
+ */
+static void
+pgxc_node_report_error(ResponseCombiner *combiner)
+{
+	/* If no combiner, nothing to do */
+	if (!combiner)
+		return;
+	if (combiner->errorMessage)
+	{
+		char *code = combiner->errorCode;
+		/* Neither detail nor hint received */
+		if ((combiner->errorDetail == NULL) && (combiner->errorHint == NULL))
+			ereport(ERROR,
+					(errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+					 errmsg("%s", combiner->errorMessage)));
+		/* Both detail and hint received */
+		else if ((combiner->errorDetail != NULL) && (combiner->errorHint != NULL))
+			ereport(ERROR,
+					(errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+					 errmsg("%s", combiner->errorMessage),
+					 errdetail("%s", combiner->errorDetail),
+					 errhint("%s", combiner->errorHint)));
+		/* Only detail received */
+		else if (combiner->errorDetail != NULL)
+			ereport(ERROR,
+					(errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+					 errmsg("%s", combiner->errorMessage),
+					 errdetail("%s", combiner->errorDetail)));
+		/* Only hint received */
+		else
+			ereport(ERROR,
+					(errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+					 errmsg("%s", combiner->errorMessage),
+					 errhint("%s", combiner->errorHint)));
+	}
+}
+
+
+/*
+ * get_success_nodes:
+ * Currently called to print a user-friendly message about
+ * which nodes the query failed on.
+ * Gets all the nodes where no 'E' (error) messages were received; i.e. where
+ * the query ran successfully.  For each failed node a "coordinator#N" /
+ * "datanode#N" entry is appended to 'failednodes'.  Returns NULL when no
+ * node succeeded.
+ */
+static ExecNodes *
+get_success_nodes(int node_count, PGXCNodeHandle **handles, char node_type, StringInfo failednodes)
+{
+	ExecNodes *success_nodes = NULL;
+	int i;
+
+	for (i = 0; i < node_count; i++)
+	{
+		PGXCNodeHandle *handle = handles[i];
+		/* Translate the node Oid into its index within this node type */
+		int nodenum = PGXCNodeGetNodeId(handle->nodeoid, &node_type);
+
+		if (!handle->error)
+		{
+			/* Allocate the result lazily, on the first successful node */
+			if (!success_nodes)
+				success_nodes = makeNode(ExecNodes);
+			success_nodes->nodeList = lappend_int(success_nodes->nodeList, nodenum);
+		}
+		else
+		{
+			if (failednodes->len == 0)
+				appendStringInfo(failednodes, "Error message received from nodes:");
+			/* Node numbers are reported 1-based to the user */
+			appendStringInfo(failednodes, " %s#%d",
+				(node_type == PGXC_NODE_COORDINATOR ? "coordinator" : "datanode"),
+				nodenum + 1);
+		}
+	}
+	return success_nodes;
+}
+
<br>
+/*
+ * pgxc_all_success_nodes: Uses get_success_nodes() to collect the
+ * user-friendly message from coordinator as well as datanode.
+ *
+ * On return, *d_nodes / *c_nodes hold the successful datanodes and
+ * coordinators (possibly NULL), and *failednodes_msg is either NULL or a
+ * palloc'd message listing the failed nodes (caller keeps ownership).
+ */
+void
+pgxc_all_success_nodes(ExecNodes **d_nodes, ExecNodes **c_nodes, char **failednodes_msg)
+{
+	/* Acquire handles for every coordinator and datanode in the cluster */
+	PGXCNodeAllHandles *connections = get_exec_connections(NULL, NULL, EXEC_ON_ALL_NODES, true);
+	StringInfoData failednodes;
+	initStringInfo(&failednodes);
+
+	*d_nodes = get_success_nodes(connections->dn_conn_count,
+								 connections->datanode_handles,
+								 PGXC_NODE_DATANODE,
+								 &failednodes);
+
+	*c_nodes = get_success_nodes(connections->co_conn_count,
+								 connections->coord_handles,
+								 PGXC_NODE_COORDINATOR,
+								 &failednodes);
+
+	if (failednodes.len == 0)
+		*failednodes_msg = NULL;
+	else
+		*failednodes_msg = failednodes.data;
+
+	pfree_pgxc_all_handles(connections);
+}
+
+
+/*
+ * set_dbcleanup_callback:
+ * Register a callback function which does some non-critical cleanup tasks
+ * on xact success or abort, such as tablespace/database directory cleanup.
+ *
+ * The paraminfo blob is copied into TopMemoryContext so it survives until
+ * the transaction-end callback fires.  Only one callback is tracked:
+ * a later call overwrites the earlier registration.
+ * NOTE(review): a previously registered fparams is overwritten here
+ * without being pfree'd -- confirm whether that leak is intentional.
+ */
+void set_dbcleanup_callback(xact_callback function, void *paraminfo, int paraminfo_size)
+{
+	void *fparams;
+
+	/* Copy the parameters into long-lived memory */
+	fparams = MemoryContextAlloc(TopMemoryContext, paraminfo_size);
+	memcpy(fparams, paraminfo, paraminfo_size);
+
+	dbcleanup_info.function = function;
+	dbcleanup_info.fparams = fparams;
+}
+
+/*
+ * AtEOXact_DBCleanup: To be called at post-commit or pre-abort.
+ * Calls the cleanup function registered during this transaction, if any,
+ * passing it whether the transaction committed, then unconditionally
+ * clears the registration and frees the copied parameters.
+ */
+void AtEOXact_DBCleanup(bool isCommit)
+{
+	if (dbcleanup_info.function)
+		(*dbcleanup_info.function)(isCommit, dbcleanup_info.fparams);
+
+	/*
+	 * Just reset the callbackinfo. We anyway don't want this to be called again,
+	 * until explicitly set.
+	 */
+	dbcleanup_info.function = NULL;
+	if (dbcleanup_info.fparams)
+	{
+		pfree(dbcleanup_info.fparams);
+		dbcleanup_info.fparams = NULL;
+	}
+}
+
+/*
+ * GetImplicit2PCGID
+ * Build the GID used for an implicit two-phase commit.  The GID encodes,
+ * in order: the supplied prefix, top transaction id, local node name, a
+ * T/F flag for a local write, the datanode count, the coordinator count
+ * (counting the local node when it wrote), then the participating node
+ * ids.  The string is allocated in TopTransactionContext so it remains
+ * valid until end of transaction.
+ */
+char *
+GetImplicit2PCGID(const char *implicit2PC_head, bool localWrite)
+{
+	int dnCount = 0, coordCount = 0;
+	int dnNodeIds[MaxDataNodes];
+	int coordNodeIds[MaxCoords];
+	/* NOTE(review): initializer is redundant -- overwritten just below */
+	MemoryContext oldContext = CurrentMemoryContext;
+	StringInfoData str;
+	int i;
+
+	oldContext = MemoryContextSwitchTo(TopTransactionContext);
+	initStringInfo(&str);
+	/*
+	 * Check how many coordinators and datanodes are involved in this
+	 * transaction
+	 */
+	pgxc_node_remote_count(&dnCount, dnNodeIds, &coordCount, coordNodeIds);
+	appendStringInfo(&str, "%s%u:%s:%c:%d:%d",
+			implicit2PC_head,
+			GetTopTransactionId(),
+			PGXCNodeName,
+			localWrite ? 'T' : 'F',
+			dnCount,
+			coordCount + (localWrite ? 1 : 0));
+
+	/* Append the ids of all participating datanodes, then coordinators */
+	for (i = 0; i < dnCount; i++)
+		appendStringInfo(&str, ":%d", dnNodeIds[i]);
+	for (i = 0; i < coordCount; i++)
+		appendStringInfo(&str, ":%d", coordNodeIds[i]);
+
+	/* The local node participates too when it performed a write */
+	if (localWrite)
+		appendStringInfo(&str, ":%d", PGXCNodeIdentifier);
+
+	MemoryContextSwitchTo(oldContext);
+
+	return str.data;
+}
--- /dev/null
- PG_RETURN_NAME(PGXCNodeName);
+/*-------------------------------------------------------------------------
+ *
+ * pgxcnode.c
+ *
+ * Functions for the Coordinator communicating with the PGXC nodes:
+ * Datanodes and Coordinators
+ *
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include <poll.h>
+
+#ifdef __sun
+#include <sys/filio.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include "access/gtm.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/htup_details.h"
+#include "catalog/pg_type.h"
+#include "commands/prepare.h"
+#include "gtm/gtm_c.h"
+#include "nodes/nodes.h"
+#include "pgxc/pgxcnode.h"
+#include "pgxc/execRemote.h"
+#include "catalog/pgxc_node.h"
+#include "catalog/pg_collation.h"
+#include "pgxc/locator.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/poolmgr.h"
+#include "tcop/dest.h"
+#include "storage/lwlock.h"
+#include "utils/builtins.h"
+#include "utils/elog.h"
+#include "utils/memutils.h"
+#include "utils/fmgroids.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/formatting.h"
+#include "utils/tqual.h"
+#include "../interfaces/libpq/libpq-fe.h"
+#ifdef XCP
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "pgxc/pause.h"
+#include "utils/snapmgr.h"
+#endif
+
+#define CMD_ID_MSG_LEN 8
+
+/* Number of connections held */
+static int datanode_count = 0;
+static int coord_count = 0;
+
+/*
+ * Datanode handles saved in Transaction memory context
+ * when PostgresMain is launched.
+ * Those handles are used inside a transaction by Coordinator to Datanodes.
+ */
+static PGXCNodeHandle *dn_handles = NULL;
+
+/*
+ * Coordinator handles saved in Transaction memory context
+ * when PostgresMain is launched.
+ * Those handles are used inside a transaction by Coordinator to Coordinators
+ */
+static PGXCNodeHandle *co_handles = NULL;
+
+/* Current size of dn_handles and co_handles */
+int NumDataNodes;
+int NumCoords;
+
+
+#ifdef XCP
+/* Set when the cached node handles must be invalidated or refreshed */
+volatile bool HandlesInvalidatePending = false;
+volatile bool HandlesRefreshPending = false;
+
+/*
+ * Session and transaction parameters need to be set on newly connected
+ * remote nodes.
+ */
+static List *session_param_list = NIL;
+static List *local_param_list = NIL;
+static StringInfo session_params;
+static StringInfo local_params;
+
+/* One tracked SET parameter: name, value, and flags */
+typedef struct
+{
+	NameData name;
+	NameData value;
+	int flags;
+} ParamEntry;
+
+
+static bool DoInvalidateRemoteHandles(void);
+static bool DoRefreshRemoteHandles(void);
+#endif
+
+#ifdef XCP
+static void pgxc_node_init(PGXCNodeHandle *handle, int sock,
+		bool global_session, int pid);
+#else
+static void pgxc_node_init(PGXCNodeHandle *handle, int sock);
+#endif
+static void pgxc_node_free(PGXCNodeHandle *handle);
+static void pgxc_node_all_free(void);
+
+static int get_int(PGXCNodeHandle * conn, size_t len, int *out);
+static int get_char(PGXCNodeHandle * conn, char *out);
+
+
+/*
+ * Initialize PGXCNodeHandle struct
+ *
+ * Allocates the 16KB input and output buffers and resets all cursor and
+ * state fields.  The socket is set to NO_SOCKET to mark the handle as not
+ * yet connected.
+ */
+static void
+init_pgxc_handle(PGXCNodeHandle *pgxc_handle)
+{
+	/*
+	 * Socket descriptor is small non-negative integer,
+	 * Indicate the handle is not initialized yet
+	 */
+	pgxc_handle->sock = NO_SOCKET;
+
+	/* Initialise buffers */
+	pgxc_handle->error = NULL;
+	pgxc_handle->outSize = 16 * 1024;
+	pgxc_handle->outBuffer = (char *) palloc(pgxc_handle->outSize);
+	pgxc_handle->inSize = 16 * 1024;
+
+	pgxc_handle->inBuffer = (char *) palloc(pgxc_handle->inSize);
+	pgxc_handle->combiner = NULL;
+	pgxc_handle->inStart = 0;
+	pgxc_handle->inEnd = 0;
+	pgxc_handle->inCursor = 0;
+	pgxc_handle->outEnd = 0;
+	pgxc_handle->needSync = false;
+
+	/*
+	 * NOTE(review): palloc ereports on OOM rather than returning NULL, so
+	 * this check appears unreachable -- kept as defensive belt-and-braces.
+	 */
+	if (pgxc_handle->outBuffer == NULL || pgxc_handle->inBuffer == NULL)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	}
+}
+
+
+/*
+ * Allocate and initialize memory to store Datanode and Coordinator handles.
+ *
+ * Handle arrays live in TopMemoryContext so they survive for the whole
+ * session.  When is_force is true any existing handles are freed first;
+ * otherwise repeated calls (possible because of sigjmp) return early once
+ * both arrays exist.  Also determines PGXCNodeId, this node's 1-based
+ * position among nodes of its own type.
+ */
+void
+InitMultinodeExecutor(bool is_force)
+{
+	int count;
+	Oid *coOids, *dnOids;
+#ifdef XCP
+	MemoryContext oldcontext;
+#endif
+
+
+	/* Free all the existing information first */
+	if (is_force)
+		pgxc_node_all_free();
+
+	/* This function could get called multiple times because of sigjmp */
+	if (dn_handles != NULL &&
+		co_handles != NULL)
+		return;
+
+	/* Update node table in the shared memory */
+	PgxcNodeListAndCount();
+
+	/* Get classified list of node Oids */
+	PgxcNodeGetOids(&coOids, &dnOids, &NumCoords, &NumDataNodes, true);
+
+#ifdef XCP
+	/*
+	 * Coordinator and datanode handles should be available during all the
+	 * session lifetime
+	 */
+	oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+#endif
+
+	/* Do proper initialization of handles */
+	if (NumDataNodes > 0)
+		dn_handles = (PGXCNodeHandle *)
+			palloc(NumDataNodes * sizeof(PGXCNodeHandle));
+	if (NumCoords > 0)
+		co_handles = (PGXCNodeHandle *)
+			palloc(NumCoords * sizeof(PGXCNodeHandle));
+
+	if ((!dn_handles && NumDataNodes > 0) ||
+		(!co_handles && NumCoords > 0))
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory for node handles")));
+
+	/* Initialize new empty slots */
+	for (count = 0; count < NumDataNodes; count++)
+	{
+		init_pgxc_handle(&dn_handles[count]);
+		dn_handles[count].nodeoid = dnOids[count];
+		dn_handles[count].nodeid = get_pgxc_node_id(dnOids[count]);
+		/* NOTE(review): strncpy does not NUL-terminate on truncation */
+		strncpy(dn_handles[count].nodename, get_pgxc_nodename(dnOids[count]),
+				NAMEDATALEN);
+		strncpy(dn_handles[count].nodehost, get_pgxc_nodehost(dnOids[count]),
+				NAMEDATALEN);
+		dn_handles[count].nodeport = get_pgxc_nodeport(dnOids[count]);
+	}
+	for (count = 0; count < NumCoords; count++)
+	{
+		init_pgxc_handle(&co_handles[count]);
+		co_handles[count].nodeoid = coOids[count];
+		co_handles[count].nodeid = get_pgxc_node_id(coOids[count]);
+		strncpy(co_handles[count].nodename, get_pgxc_nodename(coOids[count]),
+				NAMEDATALEN);
+		strncpy(co_handles[count].nodehost, get_pgxc_nodehost(coOids[count]),
+				NAMEDATALEN);
+		co_handles[count].nodeport = get_pgxc_nodeport(coOids[count]);
+	}
+
+	datanode_count = 0;
+	coord_count = 0;
+	PGXCNodeId = 0;
+
+	/*
+	 * NOTE(review): oldcontext is declared only under #ifdef XCP but is
+	 * restored unconditionally here -- this would not compile with XCP
+	 * undefined.  Confirm XCP is always defined in this tree.
+	 */
+	MemoryContextSwitchTo(oldcontext);
+
+	/* Find this node's own 1-based index among nodes of its type */
+	if (IS_PGXC_COORDINATOR)
+	{
+		for (count = 0; count < NumCoords; count++)
+		{
+			if (pg_strcasecmp(PGXCNodeName,
+					get_pgxc_nodename(co_handles[count].nodeoid)) == 0)
+				PGXCNodeId = count + 1;
+		}
+	}
+	else /* DataNode */
+	{
+		for (count = 0; count < NumDataNodes; count++)
+		{
+			if (pg_strcasecmp(PGXCNodeName,
+					get_pgxc_nodename(dn_handles[count].nodeoid)) == 0)
+				PGXCNodeId = count + 1;
+		}
+	}
+}
+
+/*
+ * Builds up a connection string
+ *
+ * Returns a palloc'd libpq conninfo string, or NULL if the parameters do
+ * not fit the 1024-byte work buffer.  remote_type and parent_node are
+ * passed to the remote server through the startup options; parent_node is
+ * also embedded in the application_name.
+ */
+char *
+PGXCNodeConnStr(char *host, int port, char *dbname,
+		char *user, char *pgoptions, char *remote_type, char *parent_node)
+{
+	char *out,
+		connstr[1024];
+	int num;
+
+	/*
+	 * Build up connection string
+	 * remote type can be Coordinator, Datanode or application.
+	 */
+	num = snprintf(connstr, sizeof(connstr),
+			"host=%s port=%d dbname=%s user=%s application_name='pgxc:%s' sslmode=disable options='-c remotetype=%s -c parentnode=%s %s'",
+			host, port, dbname, user, parent_node, remote_type, parent_node,
+			pgoptions);
+
+	/* Check for overflow */
+	if (num > 0 && num < sizeof(connstr))
+	{
+		/* Output result */
+		out = (char *) palloc(num + 1);
+		strcpy(out, connstr);
+		return out;
+	}
+
+	/* return NULL if we have problem */
+	return NULL;
+}
+
+
+/*
+ * Connect to a Datanode using a connection string
+ *
+ * Thin wrapper around PQconnectdb; returns the libpq connection cast to
+ * the opaque NODE_CONNECTION type.  Caller must check connection status.
+ */
+NODE_CONNECTION *
+PGXCNodeConnect(char *connstr)
+{
+	PGconn *conn;
+
+	/* Delegate call to the pglib */
+	conn = PQconnectdb(connstr);
+	return (NODE_CONNECTION *) conn;
+}
+
+/*
+ * Ping the node identified by the connection string.
+ * Returns 0 when the server answers PQPING_OK, 1 for any other ping
+ * result, and -1 when the connection string is empty.
+ */
+int PGXCNodePing(const char *connstr)
+{
+	if (connstr[0])
+	{
+		PGPing status = PQping(connstr);
+		if (status == PQPING_OK)
+			return 0;
+		else
+			return 1;
+	}
+	else
+		return -1;
+}
+
+/*
+ * Close specified connection and release its libpq resources.
+ */
+void
+PGXCNodeClose(NODE_CONNECTION *conn)
+{
+	/* Delegate call to the pglib */
+	PQfinish((PGconn *) conn);
+}
+
+/*
+ * Checks if connection active.
+ * Returns non-zero when the libpq connection exists and reports
+ * CONNECTION_OK.
+ */
+int
+PGXCNodeConnected(NODE_CONNECTION *conn)
+{
+	/* Delegate call to the pglib */
+	PGconn *pgconn = (PGconn *) conn;
+
+	/*
+	 * Simple check, want to do more comprehensive -
+	 * check if it is ready for query
+	 */
+	return pgconn && PQstatus(pgconn) == CONNECTION_OK;
+}
+
+
+
+/* Close the socket handle (this process' copy) and free occupied memory
+ *
+ * Note that we do not free the handle and its members. This will be
+ * taken care of when the transaction ends, when TopTransactionContext
+ * is destroyed in xact.c.
+ */
+static void
+pgxc_node_free(PGXCNodeHandle *handle)
+{
+	/* Only close a socket that was actually open */
+	if (handle->sock != NO_SOCKET)
+		close(handle->sock);
+	handle->sock = NO_SOCKET;
+}
+
+/*
+ * Free all the node handles cached
+ *
+ * Closes every coordinator and datanode socket, pfrees both handle
+ * arrays, and clears the pending invalidate/refresh flags.
+ */
+static void
+pgxc_node_all_free(void)
+{
+	int i, j;
+
+	/* Iteration 0 covers coordinators, iteration 1 covers datanodes */
+	for (i = 0; i < 2; i++)
+	{
+		int num_nodes = 0;
+		PGXCNodeHandle *array_handles;
+
+		switch (i)
+		{
+			case 0:
+				num_nodes = NumCoords;
+				array_handles = co_handles;
+				break;
+			case 1:
+				num_nodes = NumDataNodes;
+				array_handles = dn_handles;
+				break;
+			default:
+				/* NOTE(review): Assert is a no-op in release builds and
+				 * array_handles would stay uninitialized -- unreachable
+				 * with the loop bound above, but fragile. */
+				Assert(0);
+		}
+
+		for (j = 0; j < num_nodes; j++)
+		{
+			PGXCNodeHandle *handle = &array_handles[j];
+			pgxc_node_free(handle);
+		}
+		if (array_handles)
+			pfree(array_handles);
+	}
+
+	co_handles = NULL;
+	dn_handles = NULL;
+	HandlesInvalidatePending = false;
+	HandlesRefreshPending = false;
+}
+
+/*
+ * Create and initialise internal structure to communicate to
+ * Datanode via supplied socket descriptor.
+ * Structure stores state info and I/O buffers
+ *
+ * 'pid' is the backend pid on the remote node; transaction status starts
+ * as 'I' (idle).  When global_session is set, the accumulated session
+ * parameters (if any) are replayed on the new connection.
+ */
+static void
+pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid)
+{
+	char *init_str;
+
+	handle->sock = sock;
+	handle->backend_pid = pid;
+	handle->transaction_status = 'I';
+	PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_IDLE);
+	handle->read_only = true;
+	handle->ck_resp_rollback = false;
+	handle->combiner = NULL;
+#ifdef DN_CONNECTION_DEBUG
+	handle->have_row_desc = false;
+#endif
+	handle->error = NULL;
+	/* Reset all buffer cursors for the fresh connection */
+	handle->outEnd = 0;
+	handle->inStart = 0;
+	handle->inEnd = 0;
+	handle->inCursor = 0;
+	handle->needSync = false;
+	/*
+	 * We got a new connection, set on the remote node the session parameters
+	 * if defined. The transaction parameter should be sent after BEGIN
+	 */
+	if (global_session)
+	{
+		init_str = PGXCNodeGetSessionParamStr();
+		if (init_str)
+		{
+			pgxc_node_set_query(handle, init_str);
+		}
+	}
+}
+
+
+/*
+ * Wait while at least one of specified connections has data available and read
+ * the data into the buffer
+ *
+ * Connections that are idle or already have a buffered message are skipped.
+ * Returns ERROR_OCCURED (true) on failure, NO_ERROR_OCCURED (false)
+ * otherwise.  timeout == NULL means wait indefinitely.
+ */
+bool
+pgxc_node_receive(const int conn_count,
+		PGXCNodeHandle ** connections, struct timeval * timeout)
+{
+#define ERROR_OCCURED true
+#define NO_ERROR_OCCURED false
+	int i,
+		sockets_to_poll,
+		poll_val;
+	bool is_msg_buffered;
+	long timeout_ms;
+	struct pollfd pool_fd[conn_count];
+
+	/* sockets to be polled index */
+	sockets_to_poll = 0;
+
+	/* Check whether any connection already has a complete buffered message */
+	is_msg_buffered = false;
+	for (i = 0; i < conn_count; i++)
+	{
+		/* If connection has a buffered message */
+		if (HAS_MESSAGE_BUFFERED(connections[i]))
+		{
+			is_msg_buffered = true;
+			break;
+		}
+	}
+
+	for (i = 0; i < conn_count; i++)
+	{
+		/* If connection finished sending do not wait input from it */
+		if (connections[i]->state == DN_CONNECTION_STATE_IDLE || HAS_MESSAGE_BUFFERED(connections[i]))
+		{
+			pool_fd[i].fd = -1;
+			pool_fd[i].events = 0;
+			continue;
+		}
+
+		/* prepare select params */
+		if (connections[i]->sock > 0)
+		{
+			pool_fd[i].fd = connections[i]->sock;
+			pool_fd[i].events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND;
+			sockets_to_poll++;
+		}
+		else
+		{
+			/* flag as bad, it will be removed from the list */
+			PGXCNodeSetConnectionState(connections[i],
+					DN_CONNECTION_STATE_ERROR_FATAL);
+			pool_fd[i].fd = -1;
+			pool_fd[i].events = 0;
+		}
+	}
+
+	/*
+	 * Return if we do not have connections to receive input
+	 */
+	if (sockets_to_poll == 0)
+	{
+		if (is_msg_buffered)
+			return NO_ERROR_OCCURED;
+		return ERROR_OCCURED;
+	}
+
+	/* do conversion from the select behaviour */
+	if ( timeout == NULL )
+		timeout_ms = -1;
+	else
+		timeout_ms = (timeout->tv_sec * (uint64_t) 1000) + (timeout->tv_usec / 1000);
+
+retry:
+	CHECK_FOR_INTERRUPTS();
+	poll_val = poll(pool_fd, conn_count, timeout_ms);
+	if (poll_val < 0)
+	{
+		/* error - retry if EINTR */
+		if (errno == EINTR || errno == EAGAIN)
+			goto retry;
+
+		elog(WARNING, "poll() error: %d", errno);
+		if (errno)
+			return ERROR_OCCURED;
+		return NO_ERROR_OCCURED;
+	}
+
+	if (poll_val == 0)
+	{
+		/* Handle timeout */
+		elog(DEBUG1, "timeout %ld while waiting for any response from %d connections", timeout_ms,conn_count);
+		/*
+		 * NOTE(review): on timeout every connection is marked FATAL yet the
+		 * function still returns NO_ERROR_OCCURED -- confirm callers expect
+		 * that combination.
+		 */
+		for (i = 0; i < conn_count; i++)
+			PGXCNodeSetConnectionState(connections[i],
+					DN_CONNECTION_STATE_ERROR_FATAL);
+		return NO_ERROR_OCCURED;
+	}
+
+	/* read data */
+	for (i = 0; i < conn_count; i++)
+	{
+		PGXCNodeHandle *conn = connections[i];
+
+		/* This connection was excluded from polling above */
+		if( pool_fd[i].fd == -1 )
+			continue;
+
+		if ( pool_fd[i].fd == conn->sock )
+		{
+			if( pool_fd[i].revents & POLLIN )
+			{
+				int read_status = pgxc_node_read_data(conn, true);
+				if ( read_status == EOF || read_status < 0 )
+				{
+					/* Can not read - no more actions, just discard connection */
+					PGXCNodeSetConnectionState(conn,
+							DN_CONNECTION_STATE_ERROR_FATAL);
+					add_error_message(conn, "unexpected EOF on datanode connection.");
+					elog(WARNING, "unexpected EOF on datanode oid connection: %d", conn->nodeoid);
+
+					/*
+					 * before returning, also update the shared health
+					 * status field to indicate that this node could be
+					 * possibly unavailable.
+					 *
+					 * Note that this error could be due to a stale handle
+					 * and it's possible that another backend might have
+					 * already updated the health status OR the node
+					 * might have already come back since the last disruption
+					 */
+					PoolPingNodeRecheck(conn->nodeoid);
+
+					/* Should we read from the other connections before returning? */
+					return ERROR_OCCURED;
+				}
+
+			}
+			else if (
+					(pool_fd[i].revents & POLLERR) ||
+					(pool_fd[i].revents & POLLHUP) ||
+					(pool_fd[i].revents & POLLNVAL)
+					)
+			{
+				PGXCNodeSetConnectionState(connections[i],
+						DN_CONNECTION_STATE_ERROR_FATAL);
+				add_error_message(conn, "unexpected network error on datanode connection");
+				elog(WARNING, "unexpected EOF on datanode oid connection: %d with event %d", conn->nodeoid,pool_fd[i].revents);
+				/* Should we check/read from the other connections before returning? */
+				return ERROR_OCCURED;
+			}
+		}
+	}
+	return NO_ERROR_OCCURED;
+}
+
+/*
+ * Is there any data enqueued in the TCP input buffer, sent by the PGXC
+ * node connection and waiting to be read?
+ * Returns the number of bytes pending per FIONREAD, or 0 on a bad socket
+ * or ioctl failure.
+ */
+
+int
+pgxc_node_is_data_enqueued(PGXCNodeHandle *conn)
+{
+	int ret;
+	int enqueued;
+
+	if (conn->sock < 0)
+		return 0;
+	ret = ioctl(conn->sock, FIONREAD, &enqueued);
+	if (ret != 0)
+		return 0;
+
+	return enqueued;
+}
+
+/*
+ * Read up incoming messages from the PGXC node connection
+ *
+ * Compacts the input buffer, enlarges it if nearly full, then recv()s as
+ * much as available.  Returns 1 when data was read, 0 when nothing was
+ * available (would block), EOF on closed connection / bad socket, and -1
+ * on other errors.  When close_if_error is set, errors are recorded on
+ * the handle (and on ECONNRESET the socket is closed).
+ */
+int
+pgxc_node_read_data(PGXCNodeHandle *conn, bool close_if_error)
+{
+	int someread = 0;
+	int nread;
+
+	if (conn->sock < 0)
+	{
+		if (close_if_error)
+			add_error_message(conn, "bad socket");
+		return EOF;
+	}
+
+	/* Left-justify any data in the buffer to make room */
+	if (conn->inStart < conn->inEnd)
+	{
+		if (conn->inStart > 0)
+		{
+			memmove(conn->inBuffer, conn->inBuffer + conn->inStart,
+					conn->inEnd - conn->inStart);
+			conn->inEnd -= conn->inStart;
+			conn->inCursor -= conn->inStart;
+			conn->inStart = 0;
+		}
+	}
+	else
+	{
+		/* buffer is logically empty, reset it */
+		conn->inStart = conn->inCursor = conn->inEnd = 0;
+	}
+
+	/*
+	 * If the buffer is fairly full, enlarge it. We need to be able to enlarge
+	 * the buffer in case a single message exceeds the initial buffer size. We
+	 * enlarge before filling the buffer entirely so as to avoid asking the
+	 * kernel for a partial packet. The magic constant here should be large
+	 * enough for a TCP packet or Unix pipe bufferload. 8K is the usual pipe
+	 * buffer size, so...
+	 */
+	if (conn->inSize - conn->inEnd < 8192)
+	{
+		if (ensure_in_buffer_capacity(conn->inEnd + (size_t) 8192, conn) != 0)
+		{
+			/*
+			 * We don't insist that the enlarge worked, but we need some room
+			 */
+			if (conn->inSize - conn->inEnd < 100)
+			{
+				if (close_if_error)
+					add_error_message(conn, "can not allocate buffer");
+				return -1;
+			}
+		}
+	}
+
+retry:
+	nread = recv(conn->sock, conn->inBuffer + conn->inEnd,
+			conn->inSize - conn->inEnd, 0);
+
+	if (nread < 0)
+	{
+		if (errno == EINTR)
+			goto retry;
+		/* Some systems return EAGAIN/EWOULDBLOCK for no data */
+#ifdef EAGAIN
+		if (errno == EAGAIN)
+			return someread;
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+		if (errno == EWOULDBLOCK)
+			return someread;
+#endif
+		/* We might get ECONNRESET here if using TCP and backend died */
+#ifdef ECONNRESET
+		if (errno == ECONNRESET)
+		{
+			/*
+			 * OK, we are getting a zero read even though select() says ready. This
+			 * means the connection has been closed. Cope.
+			 */
+			if (close_if_error)
+			{
+				add_error_message(conn,
+						"Datanode closed the connection unexpectedly\n"
+						"\tThis probably means the Datanode terminated abnormally\n"
+						"\tbefore or while processing the request.\n");
+				PGXCNodeSetConnectionState(conn,
+						DN_CONNECTION_STATE_ERROR_FATAL); /* No more connection to
+														   * backend */
+				closesocket(conn->sock);
+				conn->sock = NO_SOCKET;
+			}
+			return -1;
+		}
+#endif
+		if (close_if_error)
+			add_error_message(conn, "could not receive data from server");
+		return -1;
+
+	}
+
+	if (nread > 0)
+	{
+		conn->inEnd += nread;
+
+		/*
+		 * Hack to deal with the fact that some kernels will only give us back
+		 * 1 packet per recv() call, even if we asked for more and there is
+		 * more available. If it looks like we are reading a long message,
+		 * loop back to recv() again immediately, until we run out of data or
+		 * buffer space. Without this, the block-and-restart behavior of
+		 * libpq's higher levels leads to O(N^2) performance on long messages.
+		 *
+		 * Since we left-justified the data above, conn->inEnd gives the
+		 * amount of data already read in the current message. We consider
+		 * the message "long" once we have acquired 32k ...
+		 */
+		if (conn->inEnd > 32768 &&
+			(conn->inSize - conn->inEnd) >= 8192)
+		{
+			someread = 1;
+			goto retry;
+		}
+		return 1;
+	}
+
+	if (nread == 0)
+	{
+		/* recv() returning 0 means the peer closed the connection */
+		if (close_if_error)
+			elog(DEBUG1, "nread returned 0");
+		return EOF;
+	}
+
+	if (someread)
+		return 1;				/* got a zero read after successful tries */
+
+	return 0;
+}
+
+
+/*
+ * Get one character from the connection buffer and advance cursor.
+ * Returns 0 on success, EOF when the buffer holds no unread data.
+ */
+static int
+get_char(PGXCNodeHandle * conn, char *out)
+{
+	if (conn->inCursor < conn->inEnd)
+	{
+		*out = conn->inBuffer[conn->inCursor++];
+		return 0;
+	}
+	return EOF;
+}
+
+/*
+ * Read an integer from the connection buffer and advance cursor.
+ * Supports 2- and 4-byte network-order integers; converts to host order.
+ * Returns 0 on success, EOF when not enough data is buffered or the
+ * requested size is unsupported.
+ */
+static int
+get_int(PGXCNodeHandle *conn, size_t len, int *out)
+{
+	unsigned short tmp2;
+	unsigned int tmp4;
+
+	if (conn->inCursor + len > conn->inEnd)
+		return EOF;
+
+	switch (len)
+	{
+		case 2:
+			memcpy(&tmp2, conn->inBuffer + conn->inCursor, 2);
+			conn->inCursor += 2;
+			*out = (int) ntohs(tmp2);
+			break;
+		case 4:
+			memcpy(&tmp4, conn->inBuffer + conn->inCursor, 4);
+			conn->inCursor += 4;
+			*out = (int) ntohl(tmp4);
+			break;
+		default:
+			add_error_message(conn, "not supported int size");
+			return EOF;
+	}
+
+	return 0;
+}
+
+
+/*
+ * get_message
+ * If connection has enough data read entire message from the connection buffer
+ * and returns message type. Message data and data length are returned as
+ * var parameters.
+ * If buffer does not have enough data leaves cursor unchanged, changes
+ * connection status to DN_CONNECTION_STATE_QUERY indicating it needs to
+ * receive more and returns \0
+ * conn - connection to read from
+ * len - returned length of the data where msg is pointing to
+ * msg - returns pointer to memory in the incoming buffer. The buffer probably
+ * will be overwritten upon next receive, so if caller wants to refer it later
+ * it should make a copy.
+ */
+char
+get_message(PGXCNodeHandle *conn, int *len, char **msg)
+{
+	char msgtype;
+
+	if (get_char(conn, &msgtype) || get_int(conn, 4, len))
+	{
+		/* Successful get_char would move cursor, restore position */
+		conn->inCursor = conn->inStart;
+		return '\0';
+	}
+
+	/* The wire length includes its own 4 bytes; keep only payload length */
+	*len -= 4;
+
+	if (conn->inCursor + *len > conn->inEnd)
+	{
+		/*
+		 * Not enough data in the buffer, we should read more.
+		 * Reading function will discard already consumed data in the buffer
+		 * till conn->inStart. Then we want the message that is partly in the
+		 * buffer now has been read completely, to avoid extra read/handle
+		 * cycles. The space needed is 1 byte for message type, 4 bytes for
+		 * message length and message itself which size is currently in *len.
+		 * The buffer may already be large enough, in this case the function
+		 * ensure_in_buffer_capacity() will immediately return
+		 */
+		ensure_in_buffer_capacity(5 + (size_t) *len, conn);
+		conn->inCursor = conn->inStart;
+		return '\0';
+	}
+
+	/* Whole message is buffered: hand out a pointer and consume it */
+	*msg = conn->inBuffer + conn->inCursor;
+	conn->inCursor += *len;
+	conn->inStart = conn->inCursor;
+	return msgtype;
+}
+
+
+/*
+ * Release all Datanode and Coordinator connections
+ * back to pool and release occupied memory
+ *
+ * Does nothing while a cluster lock is held, while prepared statements
+ * exist on remote nodes, or when no connections are in use.  Connections
+ * that are not cleanly idle are closed and flagged for destruction in the
+ * pooler instead of being reused.
+ */
+void
+release_handles(void)
+{
+	bool destroy = false;
+	int i;
+
+	if (HandlesInvalidatePending)
+	{
+		DoInvalidateRemoteHandles();
+		return;
+	}
+
+	/* don't free connection if holding a cluster lock */
+	if (cluster_ex_lock_held)
+		return;
+
+	if (datanode_count == 0 && coord_count == 0)
+		return;
+
+	/* Do not release connections if we have prepared statements on nodes */
+	if (HaveActiveDatanodeStatements())
+		return;
+
+	/* Free Datanodes handles */
+	for (i = 0; i < NumDataNodes; i++)
+	{
+		PGXCNodeHandle *handle = &dn_handles[i];
+
+		if (handle->sock != NO_SOCKET)
+		{
+			/*
+			 * Connections at this point should be completely inactive,
+			 * otherwise abandon them. We can not allow a connection that
+			 * was not cleaned up to be returned to the pool.
+			 */
+			if (handle->state != DN_CONNECTION_STATE_IDLE ||
+				handle->transaction_status != 'I')
+			{
+				destroy = true;
+				elog(DEBUG1, "Connection to Datanode %d has unexpected state %d and will be dropped",
+						handle->nodeoid, handle->state);
+			}
+			pgxc_node_free(handle);
+		}
+	}
+
+	if (IS_PGXC_COORDINATOR)
+	{
+		/* Collect Coordinator handles */
+		for (i = 0; i < NumCoords; i++)
+		{
+			PGXCNodeHandle *handle = &co_handles[i];
+
+			if (handle->sock != NO_SOCKET)
+			{
+				/*
+				 * Connections at this point should be completely inactive,
+				 * otherwise abandon them. We can not allow a connection that
+				 * was not cleaned up to be returned to the pool.
+				 */
+				if (handle->state != DN_CONNECTION_STATE_IDLE ||
+					handle->transaction_status != 'I')
+				{
+					destroy = true;
+					elog(DEBUG1, "Connection to Coordinator %d has unexpected state %d and will be dropped",
+							handle->nodeoid, handle->state);
+				}
+				pgxc_node_free(handle);
+			}
+		}
+	}
+
+	/* And finally release all the connections on pooler */
+	PoolManagerReleaseConnections(destroy);
+
+	datanode_count = 0;
+	coord_count = 0;
+}
+
+/*
+ * Ensure that the supplied buffer has enough capacity and if not, it's
+ * extended to an appropriate size.
+ *
+ * currbuf is the currently used buffer of currsize. bytes_needed is the
+ * minimum size required. We shall return the new buffer, if allocated
+ * successfully and set newsize_p to contain the size of the repalloced buffer.
+ * If allocation fails, NULL is returned.
+ *
+ * The function checks for requests beyond MaxAllocSize and throw an error.
+ */
+static char *
+ensure_buffer_capacity(char *currbuf, size_t currsize, size_t bytes_needed, size_t *newsize_p)
+{
+	char *newbuf;
+	Size newsize = (Size) currsize;
+
+	/*
+	 * NOTE(review): ENOSPC is an errno constant, not a MAKE_SQLSTATE-style
+	 * code produced by errcode() -- confirm this reports the intended
+	 * SQLSTATE.
+	 */
+	if (((Size) bytes_needed) >= MaxAllocSize)
+		ereport(ERROR,
+				(ENOSPC,
+				 errmsg("out of memory"),
+				 errdetail("Cannot enlarge buffer containing %ld bytes by %ld more bytes.",
+						currsize, bytes_needed)));
+
+	/* Fast path: the current buffer is already large enough */
+	if (bytes_needed <= newsize)
+	{
+		*newsize_p = currsize;
+		return currbuf;
+	}
+
+	/*
+	 * The current size of the buffer should never be zero (init_pgxc_handle
+	 * guarantees that.
+	 */
+	Assert(newsize > 0);
+
+	/*
+	 * Double the buffer size until we have enough space to hold bytes_needed
+	 */
+	while (bytes_needed > newsize)
+		newsize = 2 * newsize;
+
+	/*
+	 * Clamp to MaxAllocSize in case we went past it. Note we are assuming
+	 * here that MaxAllocSize <= INT_MAX/2, else the above loop could
+	 * overflow. We will still have newsize >= bytes_needed.
+	 */
+	if (newsize > (int) MaxAllocSize)
+		newsize = (int) MaxAllocSize;
+
+	/*
+	 * NOTE(review): repalloc ereports on failure rather than returning
+	 * NULL, so the fallback paths below look unreachable -- kept as
+	 * defensive code.
+	 */
+	newbuf = repalloc(currbuf, newsize);
+	if (newbuf)
+	{
+		/* repalloc succeeded, set new size and return the buffer */
+		*newsize_p = newsize;
+		return newbuf;
+	}
+
+	/*
+	 * If we fail to double the buffer, try to repalloc a buffer of the given
+	 * size, rounded to the next multiple of 8192 and see if that works.
+	 */
+	newsize = bytes_needed;
+	newsize = ((bytes_needed / 8192) + 1) * 8192;
+
+	newbuf = repalloc(currbuf, newsize);
+	if (newbuf)
+	{
+		/* repalloc succeeded, set new size and return the buffer */
+		*newsize_p = newsize;
+		return newbuf;
+	}
+
+	/* repalloc failed */
+	return NULL;
+}
+
+/*
+ * Ensure specified amount of data can fit to the incoming buffer and
+ * increase it if necessary.
+ * Returns 0 on success, EOF if the buffer could not be grown.
+ */
+int
+ensure_in_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle)
+{
+	size_t newsize;
+	char *newbuf = ensure_buffer_capacity(handle->inBuffer, handle->inSize,
+			bytes_needed, &newsize);
+	if (newbuf)
+	{
+		handle->inBuffer = newbuf;
+		handle->inSize = newsize;
+		return 0;
+	}
+	return EOF;
+}
+
+/*
+ * Ensure specified amount of data can fit to the outgoing buffer and
+ * increase it if necessary.
+ * Returns 0 on success, EOF if the buffer could not be grown.
+ */
+int
+ensure_out_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle)
+{
+	size_t newsize;
+	char *newbuf = ensure_buffer_capacity(handle->outBuffer, handle->outSize,
+			bytes_needed, &newsize);
+	if (newbuf)
+	{
+		handle->outBuffer = newbuf;
+		handle->outSize = newsize;
+		return 0;
+	}
+	return EOF;
+}
+
+
+/*
+ * Send specified amount of data from the outgoing buffer over the connection
+ *
+ * Sends up to 'len' bytes from the head of handle->outBuffer, shifts any
+ * unsent tail back to the start of the buffer and updates handle->outEnd.
+ * Returns 0 on success, -1 on error (the error text is recorded on the
+ * handle via add_error_message and outEnd is reset to 0 so the partially
+ * built message is discarded).
+ */
+int
+send_some(PGXCNodeHandle *handle, int len)
+{
+	char	   *ptr = handle->outBuffer;
+	int			remaining = handle->outEnd;
+	int			result = 0;		/* always 0; error paths return -1 directly */
+
+	/* while there's still data to send */
+	while (len > 0)
+	{
+		int			sent;
+
+#ifndef WIN32
+		sent = send(handle->sock, ptr, len, 0);
+#else
+		/*
+		 * Windows can fail on large sends, per KB article Q201213. The failure-point
+		 * appears to be different in different versions of Windows, but 64k should
+		 * always be safe.
+		 */
+		sent = send(handle->sock, ptr, Min(len, 65536), 0);
+#endif
+
+		if (sent < 0)
+		{
+			/*
+			 * Anything except EAGAIN/EWOULDBLOCK/EINTR is trouble. If it's
+			 * EPIPE or ECONNRESET, assume we've lost the backend connection
+			 * permanently.
+			 */
+			switch (errno)
+			{
+#ifdef EAGAIN
+				case EAGAIN:
+					break;
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+				case EWOULDBLOCK:
+					break;
+#endif
+				case EINTR:
+					/* interrupted before sending anything: retry immediately */
+					continue;
+
+				case EPIPE:
+#ifdef ECONNRESET
+				case ECONNRESET:
+#endif
+					add_error_message(handle, "server closed the connection unexpectedly\n"
+					"\tThis probably means the server terminated abnormally\n"
+					"\tbefore or while processing the request.\n");
+
+					/*
+					 * We used to close the socket here, but that's a bad idea
+					 * since there might be unread data waiting (typically, a
+					 * NOTICE message from the backend telling us it's
+					 * committing hara-kiri...). Leave the socket open until
+					 * pqReadData finds no more data can be read. But abandon
+					 * attempt to send data.
+					 */
+					handle->outEnd = 0;
+					return -1;
+
+				default:
+					add_error_message(handle, "could not send data to server");
+					/* We don't assume it's a fatal error... */
+					handle->outEnd = 0;
+					return -1;
+			}
+		}
+		else
+		{
+			/* advance past the bytes the kernel accepted */
+			ptr += sent;
+			len -= sent;
+			remaining -= sent;
+		}
+
+		if (len > 0)
+		{
+			struct pollfd pool_fd;
+			int			poll_ret;
+
+			/*
+			 * Wait for the socket to become ready again to receive more data.
+			 * For some cases, especially while writing large sums of data
+			 * during COPY protocol and when the remote node is not capable of
+			 * handling data at the same speed, we might otherwise go in a
+			 * useless tight loop, consuming all available local resources
+			 *
+			 * Use a small timeout of 1s to avoid infinite wait
+			 */
+			pool_fd.fd = handle->sock;
+			pool_fd.events = POLLOUT;
+
+			poll_ret = poll(&pool_fd, 1, 1000);
+			if (poll_ret < 0)
+			{
+				if (errno == EAGAIN || errno == EINTR)
+					continue;
+				else
+				{
+					add_error_message(handle, "poll failed ");
+					handle->outEnd = 0;
+					return -1;
+				}
+			}
+			else if (poll_ret == 1)
+			{
+				if (pool_fd.revents & POLLHUP)
+				{
+					add_error_message(handle, "remote end disconnected");
+					handle->outEnd = 0;
+					return -1;
+				}
+			}
+			/* poll_ret == 0 is a plain 1s timeout: loop around and retry */
+		}
+	}
+
+	/* shift the remaining contents of the buffer */
+	if (remaining > 0)
+		memmove(handle->outBuffer, ptr, remaining);
+	handle->outEnd = remaining;
+
+	return result;
+}
+
+/*
+ * Send PARSE message with specified statement down to the Datanode
+ *
+ * 'P' message: statement name (empty when NULL), query text, and the type
+ * names of num_params parameters.  Type names rather than OIDs are sent
+ * because type OIDs can differ between nodes (see comment below).
+ * Returns 0 on success, EOF when the output buffer cannot be grown.
+ */
+int
+pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement,
+					 const char *query, short num_params, Oid *param_types)
+{
+	/* statement name size (allow NULL) */
+	int			stmtLen = statement ? strlen(statement) + 1 : 1;
+	/* size of query string */
+	int			strLen = strlen(query) + 1;
+	char	  **paramTypes = (char **)palloc(sizeof(char *) * num_params);
+	/* total size of parameter type names */
+	int			paramTypeLen;
+	/* message length */
+	int			msgLen;
+	int			cnt_params;
+	/* 16-bit parameter count, converted to network byte order */
+	short		tmp_num_params;
+#ifdef USE_ASSERT_CHECKING
+	size_t		old_outEnd = handle->outEnd;
+#endif
+
+	/* if there are parameters, param_types should exist */
+	Assert(num_params <= 0 || param_types);
+	/* 2 bytes for number of parameters, preceding the type names */
+	paramTypeLen = 2;
+	/* find names of the types of parameters */
+	for (cnt_params = 0; cnt_params < num_params; cnt_params++)
+	{
+		Oid			typeoid;
+
+		/* Parameters with no types are simply ignored */
+		if (OidIsValid(param_types[cnt_params]))
+			typeoid = param_types[cnt_params];
+		else
+			typeoid = INT4OID;
+
+		paramTypes[cnt_params] = format_type_be(typeoid);
+		paramTypeLen += strlen(paramTypes[cnt_params]) + 1;
+	}
+
+	/* size + stmtLen + strlen + paramTypeLen */
+	msgLen = 4 + stmtLen + strLen + paramTypeLen;
+
+	/* msgType + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'P';
+	/* size */
+	msgLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
+	handle->outEnd += 4;
+	/* statement name */
+	if (statement)
+	{
+		memcpy(handle->outBuffer + handle->outEnd, statement, stmtLen);
+		handle->outEnd += stmtLen;
+	}
+	else
+		handle->outBuffer[handle->outEnd++] = '\0';
+	/* query */
+	memcpy(handle->outBuffer + handle->outEnd, query, strLen);
+	handle->outEnd += strLen;
+
+	/*
+	 * parameter count: write via memcpy rather than a 2-byte store through a
+	 * cast pointer -- outEnd is not guaranteed to be even, and an unaligned
+	 * short store is undefined behavior on strict-alignment platforms.  This
+	 * also matches how pgxc_node_send_plan writes the same field.
+	 */
+	Assert(sizeof(tmp_num_params) == 2);
+	tmp_num_params = htons(num_params);
+	memcpy(handle->outBuffer + handle->outEnd, &tmp_num_params, sizeof(tmp_num_params));
+	handle->outEnd += sizeof(tmp_num_params);
+	/*
+	 * instead of parameter ids we should send parameter names (qualified by
+	 * schema name if required). The OIDs of types can be different on
+	 * Datanodes.
+	 */
+	for (cnt_params = 0; cnt_params < num_params; cnt_params++)
+	{
+		memcpy(handle->outBuffer + handle->outEnd, paramTypes[cnt_params],
+			   strlen(paramTypes[cnt_params]) + 1);
+		handle->outEnd += strlen(paramTypes[cnt_params]) + 1;
+		pfree(paramTypes[cnt_params]);
+	}
+	pfree(paramTypes);
+	Assert(old_outEnd + ntohl(msgLen) + 1 == handle->outEnd);
+
+	return 0;
+}
+
+/*
+ * Send PLAN message down to the Data node
+ *
+ * 'p' message: statement name, source query text, serialized plan string and
+ * the type names of num_params parameters (names rather than OIDs, because
+ * type OIDs can differ between nodes).  Returns 0 on success, EOF when the
+ * connection is not idle or the output buffer cannot be grown.
+ */
+int
+pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement,
+					const char *query, const char *planstr,
+					short num_params, Oid *param_types)
+{
+	int			stmtLen;
+	int			queryLen;
+	int			planLen;
+	int			paramTypeLen;
+	int			msgLen;
+	char	  **paramTypes = (char **)palloc(sizeof(char *) * num_params);
+	int			i;
+	short		tmp_num_params;
+
+	/* Invalid connection state, return error */
+	if (handle->state != DN_CONNECTION_STATE_IDLE)
+		return EOF;
+
+	/* statement name size (do not allow NULL) */
+	stmtLen = strlen(statement) + 1;
+	/* source query size (do not allow NULL) */
+	queryLen = strlen(query) + 1;
+	/* query plan size (do not allow NULL) */
+	planLen = strlen(planstr) + 1;
+	/* 2 bytes for number of parameters, preceding the type names */
+	paramTypeLen = 2;
+	/* find names of the types of parameters */
+	for (i = 0; i < num_params; i++)
+	{
+		paramTypes[i] = format_type_be(param_types[i]);
+		paramTypeLen += strlen(paramTypes[i]) + 1;
+	}
+	/* size + pnameLen + queryLen + parameters */
+	msgLen = 4 + queryLen + stmtLen + planLen + paramTypeLen;
+
+	/* msgType + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'p';
+	/* size */
+	msgLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
+	handle->outEnd += 4;
+	/* statement name */
+	memcpy(handle->outBuffer + handle->outEnd, statement, stmtLen);
+	handle->outEnd += stmtLen;
+	/* source query */
+	memcpy(handle->outBuffer + handle->outEnd, query, queryLen);
+	handle->outEnd += queryLen;
+	/* query plan */
+	memcpy(handle->outBuffer + handle->outEnd, planstr, planLen);
+	handle->outEnd += planLen;
+	/* parameter types (16-bit count in network order, then the names) */
+	tmp_num_params = htons(num_params);
+	memcpy(handle->outBuffer + handle->outEnd, &tmp_num_params, sizeof(tmp_num_params));
+	handle->outEnd += sizeof(tmp_num_params);
+	/*
+	 * instead of parameter ids we should send parameter names (qualified by
+	 * schema name if required). The OIDs of types can be different on
+	 * datanodes.
+	 */
+	for (i = 0; i < num_params; i++)
+	{
+		int			plen = strlen(paramTypes[i]) + 1;
+		memcpy(handle->outBuffer + handle->outEnd, paramTypes[i], plen);
+		handle->outEnd += plen;
+		pfree(paramTypes[i]);
+	}
+	pfree(paramTypes);
+
+	/* this connection now runs the extended-query protocol */
+	handle->in_extended_query = true;
+	return 0;
+}
+
+/*
+ * Send BIND message down to the Datanode
+ *
+ * 'B' message: portal name, statement name, an empty parameter-format-code
+ * array, the pre-serialized parameter values ('params' holding 'paramlen'
+ * bytes, or an empty 2-byte count when paramlen is 0), and an empty
+ * result-format-code array.  Returns 0 on success, EOF when the connection
+ * is not idle or the output buffer cannot be grown.
+ */
+int
+pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal,
+					const char *statement, int paramlen, char *params)
+{
+	int			pnameLen;
+	int			stmtLen;
+	int			paramCodeLen;
+	int			paramValueLen;
+	int			paramOutLen;
+	int			msgLen;
+
+	/* Invalid connection state, return error */
+	if (handle->state != DN_CONNECTION_STATE_IDLE)
+		return EOF;
+
+	/* portal name size (allow NULL) */
+	pnameLen = portal ? strlen(portal) + 1 : 1;
+	/* statement name size (allow NULL) */
+	stmtLen = statement ? strlen(statement) + 1 : 1;
+	/* size of parameter codes array (always empty for now) */
+	paramCodeLen = 2;
+	/* size of parameter values array, 2 if no params */
+	paramValueLen = paramlen ? paramlen : 2;
+	/* size of output parameter codes array (always empty for now) */
+	paramOutLen = 2;
+	/* size + pnameLen + stmtLen + parameters */
+	msgLen = 4 + pnameLen + stmtLen + paramCodeLen + paramValueLen + paramOutLen;
+
+	/* msgType + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'B';
+	/* size */
+	msgLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
+	handle->outEnd += 4;
+	/* portal name (empty string when NULL) */
+	if (portal)
+	{
+		memcpy(handle->outBuffer + handle->outEnd, portal, pnameLen);
+		handle->outEnd += pnameLen;
+	}
+	else
+		handle->outBuffer[handle->outEnd++] = '\0';
+	/* statement name (empty string when NULL) */
+	if (statement)
+	{
+		memcpy(handle->outBuffer + handle->outEnd, statement, stmtLen);
+		handle->outEnd += stmtLen;
+	}
+	else
+		handle->outBuffer[handle->outEnd++] = '\0';
+	/* parameter codes (none): 16-bit zero count */
+	handle->outBuffer[handle->outEnd++] = 0;
+	handle->outBuffer[handle->outEnd++] = 0;
+	/* parameter values: caller-serialized blob, or 16-bit zero count */
+	if (paramlen)
+	{
+		memcpy(handle->outBuffer + handle->outEnd, params, paramlen);
+		handle->outEnd += paramlen;
+	}
+	else
+	{
+		handle->outBuffer[handle->outEnd++] = 0;
+		handle->outBuffer[handle->outEnd++] = 0;
+	}
+	/* output parameter codes (none): 16-bit zero count */
+	handle->outBuffer[handle->outEnd++] = 0;
+	handle->outBuffer[handle->outEnd++] = 0;
+
+	/* this connection now runs the extended-query protocol */
+	handle->in_extended_query = true;
+	return 0;
+}
+
+
+/*
+ * Send DESCRIBE message (portal or statement) down to the Datanode
+ *
+ * 'D' message: a one-byte flag ('S' for statement, 'P' for portal) followed
+ * by the object name (empty string when name is NULL).
+ */
+int
+pgxc_node_send_describe(PGXCNodeHandle * handle, bool is_statement,
+						const char *name)
+{
+	int			nameLen = name ? strlen(name) + 1 : 1;
+	int			msgLen = 4 + 1 + nameLen;	/* size word + flag + name */
+	uint32		netLen;
+
+	/* refuse to queue anything on a busy or broken connection */
+	if (handle->state != DN_CONNECTION_STATE_IDLE)
+		return EOF;
+
+	/* make room for the message type byte plus the message itself */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'D';
+	netLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &netLen, 4);
+	handle->outEnd += 4;
+	handle->outBuffer[handle->outEnd++] = is_statement ? 'S' : 'P';
+	if (name == NULL)
+		handle->outBuffer[handle->outEnd++] = '\0';
+	else
+	{
+		memcpy(handle->outBuffer + handle->outEnd, name, nameLen);
+		handle->outEnd += nameLen;
+	}
+
+	handle->in_extended_query = true;
+	return 0;
+}
+
+
+/*
+ * Send CLOSE message (portal or statement) down to the Datanode
+ *
+ * 'C' message: a one-byte flag ('S' for statement, 'P' for portal) followed
+ * by the object name (empty string when name is NULL).
+ */
+int
+pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement,
+					 const char *name)
+{
+	int			nameLen = name ? strlen(name) + 1 : 1;
+	int			msgLen = 4 + 1 + nameLen;	/* size word + flag + name */
+	uint32		netLen;
+
+	/* make room for the message type byte plus the message itself */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'C';
+	netLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &netLen, 4);
+	handle->outEnd += 4;
+	handle->outBuffer[handle->outEnd++] = is_statement ? 'S' : 'P';
+	if (name == NULL)
+		handle->outBuffer[handle->outEnd++] = '\0';
+	else
+	{
+		memcpy(handle->outBuffer + handle->outEnd, name, nameLen);
+		handle->outEnd += nameLen;
+	}
+
+	handle->in_extended_query = true;
+	return 0;
+}
+
+/*
+ * Send EXECUTE message down to the Datanode
+ *
+ * 'E' message: portal name (empty string when NULL) and the row fetch count,
+ * both in PostgreSQL wire format.  Marks the connection as running a query.
+ */
+int
+pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, int fetch)
+{
+	int			pnameLen = portal ? strlen(portal) + 1 : 1;
+	int			msgLen = 4 + pnameLen + 4;	/* size word + portal + fetch */
+	uint32		netVal;
+
+	/* make room for the message type byte plus the message itself */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'E';
+	netVal = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &netVal, 4);
+	handle->outEnd += 4;
+	if (portal == NULL)
+		handle->outBuffer[handle->outEnd++] = '\0';
+	else
+	{
+		memcpy(handle->outBuffer + handle->outEnd, portal, pnameLen);
+		handle->outEnd += pnameLen;
+	}
+	/* fetch count, network byte order */
+	netVal = htonl(fetch);
+	memcpy(handle->outBuffer + handle->outEnd, &netVal, 4);
+	handle->outEnd += 4;
+
+	PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY);
+
+	handle->in_extended_query = true;
+	return 0;
+}
+
+
+/*
+ * Send FLUSH message down to the Datanode and push all buffered data
+ * out on the socket.
+ */
+int
+pgxc_node_send_flush(PGXCNodeHandle * handle)
+{
+	int			msgLen = 4;		/* header-only message: just the length word */
+	uint32		netLen;
+
+	/* make room for the message type byte plus the length word */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'H';
+	netLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &netLen, 4);
+	handle->outEnd += 4;
+
+	handle->in_extended_query = true;
+	return pgxc_node_flush(handle);
+}
+
+
+/*
+ * Send SYNC message down to the Datanode and push all buffered data
+ * out on the socket.
+ */
+int
+pgxc_node_send_sync(PGXCNodeHandle * handle)
+{
+	int			msgLen = 4;		/* header-only message: just the length word */
+	uint32		netLen;
+
+	/* make room for the message type byte plus the length word */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'S';
+	netLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &netLen, 4);
+	handle->outEnd += 4;
+
+	/* Sync ends the extended-query sequence and satisfies any pending sync */
+	handle->in_extended_query = false;
+	handle->needSync = false;
+
+	return pgxc_node_flush(handle);
+}
+
+
+/*
+ * Send series of Extended Query protocol messages to the data node
+ *
+ * Emits Parse (skipped when query is NULL, meaning the statement is already
+ * prepared), Bind, optionally Describe of the portal, optionally Execute
+ * (when fetch_size >= 0), and finally Flush.  Returns 0 on success, EOF as
+ * soon as any single message cannot be sent.
+ */
+int
+pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query,
+							  const char *statement, const char *portal,
+							  int num_params, Oid *param_types,
+							  int paramlen, char *params,
+							  bool send_describe, int fetch_size)
+{
+	if (query != NULL &&
+		pgxc_node_send_parse(handle, statement, query, num_params, param_types) != 0)
+		return EOF;
+
+	if (pgxc_node_send_bind(handle, portal, statement, paramlen, params) != 0)
+		return EOF;
+
+	if (send_describe &&
+		pgxc_node_send_describe(handle, false, portal) != 0)
+		return EOF;
+
+	if (fetch_size >= 0 &&
+		pgxc_node_send_execute(handle, portal, fetch_size) != 0)
+		return EOF;
+
+	if (pgxc_node_send_flush(handle) != 0)
+		return EOF;
+
+	return 0;
+}
+
+
+/*
+ * This method won't return until connection buffer is empty or error occurs
+ * To ensure all data are on the wire before waiting for response
+ */
+int
+pgxc_node_flush(PGXCNodeHandle *handle)
+{
+	while (handle->outEnd != 0)
+	{
+		if (send_some(handle, handle->outEnd) >= 0)
+			continue;
+
+		add_error_message(handle, "failed to send data to datanode");
+
+		/*
+		 * before returning, also update the shared health
+		 * status field to indicate that this node could be
+		 * possibly unavailable.
+		 *
+		 * Note that this error could be due to a stale handle
+		 * and it's possible that another backend might have
+		 * already updated the health status OR the node
+		 * might have already come back since the last disruption
+		 */
+		PoolPingNodeRecheck(handle->nodeoid);
+		return EOF;
+	}
+	return 0;
+}
+
+/*
+ * This method won't return until network buffer is empty or error occurs
+ * To ensure all data in network buffers is read and wasted
+ */
+void
+pgxc_node_flush_read(PGXCNodeHandle *handle)
+{
+	if (handle == NULL)
+		return;
+
+	/*
+	 * Before reading input send Sync to make sure
+	 * we will eventually receive ReadyForQuery
+	 */
+	pgxc_node_send_sync(handle);
+
+	for (;;)
+	{
+		/* stop on read error ... */
+		if (pgxc_node_read_data(handle, false) < 0)
+			return;
+		/* ... or as soon as the node reports itself ready */
+		if (is_data_node_ready(handle))
+			return;
+	}
+}
+
+/*
+ * Send specified statement down to the PGXC node
+ *
+ * Queues a simple-protocol 'Q' message and flushes the connection.  Only a
+ * rollback may go out over a connection in fatal error state; everything
+ * else requires an idle connection.
+ */
+static int
+pgxc_node_send_query_internal(PGXCNodeHandle * handle, const char *query,
+		bool rollback)
+{
+	int			queryLen;
+	int			msgLen;
+
+	/*
+	 * Its appropriate to send ROLLBACK commands on a failed connection, but
+	 * for everything else we expect the connection to be in a sane state
+	 */
+	elog(DEBUG5, "pgxc_node_send_query - handle->state %d, node %s, query %s",
+			handle->state, handle->nodename, query);
+	if (handle->state != DN_CONNECTION_STATE_IDLE &&
+		!(rollback && handle->state == DN_CONNECTION_STATE_ERROR_FATAL))
+		return EOF;
+
+	queryLen = strlen(query) + 1;
+	/* length word plus NUL-terminated query text */
+	msgLen = 4 + queryLen;
+
+	/* make room for the message type byte plus the message itself */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'Q';
+	msgLen = htonl(msgLen);
+	memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
+	handle->outEnd += 4;
+	memcpy(handle->outBuffer + handle->outEnd, query, queryLen);
+	handle->outEnd += queryLen;
+
+	PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY);
+
+	handle->in_extended_query = false;
+	return pgxc_node_flush(handle);
+}
+
+/*
+ * Send a statement over a connection that may be in fatal error state;
+ * used for ROLLBACK commands (see pgxc_node_send_query_internal).
+ */
+int
+pgxc_node_send_rollback(PGXCNodeHandle *handle, const char *query)
+{
+	return pgxc_node_send_query_internal(handle, query, true);
+}
+
+/*
+ * Send specified statement down to the PGXC node; the connection must be
+ * idle (see pgxc_node_send_query_internal).
+ */
+int
+pgxc_node_send_query(PGXCNodeHandle *handle, const char *query)
+{
+	return pgxc_node_send_query_internal(handle, query, false);
+}
+
+
+/*
+ * Send the GXID down to the PGXC node
+ *
+ * 'g' message: a 4-byte length word followed by the TransactionId, which is
+ * copied raw (no byte-order conversion is applied to the xid itself).
+ */
+int
+pgxc_node_send_gxid(PGXCNodeHandle *handle, GlobalTransactionId gxid)
+{
+	int			msglen = 8;		/* length word + TransactionId */
+	uint32		netlen;
+	char	   *wptr;
+
+	/* Invalid connection state, return error */
+	if (handle->state != DN_CONNECTION_STATE_IDLE)
+		return EOF;
+
+	/* make room for the message type byte plus the message itself */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	/* build the message through a write cursor, then commit the new end */
+	wptr = handle->outBuffer + handle->outEnd;
+	*wptr++ = 'g';
+	netlen = htonl(msglen);
+	memcpy(wptr, &netlen, 4);
+	wptr += 4;
+	memcpy(wptr, &gxid, sizeof(TransactionId));
+	wptr += sizeof(TransactionId);
+	handle->outEnd = wptr - handle->outBuffer;
+
+	return 0;
+}
+
+/*
+ * Send the Command ID down to the PGXC node
+ *
+ * 'M' message carrying the current command id in network byte order.
+ * Silently succeeds (returns 0) when command-id sending is disabled.
+ */
+int
+pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid)
+{
+	int			msglen = CMD_ID_MSG_LEN;
+	int			netCid;
+
+	/* No need to send command ID if its sending flag is not enabled */
+	if (!IsSendCommandId())
+		return 0;
+
+	/* Invalid connection state, return error */
+	if (handle->state != DN_CONNECTION_STATE_IDLE)
+		return EOF;
+
+	/* make room for the message type byte plus the message itself */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 'M';
+	msglen = htonl(msglen);
+	memcpy(handle->outBuffer + handle->outEnd, &msglen, 4);
+	handle->outEnd += 4;
+	netCid = htonl(cid);
+	memcpy(handle->outBuffer + handle->outEnd, &netCid, 4);
+	handle->outEnd += 4;
+
+	return 0;
+}
+
+/*
+ * Send the snapshot down to the PGXC node
+ *
+ * 's' message: xmin, xmax, RecentGlobalXmin (each copied as a raw
+ * TransactionId), then the xip count in network byte order, then the xip
+ * array (again raw TransactionIds).
+ *
+ * NOTE(review): the xid fields are written without htonl while xcnt is
+ * converted -- presumably both ends share endianness; verify against the
+ * receiving side before changing anything here.
+ */
+int
+pgxc_node_send_snapshot(PGXCNodeHandle *handle, Snapshot snapshot)
+{
+	int			msglen;
+	int			nval;
+	int			i;
+
+	/* Invalid connection state, return error */
+	if (handle->state != DN_CONNECTION_STATE_IDLE)
+		return EOF;
+
+	/* calculate message length: 4 length + 3 xids + 4 count, plus xip array */
+	msglen = 20;
+	if (snapshot->xcnt > 0)
+		msglen += snapshot->xcnt * 4;
+
+	/* msgType + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+
+	handle->outBuffer[handle->outEnd++] = 's';
+	msglen = htonl(msglen);
+	memcpy(handle->outBuffer + handle->outEnd, &msglen, 4);
+	handle->outEnd += 4;
+
+	memcpy(handle->outBuffer + handle->outEnd, &snapshot->xmin, sizeof (TransactionId));
+	handle->outEnd += sizeof (TransactionId);
+
+	memcpy(handle->outBuffer + handle->outEnd, &snapshot->xmax, sizeof (TransactionId));
+	handle->outEnd += sizeof (TransactionId);
+
+	memcpy(handle->outBuffer + handle->outEnd, &RecentGlobalXmin, sizeof (TransactionId));
+	handle->outEnd += sizeof (TransactionId);
+
+	/* number of in-progress xids, network byte order */
+	nval = htonl(snapshot->xcnt);
+	memcpy(handle->outBuffer + handle->outEnd, &nval, 4);
+	handle->outEnd += 4;
+
+	for (i = 0; i < snapshot->xcnt; i++)
+	{
+		memcpy(handle->outBuffer + handle->outEnd, &snapshot->xip[i], sizeof
+				(TransactionId));
+		handle->outEnd += sizeof (TransactionId);
+	}
+
+	return 0;
+}
+
+/*
+ * Send the timestamp down to the PGXC node
+ *
+ * 't' message: the 8-byte timestamp value split into two 4-byte words in
+ * network byte order, high-order half first.
+ */
+int
+pgxc_node_send_timestamp(PGXCNodeHandle *handle, TimestampTz timestamp)
+{
+	int			msglen = 12; /* 4 bytes for msglen and 8 bytes for timestamp (int64) */
+	uint32		n32;
+	int64		i = (int64) timestamp;
+
+	/* Invalid connection state, return error */
+	if (handle->state != DN_CONNECTION_STATE_IDLE)
+		return EOF;
+
+	/* msgType + msgLen */
+	if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
+	{
+		add_error_message(handle, "out of memory");
+		return EOF;
+	}
+	handle->outBuffer[handle->outEnd++] = 't';
+	msglen = htonl(msglen);
+	memcpy(handle->outBuffer + handle->outEnd, &msglen, 4);
+	handle->outEnd += 4;
+
+	/* High order half first */
+#ifdef INT64_IS_BUSTED
+	/* don't try a right shift of 32 on a 32-bit word */
+	n32 = (i < 0) ? -1 : 0;
+#else
+	n32 = (uint32) (i >> 32);
+#endif
+	n32 = htonl(n32);
+	memcpy(handle->outBuffer + handle->outEnd, &n32, 4);
+	handle->outEnd += 4;
+
+	/* Now the low order half */
+	n32 = (uint32) i;
+	n32 = htonl(n32);
+	memcpy(handle->outBuffer + handle->outEnd, &n32, 4);
+	handle->outEnd += 4;
+
+	return 0;
+}
+
+
+/*
+ * Add another message to the list of errors to be returned back to the client
+ * at the convenient time
+ *
+ * Also logs the message and marks the handle's transaction as failed ('E').
+ * Only the first error is kept on the handle; later ones are logged but
+ * otherwise dropped (see PGXCTODO below).
+ */
+void
+add_error_message(PGXCNodeHandle *handle, const char *message)
+{
+	elog(LOG, "Remote node \"%s\", running with pid %d returned an error: %s",
+			handle->nodename, handle->backend_pid, message);
+	handle->transaction_status = 'E';
+	if (handle->error)
+	{
+		/* PGXCTODO append */
+	}
+	else
+		handle->error = pstrdup(message);
+}
+
+/* Round-robin cursor over datanode handles, used for basic load balancing */
+static int load_balancer = 0;
+/*
+ * Get one of the specified nodes to query replicated data source.
+ * If session already owns one or more of the requested connection,
+ * the function returns existing one to avoid contacting pooler.
+ * Performs basic load balancing.
+ */
+PGXCNodeHandle *
+get_any_handle(List *datanodelist)
+{
+	ListCell   *lc1;
+	int			i, node;
+
+	/* sanity check */
+	Assert(list_length(datanodelist) > 0);
+
+	/* a pending cluster reconfiguration aborts the transaction */
+	if (HandlesInvalidatePending)
+		if (DoInvalidateRemoteHandles())
+			ereport(ERROR,
+					(errcode(ERRCODE_QUERY_CANCELED),
+					 errmsg("canceling transaction due to cluster configuration reset by administrator command")));
+
+	if (HandlesRefreshPending)
+		if (DoRefreshRemoteHandles())
+			ereport(ERROR,
+					(errcode(ERRCODE_QUERY_CANCELED),
+					 errmsg("canceling transaction due to cluster configuration reset by administrator command")));
+
+	/* loop through local datanode handles */
+	for (i = 0, node = load_balancer; i < NumDataNodes; i++, node++)
+	{
+		/* At the moment node is an index in the array, and we may need to wrap it */
+		if (node >= NumDataNodes)
+			node -= NumDataNodes;
+		/* See if handle is already used */
+		if (dn_handles[node].sock != NO_SOCKET)
+		{
+			foreach(lc1, datanodelist)
+			{
+				if (lfirst_int(lc1) == node)
+				{
+					/*
+					 * The node is in the list of requested nodes,
+					 * set load_balancer for next time and return the handle
+					 */
+					load_balancer = node + 1;
+					return &dn_handles[node];
+				}
+			}
+		}
+	}
+
+	/*
+	 * None of requested nodes is in use, need to get one from the pool.
+	 * Choose one.
+	 */
+	for (i = 0, node = load_balancer; i < NumDataNodes; i++, node++)
+	{
+		/* At the moment node is an index in the array, and we may need to wrap it */
+		if (node >= NumDataNodes)
+			node -= NumDataNodes;
+		/* Look only at empty slots, we have already checked existing handles */
+		if (dn_handles[node].sock == NO_SOCKET)
+		{
+			foreach(lc1, datanodelist)
+			{
+				if (lfirst_int(lc1) == node)
+				{
+					/* The node is requested */
+					List	   *allocate = list_make1_int(node);
+					int		   *pids;
+					int		   *fds = PoolManagerGetConnections(allocate, NIL,
+							&pids);
+					PGXCNodeHandle		*node_handle;
+
+					if (!fds)
+					{
+						Assert(pids != NULL);
+						ereport(ERROR,
+								(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+								 errmsg("Failed to get pooled connections"),
+								 errhint("This may happen because one or more nodes are "
+									 "currently unreachable, either because of node or "
+									 "network failure.\n Its also possible that the target node "
+									 "may have hit the connection limit or the pooler is "
+									 "configured with low connections.\n Please check "
+									 "if all nodes are running fine and also review "
+									 "max_connections and max_pool_size configuration "
+									 "parameters")));
+					}
+					node_handle = &dn_handles[node];
+					pgxc_node_init(node_handle, fds[0], true, pids[0]);
+					datanode_count++;
+
+					/*
+					 * NOTE(review): the fds/pids arrays returned by
+					 * PoolManagerGetConnections are not pfree'd here --
+					 * presumably reclaimed with the memory context; verify.
+					 */
+					elog(DEBUG1, "Established a connection with datanode \"%s\","
+							"remote backend PID %d, socket fd %d, global session %c",
+							node_handle->nodename, (int) pids[0], fds[0], 'T');
+
+					/*
+					 * set load_balancer for next time and return the handle
+					 */
+					load_balancer = node + 1;
+					return &dn_handles[node];
+				}
+			}
+		}
+	}
+
+	/* We should not get here, one of the cases should be met */
+	Assert(false);
+	/* Keep compiler quiet */
+	return NULL;
+}
+
+/*
+ * for specified list return array of PGXCNodeHandles
+ * acquire from pool if needed.
+ * the length of returned array is the same as of nodelist
+ * For Datanodes, Special case is empty or NIL nodeList, in this case return all the nodes.
+ * The returned list should be pfree'd when no longer needed.
+ * For Coordinator, do not get a connection if Coordinator list is NIL,
+ * Coordinator fds is returned only if transaction uses a DDL
+ */
+PGXCNodeAllHandles *
+get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool is_global_session)
+{
+	PGXCNodeAllHandles	*result;
+	ListCell   *node_list_item;
+	List	   *dn_allocate = NIL;
+	List	   *co_allocate = NIL;
+	PGXCNodeHandle		*node_handle;
+
+	/* index of the result array */
+	int			i = 0;
+
+	/* a pending cluster reconfiguration aborts the transaction */
+	if (HandlesInvalidatePending)
+		if (DoInvalidateRemoteHandles())
+			ereport(ERROR,
+					(errcode(ERRCODE_QUERY_CANCELED),
+					 errmsg("canceling transaction due to cluster configuration reset by administrator command")));
+
+	if (HandlesRefreshPending)
+		if (DoRefreshRemoteHandles())
+			ereport(ERROR,
+					(errcode(ERRCODE_QUERY_CANCELED),
+					 errmsg("canceling transaction due to cluster configuration reset by administrator command")));
+
+	result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles));
+	if (!result)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	}
+
+	/*
+	 * NOTE(review): for a NIL datanodelist all NumDataNodes handles are
+	 * returned below, yet dn_conn_count stays list_length(datanodelist) == 0;
+	 * verify callers of the NIL case rely on that.
+	 */
+	result->primary_handle = NULL;
+	result->datanode_handles = NULL;
+	result->coord_handles = NULL;
+	result->co_conn_count = list_length(coordlist);
+	result->dn_conn_count = list_length(datanodelist);
+
+	/*
+	 * Get Handles for Datanodes
+	 * If node list is empty execute request on current nodes.
+	 * It is also possible that the query has to be launched only on Coordinators.
+	 */
+	if (!is_coord_only_query)
+	{
+		if (list_length(datanodelist) == 0)
+		{
+			/*
+			 * We do not have to zero the array - on success all items will be set
+			 * to correct pointers, on error the array will be freed
+			 */
+			result->datanode_handles = (PGXCNodeHandle **)
+				palloc(NumDataNodes * sizeof(PGXCNodeHandle *));
+			if (!result->datanode_handles)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			}
+
+			for (i = 0; i < NumDataNodes; i++)
+			{
+				node_handle = &dn_handles[i];
+				result->datanode_handles[i] = node_handle;
+				if (node_handle->sock == NO_SOCKET)
+					dn_allocate = lappend_int(dn_allocate, i);
+			}
+		}
+		else
+		{
+			/*
+			 * We do not have to zero the array - on success all items will be set
+			 * to correct pointers, on error the array will be freed
+			 */
+
+			result->datanode_handles = (PGXCNodeHandle **)
+				palloc(list_length(datanodelist) * sizeof(PGXCNodeHandle *));
+			if (!result->datanode_handles)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			}
+
+			i = 0;
+			foreach(node_list_item, datanodelist)
+			{
+				int			node = lfirst_int(node_list_item);
+
+				if (node < 0 || node >= NumDataNodes)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("Invalid Datanode number")));
+				}
+
+				node_handle = &dn_handles[node];
+				result->datanode_handles[i++] = node_handle;
+				if (node_handle->sock == NO_SOCKET)
+					dn_allocate = lappend_int(dn_allocate, node);
+			}
+		}
+	}
+
+	/*
+	 * Get Handles for Coordinators
+	 * If node list is empty execute request on current nodes
+	 * There are transactions where the Coordinator list is NULL Ex:COPY
+	 */
+
+	if (coordlist)
+	{
+		if (list_length(coordlist) == 0)
+		{
+			/*
+			 * We do not have to zero the array - on success all items will be set
+			 * to correct pointers, on error the array will be freed
+			 */
+			result->coord_handles = (PGXCNodeHandle **)palloc(NumCoords * sizeof(PGXCNodeHandle *));
+			if (!result->coord_handles)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			}
+
+			for (i = 0; i < NumCoords; i++)
+			{
+				node_handle = &co_handles[i];
+				result->coord_handles[i] = node_handle;
+				if (node_handle->sock == NO_SOCKET)
+					co_allocate = lappend_int(co_allocate, i);
+			}
+		}
+		else
+		{
+			/*
+			 * We do not have to zero the array - on success all items will be set
+			 * to correct pointers, on error the array will be freed
+			 */
+			result->coord_handles = (PGXCNodeHandle **)
+				palloc(list_length(coordlist) * sizeof(PGXCNodeHandle *));
+			if (!result->coord_handles)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+			}
+
+			i = 0;
+			/* Some transactions do not need Coordinators, ex: COPY */
+			foreach(node_list_item, coordlist)
+			{
+				int			node = lfirst_int(node_list_item);
+
+				if (node < 0 || node >= NumCoords)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("Invalid coordinator number")));
+				}
+
+				node_handle = &co_handles[node];
+
+				result->coord_handles[i++] = node_handle;
+				if (node_handle->sock == NO_SOCKET)
+					co_allocate = lappend_int(co_allocate, node);
+			}
+		}
+	}
+
+	/*
+	 * Pooler can get activated even if list of Coordinator or Datanode is NULL
+	 * If both lists are NIL, we don't need to call Pooler.
+	 */
+	if (dn_allocate || co_allocate)
+	{
+		int			j = 0;
+		int		   *pids;
+		int		   *fds = PoolManagerGetConnections(dn_allocate, co_allocate, &pids);
+
+		if (!fds)
+		{
+			if (coordlist)
+				if (result->coord_handles)
+					pfree(result->coord_handles);
+			if (datanodelist)
+				if (result->datanode_handles)
+					pfree(result->datanode_handles);
+
+			pfree(result);
+			if (dn_allocate)
+				list_free(dn_allocate);
+			if (co_allocate)
+				list_free(co_allocate);
+			ereport(ERROR,
+					(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+					 errmsg("Failed to get pooled connections"),
+					 errhint("This may happen because one or more nodes are "
+						 "currently unreachable, either because of node or "
+						 "network failure.\n Its also possible that the target node "
+						 "may have hit the connection limit or the pooler is "
+						 "configured with low connections.\n Please check "
+						 "if all nodes are running fine and also review "
+						 "max_connections and max_pool_size configuration "
+						 "parameters")));
+		}
+		/* Initialisation for Datanodes */
+		if (dn_allocate)
+		{
+			foreach(node_list_item, dn_allocate)
+			{
+				int			node = lfirst_int(node_list_item);
+				int			fdsock = fds[j];
+				int			be_pid = pids[j++];
+
+				if (node < 0 || node >= NumDataNodes)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("Invalid Datanode number")));
+				}
+
+				/*
+				 * node_handle points directly into dn_handles[], so
+				 * initializing it in place is all that is needed (the old
+				 * "dn_handles[node] = *node_handle" self-assignment was a
+				 * no-op and has been removed).
+				 */
+				node_handle = &dn_handles[node];
+				pgxc_node_init(node_handle, fdsock, is_global_session, be_pid);
+				datanode_count++;
+
+				elog(DEBUG1, "Established a connection with datanode \"%s\","
+						"remote backend PID %d, socket fd %d, global session %c",
+						node_handle->nodename, (int) be_pid, fdsock,
+						is_global_session ? 'T' : 'F');
+			}
+		}
+		/* Initialisation for Coordinators */
+		if (co_allocate)
+		{
+			foreach(node_list_item, co_allocate)
+			{
+				int			node = lfirst_int(node_list_item);
+				int			be_pid = pids[j];
+				int			fdsock = fds[j++];
+
+				if (node < 0 || node >= NumCoords)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("Invalid coordinator number")));
+				}
+
+				/* in-place init; the old self-assignment was a no-op */
+				node_handle = &co_handles[node];
+				pgxc_node_init(node_handle, fdsock, is_global_session, be_pid);
+				coord_count++;
+
+				elog(DEBUG1, "Established a connection with coordinator \"%s\","
+						"remote backend PID %d, socket fd %d, global session %c",
+						node_handle->nodename, (int) be_pid, fdsock,
+						is_global_session ? 'T' : 'F');
+			}
+		}
+
+		pfree(fds);
+
+		if (co_allocate)
+			list_free(co_allocate);
+		if (dn_allocate)
+			list_free(dn_allocate);
+	}
+
+	return result;
+}
+
+/*
+ * get_current_handles
+ *	Build a PGXCNodeAllHandles set referencing every Coordinator and
+ *	Datanode handle whose socket is currently open (sock != NO_SOCKET).
+ *	The result and its pointer arrays are palloc'ed in the current memory
+ *	context; release with pfree_pgxc_all_handles().
+ */
+PGXCNodeAllHandles *
+get_current_handles(void)
+{
+	PGXCNodeAllHandles *result;
+	PGXCNodeHandle	   *node_handle;
+	int					i;
+
+	result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles));
+	/* NOTE(review): palloc ereports on failure, so this check is belt-and-braces */
+	if (!result)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	}
+
+	result->primary_handle = NULL;
+	result->co_conn_count = 0;
+	result->dn_conn_count = 0;
+
+	/* Arrays are sized for the full node count; only connected entries are filled */
+	result->datanode_handles = (PGXCNodeHandle **)
+							   palloc(NumDataNodes * sizeof(PGXCNodeHandle *));
+	if (!result->datanode_handles)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	}
+
+	for (i = 0; i < NumDataNodes; i++)
+	{
+		node_handle = &dn_handles[i];
+		if (node_handle->sock != NO_SOCKET)
+			result->datanode_handles[result->dn_conn_count++] = node_handle;
+	}
+
+	result->coord_handles = (PGXCNodeHandle **)
+							palloc(NumCoords * sizeof(PGXCNodeHandle *));
+	if (!result->coord_handles)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	}
+
+	for (i = 0; i < NumCoords; i++)
+	{
+		node_handle = &co_handles[i];
+		if (node_handle->sock != NO_SOCKET)
+			result->coord_handles[result->co_conn_count++] = node_handle;
+	}
+
+	return result;
+}
+
+/*
+ * Free PGXCNodeAllHandles structure.
+ *
+ * Only the pointer arrays and the container itself are released; the
+ * PGXCNodeHandle structs they point at live in the static handle arrays
+ * and are not owned by this structure.  A NULL argument is a no-op.
+ */
+void
+pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles)
+{
+	if (!pgxc_handles)
+		return;
+
+	if (pgxc_handles->primary_handle)
+		pfree(pgxc_handles->primary_handle);
+	if (pgxc_handles->datanode_handles)
+		pfree(pgxc_handles->datanode_handles);
+	if (pgxc_handles->coord_handles)
+		pfree(pgxc_handles->coord_handles);
+
+	pfree(pgxc_handles);
+}
+
+/*
+ * PGXCNodeGetNodeId
+ *		Look at the data cached for handles and return node position
+ * If node type is PGXC_NODE_COORDINATOR look only in coordinator list,
+ * if node type is PGXC_NODE_DATANODE look only in datanode list,
+ * if other (assume PGXC_NODE_NODE) search both, in last case return actual
+ * node type.
+ *
+ * Returns -1 (and sets *node_type to PGXC_NODE_NONE, if provided) when the
+ * Oid is not found in the requested list(s).
+ */
+int
+PGXCNodeGetNodeId(Oid nodeoid, char *node_type)
+{
+	int i;
+
+	/* First check datanodes, they referenced more often */
+	if (node_type == NULL || *node_type != PGXC_NODE_COORDINATOR)
+	{
+		for (i = 0; i < NumDataNodes; i++)
+		{
+			if (dn_handles[i].nodeoid == nodeoid)
+			{
+				if (node_type)
+					*node_type = PGXC_NODE_DATANODE;
+				return i;
+			}
+		}
+	}
+	/* Then check coordinators */
+	if (node_type == NULL || *node_type != PGXC_NODE_DATANODE)
+	{
+		for (i = 0; i < NumCoords; i++)
+		{
+			if (co_handles[i].nodeoid == nodeoid)
+			{
+				if (node_type)
+					*node_type = PGXC_NODE_COORDINATOR;
+				return i;
+			}
+		}
+	}
+	/* Not found, have caller handling it */
+	if (node_type)
+		*node_type = PGXC_NODE_NONE;
+	return -1;
+}
+
+/*
+ * PGXCNodeGetNodeOid
+ *		Look at the data cached for handles and return node Oid
+ *
+ * NOTE(review): nodeid is not range-checked against NumCoords/NumDataNodes;
+ * callers must pass a valid index -- confirm all call sites do.
+ */
+Oid
+PGXCNodeGetNodeOid(int nodeid, char node_type)
+{
+	PGXCNodeHandle *handles;
+
+	switch (node_type)
+	{
+		case PGXC_NODE_COORDINATOR:
+			handles = co_handles;
+			break;
+		case PGXC_NODE_DATANODE:
+			handles = dn_handles;
+			break;
+		default:
+			/* Should not happen */
+			Assert(0);
+			return InvalidOid;
+	}
+
+	return handles[nodeid].nodeoid;
+}
+
+/*
+ * pgxc_node_str
+ *
+ * SQL-callable function returning the name of the local node
+ * (PGXCNodeName) as text.
+ */
+Datum
+pgxc_node_str(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_TEXT_P(cstring_to_text(PGXCNodeName));
+}
+
+/*
+ * PGXCNodeGetNodeIdFromName
+ *		Return node position in handles array
+ *
+ * The name is case-folded before the catalog lookup.  Returns -1 (and sets
+ * *node_type to PGXC_NODE_NONE when provided) for a NULL or unknown name.
+ */
+int
+PGXCNodeGetNodeIdFromName(char *node_name, char *node_type)
+{
+	char *nm;
+	Oid nodeoid;
+
+	if (node_name == NULL)
+	{
+		if (node_type)
+			*node_type = PGXC_NODE_NONE;
+		return -1;
+	}
+
+	/* str_tolower palloc's a lower-cased copy; freed below */
+	nm = str_tolower(node_name, strlen(node_name), DEFAULT_COLLATION_OID);
+
+	nodeoid = get_pgxc_nodeoid(nm);
+	pfree(nm);
+	if (!OidIsValid(nodeoid))
+	{
+		if (node_type)
+			*node_type = PGXC_NODE_NONE;
+		return -1;
+	}
+
+	return PGXCNodeGetNodeId(nodeoid, node_type);
+}
+
+/*
+ * paramlist_delete_param
+ *	Remove (and pfree) every ParamEntry with the given name from the list,
+ *	returning the possibly-modified list head.
+ */
+static List *
+paramlist_delete_param(List *param_list, const char *name)
+{
+	ListCell   *cur_item;
+	ListCell   *prev_item;
+
+	prev_item = NULL;
+	cur_item = list_head(param_list);
+
+	while (cur_item != NULL)
+	{
+		ParamEntry *entry = (ParamEntry *) lfirst(cur_item);
+
+		if (strcmp(NameStr(entry->name), name) == 0)
+		{
+			/* cur_item must be removed */
+			param_list = list_delete_cell(param_list, cur_item, prev_item);
+			pfree(entry);
+			/* Restart from the cell after prev_item (or the new head) */
+			if (prev_item)
+				cur_item = lnext(prev_item);
+			else
+				cur_item = list_head(param_list);
+		}
+		else
+		{
+			prev_item = cur_item;
+			cur_item = lnext(prev_item);
+		}
+	}
+
+	return param_list;
+}
+
+/*
+ * Remember new value of a session or transaction parameter, and set same
+ * values on newly connected remote nodes.
+ *
+ * local = true targets the transaction-scope list (TopTransactionContext);
+ * otherwise the session-scope list (TopMemoryContext).  A NULL value means
+ * RESET: the parameter is only removed.  Any cached SET command string for
+ * the affected scope is invalidated so it is rebuilt on next use.
+ */
+void
+PGXCNodeSetParam(bool local, const char *name, const char *value, int flags)
+{
+	List *param_list;
+	MemoryContext oldcontext;
+
+	/* Get the target hash table and invalidate command string */
+	if (local)
+	{
+		param_list = local_param_list;
+		if (local_params)
+			resetStringInfo(local_params);
+		oldcontext = MemoryContextSwitchTo(TopTransactionContext);
+	}
+	else
+	{
+		param_list = session_param_list;
+		if (session_params)
+			resetStringInfo(session_params);
+		oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+	}
+
+	/* Delete any existing entry first, then re-add with the new value */
+	param_list = paramlist_delete_param(param_list, name);
+	if (value)
+	{
+		ParamEntry *entry;
+		entry = (ParamEntry *) palloc(sizeof (ParamEntry));
+		strlcpy((char *) (&entry->name), name, NAMEDATALEN);
+		strlcpy((char *) (&entry->value), value, NAMEDATALEN);
+		entry->flags = flags;
+
+		param_list = lappend(param_list, entry);
+	}
+
+	/*
+	 * Special case for
+	 *	RESET SESSION AUTHORIZATION
+	 *	SET SESSION AUTHORIZATION TO DEFAULT
+	 *
+	 * We must also forget any SET ROLE commands since RESET SESSION
+	 * AUTHORIZATION also resets current role to session default
+	 */
+	if ((strcmp(name, "session_authorization") == 0) && (value == NULL))
+		param_list = paramlist_delete_param(param_list, "role");
+
+	if (local)
+		local_param_list = param_list;
+	else
+		session_param_list = param_list;
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
+
+/*
+ * Forget all parameter values set either for transaction or both transaction
+ * and session.
+ *
+ * only_local = true keeps the session-level parameters; the transaction-level
+ * ones are always dropped.
+ */
+void
+PGXCNodeResetParams(bool only_local)
+{
+	if (!only_local && session_param_list)
+	{
+		/* need to explicitly pfree session stuff, it is in TopMemoryContext */
+		list_free_deep(session_param_list);
+		session_param_list = NIL;
+		if (session_params)
+		{
+			pfree(session_params->data);
+			pfree(session_params);
+			session_params = NULL;
+		}
+	}
+	/*
+	 * no need to explicitly destroy the local_param_list and local_params,
+	 * it will gone with the transaction memory context.
+	 */
+	local_param_list = NIL;
+	local_params = NULL;
+}
+
+/*
+ * get_set_command
+ *	Append one "SET [LOCAL] name TO value;" statement per ParamEntry in
+ *	param_list to the command buffer.  Values are quoted per entry flags;
+ *	an empty value is emitted as '' so the statement stays parseable.
+ */
+static void
+get_set_command(List *param_list, StringInfo command, bool local)
+{
+	ListCell		   *lc;
+
+	if (param_list == NIL)
+		return;
+
+	foreach (lc, param_list)
+	{
+		ParamEntry *entry = (ParamEntry *) lfirst(lc);
+		char *value = NameStr(entry->value);
+
+		if (strlen(value) == 0)
+			value = "''";
+
+		value = quote_guc_value(value, entry->flags);
+
+		/* When not local the %s is empty, leaving a harmless double space */
+		appendStringInfo(command, "SET %s %s TO %s;", local ? "LOCAL" : "",
+			 NameStr(entry->name), value);
+	}
+}
+
+
+/*
+ * Returns SET commands needed to initialize remote session.
+ * The command may already be built and valid, return it right away if that is
+ * the case. Otherwise build it up.
+ * To support Distributed Session machinery coordinator should generate and
+ * send a distributed session identifier to remote nodes. Generate it here.
+ */
+char *
+PGXCNodeGetSessionParamStr(void)
+{
+	/*
+	 * If no session parameters are set and that is a coordinator we need to set
+	 * global_session anyway, even if there were no other parameters.
+	 * We do not want this string to disappear, so create it in the
+	 * TopMemoryContext. However if we add first session parameter we will need
+	 * to free the buffer and recreate it in the same context as the hash table
+	 * to avoid memory leakage.
+	 */
+	if (session_params == NULL)
+	{
+		MemoryContext oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+		session_params = makeStringInfo();
+		MemoryContextSwitchTo(oldcontext);
+	}
+
+	/* If the paramstr invalid build it up */
+	if (session_params->len == 0)
+	{
+		/* Distributed session id: local node name + backend PID */
+		if (IS_PGXC_COORDINATOR)
+			appendStringInfo(session_params, "SET global_session TO %s_%d;",
+							 PGXCNodeName, MyProcPid);
+		get_set_command(session_param_list, session_params, false);
+		appendStringInfo(session_params, "SET parentPGXCPid TO %d;",
+							 MyProcPid);
+	}
+	return session_params->len == 0 ? NULL : session_params->data;
+}
+
+
+/*
+ * Returns SET commands needed to initialize transaction on a remote session.
+ * The command may already be built and valid, return it right away if that is
+ * the case. Otherwise build it up.
+ */
+char *
+PGXCNodeGetTransactionParamStr(void)
+{
+	/* If no local parameters defined there is nothing to return */
+	if (local_param_list == NIL)
+		return NULL;
+
+	/*
+	 * If the paramstr invalid build it up.
+	 */
+	if (local_params == NULL)
+	{
+		/* Buffer lives only as long as the transaction */
+		MemoryContext oldcontext = MemoryContextSwitchTo(TopTransactionContext);
+		local_params = makeStringInfo();
+		MemoryContextSwitchTo(oldcontext);
+	}
+	/*
+	 * If parameter string exists it is valid, it is truncated when parameters
+	 * are modified.
+	 */
+	if (local_params->len == 0)
+	{
+		get_set_command(local_param_list, local_params, true);
+	}
+	return local_params->len == 0 ? NULL : local_params->data;
+}
+
+
+/*
+ * Send down specified query, read and discard all responses until ReadyForQuery
+ */
+void
+pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query)
+{
+	pgxc_node_send_query(handle, set_query);
+	/*
+	 * Now read responses until ReadyForQuery.
+	 * XXX We may need to handle possible errors here.
+	 */
+	for (;;)
+	{
+		char	msgtype;
+		int		msglen;
+		char   *msg;
+		/*
+		 * If we are in the process of shutting down, we
+		 * may be rolling back, and the buffer may contain other messages.
+		 * We want to avoid a procarray exception
+		 * as well as an error stack overflow.
+		 */
+		if (proc_exit_inprogress)
+			PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
+
+		/* don't read from the connection if there is a fatal error */
+		if (handle->state == DN_CONNECTION_STATE_ERROR_FATAL)
+			break;
+
+		/* No data available, read more */
+		if (!HAS_MESSAGE_BUFFERED(handle))
+		{
+			pgxc_node_receive(1, &handle, NULL);
+			continue;
+		}
+		msgtype = get_message(handle, &msglen, &msg);
+
+		/*
+		 * Ignore any response except ErrorResponse and ReadyForQuery
+		 */
+
+		if (msgtype == 'E')	/* ErrorResponse */
+		{
+			handle->error = pstrdup(msg);
+			PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
+			break;
+		}
+
+		if (msgtype == 'Z') /* ReadyForQuery */
+		{
+			/* First byte of ReadyForQuery payload is transaction status */
+			handle->transaction_status = msg[0];
+			PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_IDLE);
+			handle->combiner = NULL;
+			break;
+		}
+	}
+}
+
+
+/* Flag that all remote handles must be invalidated at next service point */
+void
+RequestInvalidateRemoteHandles(void)
+{
+	HandlesInvalidatePending = true;
+}
+
+/* Flag that remote handles must be refreshed at next service point */
+void
+RequestRefreshRemoteHandles(void)
+{
+	HandlesRefreshPending = true;
+}
+
+/*
+ * PoolerMessagesPending
+ *	Report whether a pooler-originated message is awaiting service.
+ *
+ * NOTE(review): only HandlesRefreshPending is consulted here;
+ * HandlesInvalidatePending is not -- confirm that is intentional
+ * before folding it in.
+ */
+bool
+PoolerMessagesPending(void)
+{
+	return HandlesRefreshPending;
+}
+
+/*
+ * For all handles, mark as they are not in use and discard pending input/output
+ *
+ * Returns true when at least one handle had an open socket (i.e. something
+ * was actually invalidated).  Finishes by re-running InitMultinodeExecutor
+ * to rebuild the handle arrays.
+ */
+static bool
+DoInvalidateRemoteHandles(void)
+{
+	int 			i;
+	PGXCNodeHandle	*handle;
+	bool			result = false;
+
+	HandlesInvalidatePending = false;
+	HandlesRefreshPending = false;
+
+	for (i = 0; i < NumCoords; i++)
+	{
+		handle = &co_handles[i];
+		if (handle->sock != NO_SOCKET)
+			result = true;
+		/* Drop the socket and reset both I/O buffers */
+		handle->sock = NO_SOCKET;
+		handle->inStart = handle->inEnd = handle->inCursor = 0;
+		handle->outEnd = 0;
+	}
+	for (i = 0; i < NumDataNodes; i++)
+	{
+		handle = &dn_handles[i];
+		if (handle->sock != NO_SOCKET)
+			result = true;
+		handle->sock = NO_SOCKET;
+		handle->inStart = handle->inEnd = handle->inCursor = 0;
+		handle->outEnd = 0;
+	}
+
+	InitMultinodeExecutor(true);
+
+	return result;
+}
+
+/*
+ * Diff handles using shmem, and remove ALTERed handles
+ *
+ * Compares the backend's cached node handles against the shared-memory node
+ * table.  ALTERed nodes get their handles refreshed in place; added or
+ * deleted nodes require a full reload (return false).
+ *
+ * NOTE(review): when nodes were added/deleted (res = false) but no node was
+ * altered, the "altered == NIL" branch resets res back to true, losing the
+ * reload-needed signal -- confirm whether that is intentional.
+ */
+static bool
+DoRefreshRemoteHandles(void)
+{
+	List			*altered = NIL, *deleted = NIL, *added = NIL;
+	Oid				*coOids, *dnOids;
+	int				numCoords, numDNodes, total_nodes;
+	bool			res = true;
+
+	HandlesRefreshPending = false;
+
+	PgxcNodeGetOids(&coOids, &dnOids, &numCoords, &numDNodes, false);
+
+	total_nodes = numCoords + numDNodes;
+	if (total_nodes > 0)
+	{
+		int		i;
+		List   *shmoids = NIL;
+		Oid	   *allOids = (Oid *)palloc(total_nodes * sizeof(Oid));
+
+		/* build array with Oids of all nodes (coordinators first) */
+		memcpy(allOids, coOids, numCoords * sizeof(Oid));
+		memcpy(allOids + numCoords, dnOids, numDNodes * sizeof(Oid));
+
+		LWLockAcquire(NodeTableLock, LW_SHARED);
+
+		for (i = 0; i < total_nodes; i++)
+		{
+			NodeDefinition	*nodeDef;
+			PGXCNodeHandle	*handle;
+
+			int nid;
+			Oid nodeoid;
+			char ntype = PGXC_NODE_NONE;
+
+			nodeoid = allOids[i];
+			shmoids = lappend_oid(shmoids, nodeoid);
+
+			nodeDef = PgxcNodeGetDefinition(nodeoid);
+			/*
+			 * identify an entry with this nodeoid. If found
+			 * compare the name/host/port entries. If the name is
+			 * same and other info is different, it's an ALTER.
+			 * If the local entry does not exist in the shmem, it's
+			 * a DELETE. If the entry from shmem does not exist
+			 * locally, it's an ADDITION
+			 */
+			nid = PGXCNodeGetNodeId(nodeoid, &ntype);
+
+			if (nid == -1)
+			{
+				/* a new node has been added to the shmem */
+				added = lappend_oid(added, nodeoid);
+				elog(LOG, "Node added: name (%s) host (%s) port (%d)",
+					 NameStr(nodeDef->nodename), NameStr(nodeDef->nodehost),
+					 nodeDef->nodeport);
+			}
+			else
+			{
+				if (ntype == PGXC_NODE_COORDINATOR)
+					handle = &co_handles[nid];
+				else if (ntype == PGXC_NODE_DATANODE)
+					handle = &dn_handles[nid];
+				else
+					elog(ERROR, "Node with non-existent node type!");
+
+				/*
+				 * compare name, host, port to see if this node
+				 * has been ALTERed
+				 */
+				if (strncmp(handle->nodename, NameStr(nodeDef->nodename), NAMEDATALEN) != 0 ||
+					strncmp(handle->nodehost, NameStr(nodeDef->nodehost), NAMEDATALEN) != 0 ||
+					handle->nodeport != nodeDef->nodeport)
+				{
+					elog(LOG, "Node altered: old name (%s) old host (%s) old port (%d)"
+							" new name (%s) new host (%s) new port (%d)",
+							handle->nodename, handle->nodehost, handle->nodeport,
+							NameStr(nodeDef->nodename), NameStr(nodeDef->nodehost),
+							nodeDef->nodeport);
+					altered = lappend_oid(altered, nodeoid);
+				}
+				/* else do nothing */
+			}
+			pfree(nodeDef);
+		}
+
+		/*
+		 * Any entry in backend area but not in shmem means that it has
+		 * been deleted
+		 */
+		for (i = 0; i < NumCoords; i++)
+		{
+			PGXCNodeHandle	*handle = &co_handles[i];
+			Oid nodeoid = handle->nodeoid;
+
+			if (!list_member_oid(shmoids, nodeoid))
+			{
+				deleted = lappend_oid(deleted, nodeoid);
+				elog(LOG, "Node deleted: name (%s) host (%s) port (%d)",
+					 handle->nodename, handle->nodehost, handle->nodeport);
+			}
+		}
+
+		for (i = 0; i < NumDataNodes; i++)
+		{
+			PGXCNodeHandle	*handle = &dn_handles[i];
+			Oid nodeoid = handle->nodeoid;
+
+			if (!list_member_oid(shmoids, nodeoid))
+			{
+				deleted = lappend_oid(deleted, nodeoid);
+				elog(LOG, "Node deleted: name (%s) host (%s) port (%d)",
+					 handle->nodename, handle->nodehost, handle->nodeport);
+			}
+		}
+
+		LWLockRelease(NodeTableLock);
+
+		/* Release palloc'ed memory */
+		pfree(coOids);
+		pfree(dnOids);
+		pfree(allOids);
+		list_free(shmoids);
+	}
+
+	if (deleted != NIL || added != NIL)
+	{
+		elog(LOG, "Nodes added/deleted. Reload needed!");
+		res = false;
+	}
+
+	if (altered == NIL)
+	{
+		elog(LOG, "No nodes altered. Returning");
+		res = true;
+	}
+	else
+		PgxcNodeRefreshBackendHandlesShmem(altered);
+
+	list_free(altered);
+	list_free(added);
+	list_free(deleted);
+
+	return res;
+}
+
+/*
+ * PGXCNodeSetConnectionState
+ *	Single point for changing a handle's connection state, with DEBUG5
+ *	tracing of the transition.
+ */
+void
+PGXCNodeSetConnectionState(PGXCNodeHandle *handle, DNConnectionState new_state)
+{
+	elog(DEBUG5, "Changing connection state for node %s, old state %d, "
+			"new state %d", handle->nodename, handle->state, new_state);
+	handle->state = new_state;
+}
+
+/*
+ * Do a "Diff" of backend NODE metadata and the one present in catalog
+ *
+ * We do this in order to identify if we should do a destructive
+ * cleanup or just invalidation of some specific handles
+ *
+ * Scans pgxc_node under NodeTableLock and classifies each node as altered,
+ * added or deleted relative to the cached handles.  The three out-lists (if
+ * requested) receive the corresponding node Oids.  Returns false when the
+ * local node itself was altered, i.e. a full reload is required.
+ */
+bool
+PgxcNodeDiffBackendHandles(List **nodes_alter,
+			   List **nodes_delete, List **nodes_add)
+{
+	Relation rel;
+	HeapScanDesc scan;
+	HeapTuple   tuple;
+	int	i;
+	List *altered = NIL, *added = NIL, *deleted = NIL;
+	List *catoids = NIL;
+	PGXCNodeHandle *handle;
+	Oid	nodeoid;
+	bool res = true;
+
+	LWLockAcquire(NodeTableLock, LW_SHARED);
+
+	rel = heap_open(PgxcNodeRelationId, AccessShareLock);
+	/* SnapshotSelf so our own uncommitted node DDL is visible in the scan */
+	scan = heap_beginscan(rel, SnapshotSelf, 0, NULL);
+	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+	{
+		Form_pgxc_node  nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+		int nid;
+		Oid nodeoid;
+		char ntype = PGXC_NODE_NONE;
+
+		nodeoid = HeapTupleGetOid(tuple);
+		catoids = lappend_oid(catoids, nodeoid);
+
+		/*
+		 * identify an entry with this nodeoid. If found
+		 * compare the name/host/port entries. If the name is
+		 * same and other info is different, it's an ALTER.
+		 * If the local entry does not exist in the catalog, it's
+		 * a DELETE. If the entry from catalog does not exist
+		 * locally, it's an ADDITION
+		 */
+		nid = PGXCNodeGetNodeId(nodeoid, &ntype);
+
+		if (nid == -1)
+		{
+			/* a new node has been added to the catalog */
+			added = lappend_oid(added, nodeoid);
+			elog(LOG, "Node added: name (%s) host (%s) port (%d)",
+				 NameStr(nodeForm->node_name), NameStr(nodeForm->node_host),
+				 nodeForm->node_port);
+		}
+		else
+		{
+			if (ntype == PGXC_NODE_COORDINATOR)
+				handle = &co_handles[nid];
+			else if (ntype == PGXC_NODE_DATANODE)
+				handle = &dn_handles[nid];
+			else
+				elog(ERROR, "Node with non-existent node type!");
+
+			/*
+			 * compare name, host, port to see if this node
+			 * has been ALTERed
+			 */
+			if (strncmp(handle->nodename, NameStr(nodeForm->node_name), NAMEDATALEN)
+				!= 0 ||
+				strncmp(handle->nodehost, NameStr(nodeForm->node_host), NAMEDATALEN)
+				!= 0 ||
+				handle->nodeport != nodeForm->node_port)
+			{
+				elog(LOG, "Node altered: old name (%s) old host (%s) old port (%d)"
+						" new name (%s) new host (%s) new port (%d)",
+					 handle->nodename, handle->nodehost, handle->nodeport,
+					 NameStr(nodeForm->node_name), NameStr(nodeForm->node_host),
+					 nodeForm->node_port);
+				/*
+				 * If this node itself is being altered, then we need to
+				 * resort to a reload. Check so..
+				 */
+				if (pg_strcasecmp(PGXCNodeName,
+								  NameStr(nodeForm->node_name)) == 0)
+				{
+					res = false;
+				}
+				altered = lappend_oid(altered, nodeoid);
+			}
+			/* else do nothing */
+		}
+	}
+	heap_endscan(scan);
+
+	/*
+	 * Any entry in backend area but not in catalog means that it has
+	 * been deleted
+	 */
+	for (i = 0; i < NumCoords; i++)
+	{
+		handle = &co_handles[i];
+		nodeoid = handle->nodeoid;
+		if (!list_member_oid(catoids, nodeoid))
+		{
+			deleted = lappend_oid(deleted, nodeoid);
+			elog(LOG, "Node deleted: name (%s) host (%s) port (%d)",
+				 handle->nodename, handle->nodehost, handle->nodeport);
+		}
+	}
+	for (i = 0; i < NumDataNodes; i++)
+	{
+		handle = &dn_handles[i];
+		nodeoid = handle->nodeoid;
+		if (!list_member_oid(catoids, nodeoid))
+		{
+			deleted = lappend_oid(deleted, nodeoid);
+			elog(LOG, "Node deleted: name (%s) host (%s) port (%d)",
+				 handle->nodename, handle->nodehost, handle->nodeport);
+		}
+	}
+	heap_close(rel, AccessShareLock);
+	LWLockRelease(NodeTableLock);
+
+	/* Hand the classification lists back to the caller, if requested */
+	if (nodes_alter)
+		*nodes_alter = altered;
+	if (nodes_delete)
+		*nodes_delete = deleted;
+	if (nodes_add)
+		*nodes_add = added;
+
+	if (catoids)
+		list_free(catoids);
+
+	return res;
+}
+
+/*
+ * Refresh specific backend handles associated with
+ * nodes in the "nodes_alter" list below
+ *
+ * The handles are refreshed using shared memory
+ *
+ * Each affected handle is freed and its name/host/port replaced by the
+ * current NodeDefinition from shmem.  Raises ERROR if a listed Oid can no
+ * longer be resolved (metadata changed concurrently).
+ */
+void
+PgxcNodeRefreshBackendHandlesShmem(List *nodes_alter)
+{
+	ListCell *lc;
+	Oid nodeoid;
+	int nid;
+	PGXCNodeHandle *handle = NULL;
+
+	foreach(lc, nodes_alter)
+	{
+		char ntype = PGXC_NODE_NONE;
+		NodeDefinition *nodedef;
+
+		nodeoid = lfirst_oid(lc);
+		nid = PGXCNodeGetNodeId(nodeoid, &ntype);
+
+		if (nid == -1)
+			elog(ERROR, "Looks like node metadata changed again");
+		else
+		{
+			if (ntype == PGXC_NODE_COORDINATOR)
+				handle = &co_handles[nid];
+			else if (ntype == PGXC_NODE_DATANODE)
+				handle = &dn_handles[nid];
+			else
+				elog(ERROR, "Node with non-existent node type!");
+		}
+
+		/*
+		 * Update the local backend handle data with data from catalog
+		 * Free the handle first..
+		 */
+		pgxc_node_free(handle);
+		elog(LOG, "Backend (%u), Node (%s) updated locally",
+			 MyBackendId, handle->nodename);
+		nodedef = PgxcNodeGetDefinition(nodeoid);
+		strncpy(handle->nodename, NameStr(nodedef->nodename), NAMEDATALEN);
+		strncpy(handle->nodehost, NameStr(nodedef->nodehost), NAMEDATALEN);
+		handle->nodeport = nodedef->nodeport;
+		pfree(nodedef);
+	}
+	return;
+}
+
+/*
+ * HandlePoolerMessages
+ *	Service any pending pooler requests; currently only the handle-refresh
+ *	flag set by RequestRefreshRemoteHandles().
+ */
+void
+HandlePoolerMessages(void)
+{
+	if (HandlesRefreshPending)
+	{
+		DoRefreshRemoteHandles();
+
+		elog(LOG, "Backend (%u), doing handles refresh",
+			 MyBackendId);
+	}
+	return;
+}
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * poolmgr.c
+ *
+ * Connection pool manager handles connections to Datanodes
+ *
+ * The pooler runs as a separate process and is forked off from a
+ * Coordinator postmaster. If the Coordinator needs a connection from a
+ * Datanode, it asks for one from the pooler, which maintains separate
+ * pools for each Datanode. A group of connections can be requested in
+ * a single request, and the pooler returns a list of file descriptors
+ * to use for the connections.
+ *
+ * Note the current implementation does not yet shrink the pool over time
+ * as connections are idle. Also, it does not queue requests; if a
+ * connection is unavailable, it will simply fail. This should be implemented
+ * one day, although there is a chance for deadlocks. For now, limiting
+ * connections should be done between the application and Coordinator.
+ * Still, this is useful to avoid having to re-establish connections to the
+ * Datanodes all the time for multiple Coordinator backend sessions.
+ *
+ * The term "agent" here refers to a session manager, one for each backend
+ * Coordinator connection to the pooler. It will contain a list of connections
+ * allocated to a session, at most one per Datanode.
+ *
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <poll.h>
+#include <math.h>
+
+#include "postgres.h"
+
+#include "access/xact.h"
+#include "catalog/pgxc_node.h"
+#include "commands/dbcommands.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "nodes/nodes.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/lsyscache.h"
+#include "utils/resowner.h"
+#include "lib/stringinfo.h"
+#include "libpq/pqformat.h"
+#include "pgxc/locator.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/pause.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/poolmgr.h"
+#include "pgxc/poolutils.h"
+#include "postmaster/postmaster.h" /* For UnixSocketDir */
+#include "storage/procarray.h"
+#include "utils/varlena.h"
+
+#include "../interfaces/libpq/libpq-fe.h"
+#include "../interfaces/libpq/libpq-int.h"
+
+
+/* Configuration options */
+int PoolConnKeepAlive = 600;
+int PoolMaintenanceTimeout = 30;
+int MaxPoolSize = 100;
+int PoolerPort = 6667;
+
+bool PersistentConnections = false;
+
+/* Flag to tell if we are Postgres-XC pooler process */
+static bool am_pgxc_pooler = false;
+
+/* Connection information cached */
+typedef struct
+{
+ Oid nodeoid;
+ char *host;
+ int port;
+} PGXCNodeConnectionInfo;
+
+/* Handle to the pool manager (Session's side) */
+typedef struct
+{
+ /* communication channel */
+ PoolPort port;
+} PoolHandle;
+
+/* The root memory context */
+static MemoryContext PoolerMemoryContext = NULL;
+/*
+ * Allocations of core objects: Datanode connections, upper level structures,
+ * connection strings, etc.
+ */
+static MemoryContext PoolerCoreContext = NULL;
+/*
+ * Memory to store Agents
+ */
+static MemoryContext PoolerAgentContext = NULL;
+
+/* Pool to all the databases (linked list) */
+static DatabasePool *databasePools = NULL;
+
+/* PoolAgents and the poll array*/
+static int agentCount = 0;
+static PoolAgent **poolAgents;
+
+static PoolHandle *poolHandle = NULL;
+
+static int is_pool_locked = false;
+static int server_fd = -1;
+
+static int node_info_check(PoolAgent *agent);
+static void agent_init(PoolAgent *agent, const char *database, const char *user_name,
+ const char *pgoptions);
+static void agent_destroy(PoolAgent *agent);
+static void agent_create(void);
+static void agent_handle_input(PoolAgent *agent, StringInfo s);
+static DatabasePool *create_database_pool(const char *database, const char *user_name, const char *pgoptions);
+static void insert_database_pool(DatabasePool *pool);
+static int destroy_database_pool(const char *database, const char *user_name);
+static void reload_database_pools(PoolAgent *agent);
+static int refresh_database_pools(PoolAgent *agent);
+static bool remove_all_agent_references(Oid nodeoid);
+static DatabasePool *find_database_pool(const char *database, const char *user_name, const char *pgoptions);
+static DatabasePool *remove_database_pool(const char *database, const char *user_name);
+static int *agent_acquire_connections(PoolAgent *agent, List *datanodelist,
+ List *coordlist, int **connectionpids);
+static int cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist);
+static PGXCNodePoolSlot *acquire_connection(DatabasePool *dbPool, Oid node);
+static void agent_release_connections(PoolAgent *agent, bool force_destroy);
+static void release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
+ Oid node, bool force_destroy);
+static void destroy_slot(PGXCNodePoolSlot *slot);
+static PGXCNodePool *grow_pool(DatabasePool *dbPool, Oid node);
+static void destroy_node_pool(PGXCNodePool *node_pool);
+static void PoolerLoop(void);
+static int clean_connection(List *node_discard,
+ const char *database,
+ const char *user_name);
+static int *abort_pids(int *count,
+ int pid,
+ const char *database,
+ const char *user_name);
+static char *build_node_conn_str(Oid node, DatabasePool *dbPool);
+/* Signal handlers */
+static void pooler_die(SIGNAL_ARGS);
+static void pooler_quickdie(SIGNAL_ARGS);
+static void PoolManagerConnect(const char *database, const char *user_name,
+ const char *pgoptions);
+static void pooler_sighup(SIGNAL_ARGS);
+static bool shrink_pool(DatabasePool *pool);
+static void pools_maintenance(void);
+static void TryPingUnhealthyNode(Oid nodeoid);
+
+/*
+ * Flags set by interrupt handlers for later service in the main loop.
+ */
+static volatile sig_atomic_t got_SIGHUP = false;
+static volatile sig_atomic_t shutdown_requested = false;
+
+/* Mark the current process as the Postgres-XC pooler process */
+void
+PGXCPoolerProcessIam(void)
+{
+	am_pgxc_pooler = true;
+}
+
+/* Return true if the current process is the Postgres-XC pooler process */
+bool
+IsPGXCPoolerProcess(void)
+{
+	return am_pgxc_pooler;
+}
+
+/*
+ * Initialize internal structures
+ *
+ * Sets up the pooler's memory contexts and signal handlers, allocates the
+ * agent array (one slot per possible backend), then enters PoolerLoop().
+ * Does not return in normal operation; the return 0 only satisfies the
+ * signature.
+ */
+int
+PoolManagerInit()
+{
+	elog(DEBUG1, "Pooler process is started: %d", getpid());
+
+	/*
+	 * Set up memory contexts for the pooler objects
+	 */
+	PoolerMemoryContext = AllocSetContextCreate(TopMemoryContext,
+												"PoolerMemoryContext",
+												ALLOCSET_DEFAULT_MINSIZE,
+												ALLOCSET_DEFAULT_INITSIZE,
+												ALLOCSET_DEFAULT_MAXSIZE);
+	PoolerCoreContext = AllocSetContextCreate(PoolerMemoryContext,
+											  "PoolerCoreContext",
+											  ALLOCSET_DEFAULT_MINSIZE,
+											  ALLOCSET_DEFAULT_INITSIZE,
+											  ALLOCSET_DEFAULT_MAXSIZE);
+	PoolerAgentContext = AllocSetContextCreate(PoolerMemoryContext,
+											   "PoolerAgentContext",
+											   ALLOCSET_DEFAULT_MINSIZE,
+											   ALLOCSET_DEFAULT_INITSIZE,
+											   ALLOCSET_DEFAULT_MAXSIZE);
+
+	ForgetLockFiles();
+
+	/*
+	 * Properly accept or ignore signals the postmaster might send us
+	 */
+	pqsignal(SIGINT, pooler_die);
+	pqsignal(SIGTERM, pooler_die);
+	pqsignal(SIGQUIT, pooler_quickdie);
+	pqsignal(SIGHUP, pooler_sighup);
+	/* TODO other signal handlers */
+
+	/* We allow SIGQUIT (quickdie) at all times */
+	sigdelset(&BlockSig, SIGQUIT);
+
+	/*
+	 * Unblock signals (they were blocked when the postmaster forked us)
+	 */
+	PG_SETMASK(&UnBlockSig);
+
+	/* Allocate pooler structures in the Pooler context */
+	MemoryContextSwitchTo(PoolerMemoryContext);
+
+	/* One agent slot per possible backend connection */
+	poolAgents = (PoolAgent **) palloc(MaxConnections * sizeof(PoolAgent *));
+	if (poolAgents == NULL)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory while initializing pool agents")));
+	}
+
+	PoolerLoop();
+	return 0;
+}
+
+
+/*
+ * Check connection info consistency with system catalogs
+ *
+ * Returns POOL_CHECK_SUCCESS when the agent's cached node Oids match the
+ * shared-memory node table AND every pooled connection string still matches
+ * the node definition it was built from; POOL_CHECK_FAILED otherwise.
+ */
+static int
+node_info_check(PoolAgent *agent)
+{
+	DatabasePool   *dbPool = databasePools;
+	List 		   *checked = NIL;
+	int 			res = POOL_CHECK_SUCCESS;
+	Oid			   *coOids;
+	Oid			   *dnOids;
+	int				numCo;
+	int				numDn;
+
+	/*
+	 * First check if agent's node information matches to current content of the
+	 * shared memory table.
+	 */
+	PgxcNodeGetOids(&coOids, &dnOids, &numCo, &numDn, false);
+
+	if (agent->num_coord_connections != numCo ||
+			agent->num_dn_connections != numDn ||
+			memcmp(agent->coord_conn_oids, coOids, numCo * sizeof(Oid)) ||
+			memcmp(agent->dn_conn_oids, dnOids, numDn * sizeof(Oid)))
+		res = POOL_CHECK_FAILED;
+
+	/* Release palloc'ed memory */
+	pfree(coOids);
+	pfree(dnOids);
+
+	/*
+	 * Iterate over all dbnode pools and check if connection strings
+	 * are matching node definitions.
+	 */
+	while (res == POOL_CHECK_SUCCESS && dbPool)
+	{
+		HASH_SEQ_STATUS hseq_status;
+		PGXCNodePool   *nodePool;
+
+		hash_seq_init(&hseq_status, dbPool->nodePools);
+		while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)))
+		{
+			char		   *connstr_chk;
+
+			/* No need to check same Datanode twice */
+			if (list_member_oid(checked, nodePool->nodeoid))
+				continue;
+			checked = lappend_oid(checked, nodePool->nodeoid);
+
+			connstr_chk = build_node_conn_str(nodePool->nodeoid, dbPool);
+			if (connstr_chk == NULL)
+			{
+				/* Problem of constructing connection string */
+				hash_seq_term(&hseq_status);
+				res = POOL_CHECK_FAILED;
+				break;
+			}
+			/* return error if there is difference */
+			if (strcmp(connstr_chk, nodePool->connstr))
+			{
+				pfree(connstr_chk);
+				hash_seq_term(&hseq_status);
+				res = POOL_CHECK_FAILED;
+				break;
+			}
+
+			pfree(connstr_chk);
+		}
+		dbPool = dbPool->next;
+	}
+	list_free(checked);
+	return res;
+}
+
+/*
+ * Destroy internal structures
+ *
+ * Deleting PoolerMemoryContext releases every pooler allocation (core
+ * objects and agents are child contexts).  Always returns 0.
+ */
+int
+PoolManagerDestroy(void)
+{
+	int			status = 0;
+
+	if (PoolerMemoryContext)
+	{
+		MemoryContextDelete(PoolerMemoryContext);
+		PoolerMemoryContext = NULL;
+	}
+
+	return status;
+}
+
+/*
+ * Connect to the pooler process
+ *
+ * Tries each directory in unix_socket_directories until a connection to the
+ * pooler succeeds, then allocates the session's PoolHandle.  No-op when a
+ * handle already exists.  The handle is malloc'ed (not palloc'ed) because it
+ * must survive for the whole session lifetime.
+ */
+static void
+GetPoolManagerHandle(void)
+{
+	PoolHandle *handle;
+	int			fdsock = -1;
+
+	if (poolHandle)
+		/* already connected */
+		return;
+
+#ifdef HAVE_UNIX_SOCKETS
+	if (Unix_socket_directories)
+	{
+		char	   *rawstring;
+		List	   *elemlist;
+		ListCell   *l;
+		int			success = 0;
+
+		/* Need a modifiable copy of Unix_socket_directories */
+		rawstring = pstrdup(Unix_socket_directories);
+
+		/* Parse string into list of directories */
+		if (!SplitDirectoriesString(rawstring, ',', &elemlist))
+		{
+			/* syntax error in list */
+			ereport(FATAL,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("invalid list syntax in parameter \"%s\"",
+							"unix_socket_directories")));
+		}
+
+		foreach(l, elemlist)
+		{
+			char	   *socketdir = (char *) lfirst(l);
+			int			saved_errno;
+
+			/* Connect to the pooler */
+			fdsock = pool_connect(PoolerPort, socketdir);
+			if (fdsock < 0)
+			{
+				saved_errno = errno;
+				ereport(WARNING,
+						(errmsg("could not create Unix-domain socket in directory \"%s\", errno: %d",
+								socketdir, saved_errno)));
+			}
+			else
+			{
+				/* Stop at the first directory that works */
+				success++;
+				break;
+			}
+		}
+
+		if (!success && elemlist != NIL)
+			ereport(ERROR,
+					(errmsg("failed to connect to pool manager: %m")));
+
+		list_free_deep(elemlist);
+		pfree(rawstring);
+	}
+#endif
+
+	/*
+	 * Actual connection errors should be reported by the block above,
+	 * but perhaps we haven't actually executed it - either because
+	 * the Unix_socket_directories is not set, or because there's no
+	 * support for UNIX_SOCKETS. Just bail out in that case.
+	 */
+	if (fdsock < 0)
+		ereport(ERROR,
+				(errmsg("failed to connect to pool manager: %m")));
+
+	/*
+	 * Allocate handle
+	 *
+	 * XXX we may change malloc here to palloc but first ensure
+	 * the CurrentMemoryContext is properly set.
+	 * The handle allocated just before new session is forked off and
+	 * inherited by the session process. It should remain valid for all
+	 * the session lifetime.
+	 */
+	handle = (PoolHandle *) malloc(sizeof(PoolHandle));
+	if (!handle)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+
+	handle->port.fdsock = fdsock;
+	handle->port.RecvLength = 0;
+	handle->port.RecvPointer = 0;
+	handle->port.SendPointer = 0;
+
+	poolHandle = handle;
+}
+
+/*
+ * Create agent
+ *
+ * Accepts a pending connection on the pooler server socket and registers
+ * a new PoolAgent for it in the poolAgents[] array.  On accept() failure
+ * the error is logged and no agent is created.
+ */
+static void
+agent_create(void)
+{
+	MemoryContext oldcontext;
+	int			new_fd;
+	PoolAgent  *agent;
+
+	new_fd = accept(server_fd, NULL, NULL);
+	if (new_fd < 0)
+	{
+		int			saved_errno = errno;
+
+		ereport(LOG,
+				(errcode(ERRCODE_CONNECTION_FAILURE),
+				 errmsg("pool manager failed to accept connection: %m")));
+		errno = saved_errno;
+		return;
+	}
+
+	/* Agent structures persist in the dedicated agent context */
+	oldcontext = MemoryContextSwitchTo(PoolerAgentContext);
+
+	/* Allocate agent */
+	agent = (PoolAgent *) palloc(sizeof(PoolAgent));
+	if (!agent)
+	{
+		/*
+		 * NOTE(review): palloc ereports on OOM and never returns NULL,
+		 * so this branch (including the close) is effectively dead code;
+		 * on a real OOM new_fd would leak via the ereport longjmp.
+		 */
+		close(new_fd);
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+		return;
+	}
+
+	/* Wrap the accepted socket in the agent's communication port */
+	agent->port.fdsock = new_fd;
+	agent->port.RecvLength = 0;
+	agent->port.RecvPointer = 0;
+	agent->port.SendPointer = 0;
+	agent->pool = NULL;
+	/* Per-agent memory context; deleted wholesale in agent_destroy() */
+	agent->mcxt = AllocSetContextCreate(CurrentMemoryContext,
+										"Agent",
+										ALLOCSET_DEFAULT_MINSIZE,
+										ALLOCSET_DEFAULT_INITSIZE,
+										ALLOCSET_DEFAULT_MAXSIZE);
+	agent->num_dn_connections = 0;
+	agent->num_coord_connections = 0;
+	agent->dn_conn_oids = NULL;
+	agent->coord_conn_oids = NULL;
+	agent->dn_connections = NULL;
+	agent->coord_connections = NULL;
+	agent->pid = 0;
+
+	/* Append new agent to the list */
+	poolAgents[agentCount++] = agent;
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
+
+/*
+ * session_options
+ * Returns the pgoptions string generated using a particular
+ * list of parameters that are required to be propagated to Datanodes.
+ * These parameters then become default values for the pooler sessions.
+ * For e.g., a psql user sets PGDATESTYLE. This value should be set
+ * as the default connection parameter in the pooler session that is
+ * connected to the Datanodes. There are various parameters which need to
+ * be analysed individually to determine whether these should be set on
+ * Datanodes.
+ *
+ * Note: These parameters values are the default values of the particular
+ * Coordinator backend session, and not the new values set by SET command.
+ *
+ * The returned string is palloc'ed in the current memory context.
+ */
+char *session_options(void)
+{
+	int			i;
+	char	   *pgoptions[] = {"DateStyle", "timezone", "geqo", "intervalstyle", "lc_monetary"};
+	StringInfoData options;
+	List	   *value_list;
+	ListCell   *l;
+
+	initStringInfo(&options);
+
+	for (i = 0; i < sizeof(pgoptions)/sizeof(char*); i++)
+	{
+		const char *value;
+		char	   *value_copy;
+
+		appendStringInfo(&options, " -c %s=", pgoptions[i]);
+
+		value = GetConfigOptionResetString(pgoptions[i]);
+
+		/* lc_monetary does not accept lower case values */
+		if (strcmp(pgoptions[i], "lc_monetary") == 0)
+		{
+			appendStringInfoString(&options, value);
+			continue;
+		}
+
+		/*
+		 * SplitIdentifierString scribbles on its input, so hand it a
+		 * palloc'ed copy that we free afterwards.  The previous code
+		 * strdup'ed the value and leaked the malloc'ed copy (and the
+		 * split list) on every call.
+		 */
+		value_copy = pstrdup(value);
+		(void) SplitIdentifierString(value_copy, ',', &value_list);
+		foreach(l, value_list)
+		{
+			char	   *item = (char *) lfirst(l);
+
+			appendStringInfoString(&options, item);
+			if (lnext(l))
+				appendStringInfoChar(&options, ',');
+		}
+		list_free(value_list);
+		pfree(value_copy);
+	}
+
+	return options.data;
+}
+
+
+/*
+ * Associate session with specified database and respective connection pool
+ * Invoked from Session process
+ *
+ * Sends the 'c' (connect) message carrying our PID, database, user name
+ * and pgoptions; each string is preceded by its length and followed by
+ * an explicit \0 terminator.
+ */
+static void
+PoolManagerConnect(const char *database, const char *user_name,
+		const char *pgoptions)
+{
+	int			n32;
+	char		msgtype = 'c';
+	int			unamelen = strlen(user_name);
+	int			dbnamelen = strlen(database);
+	int			pgoptionslen = strlen(pgoptions);
+	char		atchar = ' ';
+
+	/* Connect to the pooler process if not yet connected */
+	GetPoolManagerHandle();
+	if (poolHandle == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("failed to connect to the pooler process")));
+
+	/* fixed: the message previously lacked the closing parenthesis */
+	elog(DEBUG1, "Connecting to PoolManager (user_name %s, database %s, "
+			"pgoptions %s)", user_name, database, pgoptions);
+
+	/*
+	 * Special handling for db_user_namespace=on
+	 * We need to handle per-db users and global users. The per-db users will
+	 * arrive with @dbname and global users just as username. Handle both of
+	 * them appropriately
+	 */
+	if (strcmp(GetConfigOption("db_user_namespace", false, false), "on") == 0)
+	{
+		if (strchr(user_name, '@') != NULL)
+		{
+			/* per-db user: strip the "@dbname" suffix from the length */
+			Assert(unamelen > dbnamelen + 1);
+			unamelen -= (dbnamelen + 1);
+		}
+		else
+		{
+			/* global user: a trailing '@' will be appended below */
+			atchar = '@';
+			unamelen++;
+		}
+	}
+
+	/* Message type */
+	pool_putbytes(&poolHandle->port, &msgtype, 1);
+
+	/* Message length: self (4) + PID (4) + 3 length words + 3 strings + \0s */
+	n32 = htonl(dbnamelen + unamelen + pgoptionslen + 23);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* PID number */
+	n32 = htonl(MyProcPid);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Length of Database string */
+	n32 = htonl(dbnamelen + 1);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send database name followed by \0 terminator */
+	pool_putbytes(&poolHandle->port, database, dbnamelen);
+	pool_putbytes(&poolHandle->port, "\0", 1);
+
+	/* Length of user name string */
+	n32 = htonl(unamelen + 1);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send user name followed by \0 terminator */
+	/* Send the '@' char if needed. Already accounted for in len */
+	if (atchar == '@')
+	{
+		pool_putbytes(&poolHandle->port, user_name, unamelen - 1);
+		pool_putbytes(&poolHandle->port, "@", 1);
+	}
+	else
+		pool_putbytes(&poolHandle->port, user_name, unamelen);
+	pool_putbytes(&poolHandle->port, "\0", 1);
+
+	/* Length of pgoptions string */
+	n32 = htonl(pgoptionslen + 1);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send pgoptions followed by \0 terminator */
+	pool_putbytes(&poolHandle->port, pgoptions, pgoptionslen);
+	pool_putbytes(&poolHandle->port, "\0", 1);
+	pool_flush(&poolHandle->port);
+}
+
+/*
+ * Reconnect to pool manager
+ *
+ * Drops any existing pooler connection, then connects again using the
+ * current database, user and default session options.
+ */
+void
+PoolManagerReconnect(void)
+{
+	elog(DEBUG1, "Reconnecting to PoolManager");
+
+	if (poolHandle != NULL)
+		PoolManagerDisconnect();
+
+	PoolManagerConnect(get_database_name(MyDatabaseId),
+					   GetClusterUserName(),
+					   session_options());
+}
+
+/*
+ * Lock/unlock pool manager
+ * During locking, the only operations not permitted are abort, connection and
+ * connection obtention.
+ *
+ * Sends an 'o' message whose payload is the boolean lock flag.
+ */
+void
+PoolManagerLock(bool is_lock)
+{
+	char msgtype = 'o';
+	int n32;
+	int msglen = 8;	/* length word (4) + lock flag (4) */
+	/* Connect on demand; note that no session options are passed here */
+	if (poolHandle == NULL)
+		PoolManagerConnect(get_database_name(MyDatabaseId),
+				GetClusterUserName(), "");
+
+	elog(DEBUG1, "Locking PoolManager");
+
+	/* Message type */
+	pool_putbytes(&poolHandle->port, &msgtype, 1);
+
+	/* Message length (includes the length word itself) */
+	n32 = htonl(msglen);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Lock information */
+	n32 = htonl((int) is_lock);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+	pool_flush(&poolHandle->port);
+}
+
+/*
+ * Init PoolAgent
+ *
+ * Binds the agent to the database pool matching (database, user_name,
+ * pgoptions), creating that pool if it does not exist yet, and allocates
+ * the per-node connection slot arrays sized from the node catalog.
+ */
+static void
+agent_init(PoolAgent *agent, const char *database, const char *user_name,
+           const char *pgoptions)
+{
+	MemoryContext oldcontext;
+
+	Assert(agent);
+	Assert(database);
+	Assert(user_name);
+
+	/* disconnect if we are still connected */
+	if (agent->pool)
+		agent_release_connections(agent, false);
+
+	/* All per-agent allocations live in the agent's private context */
+	oldcontext = MemoryContextSwitchTo(agent->mcxt);
+
+	/* Get needed info and allocate memory */
+	PgxcNodeGetOids(&agent->coord_conn_oids, &agent->dn_conn_oids,
+			&agent->num_coord_connections, &agent->num_dn_connections, false);
+
+	agent->coord_connections = (PGXCNodePoolSlot **)
+			palloc0(agent->num_coord_connections * sizeof(PGXCNodePoolSlot *));
+	agent->dn_connections = (PGXCNodePoolSlot **)
+			palloc0(agent->num_dn_connections * sizeof(PGXCNodePoolSlot *));
+	/* find database */
+	agent->pool = find_database_pool(database, user_name, pgoptions);
+
+	/* create if not found */
+	if (agent->pool == NULL)
+		agent->pool = create_database_pool(database, user_name, pgoptions);
+
+	MemoryContextSwitchTo(oldcontext);
+
+	return;
+}
+
+/*
+ * Destroy PoolAgent
+ *
+ * Closes the agent's socket, force-releases any connections it still
+ * holds, frees all agent memory and compacts the poolAgents[] array.
+ * The agent pointer must not be used after this returns.
+ */
+static void
+agent_destroy(PoolAgent *agent)
+{
+	int			i;
+
+	Assert(agent);
+
+	close(Socket(agent->port));
+
+	/* Discard connections if any remaining */
+	if (agent->pool)
+	{
+		/*
+		 * If session is disconnecting while there are active connections
+		 * we can not know if they clean or not, so force destroy them
+		 */
+		agent_release_connections(agent, true);
+	}
+
+	/* find agent in the list */
+	for (i = 0; i < agentCount; i++)
+	{
+		if (poolAgents[i] == agent)
+		{
+			/* Free memory. All connection slots are NULL at this point */
+			MemoryContextDelete(agent->mcxt);
+
+			pfree(agent);
+			/* shrink the list and move last agent into the freed slot */
+			if (i < --agentCount)
+				poolAgents[i] = poolAgents[agentCount];
+			/* only one match is expected so exit */
+			break;
+		}
+	}
+}
+
+/*
+ * Ping an UNHEALTHY node and if it succeeds, update SHARED node
+ * information
+ *
+ * No-op when the node definition cannot be found or the node is already
+ * marked healthy.
+ */
+static void
+TryPingUnhealthyNode(Oid nodeoid)
+{
+	int			status;
+	NodeDefinition *nodeDef;
+	char		connstr[MAXPGPATH * 2 + 256];
+
+	nodeDef = PgxcNodeGetDefinition(nodeoid);
+	if (nodeDef == NULL)
+	{
+		/* No such definition, node dropped? */
+		elog(DEBUG1, "Could not find node (%u) definition,"
+			 " skipping health check", nodeoid);
+		return;
+	}
+	if (nodeDef->nodeishealthy)
+	{
+		/* hmm, can this happen? */
+		elog(DEBUG1, "node (%u) healthy!"
+			 " skipping health check", nodeoid);
+		/* nodeDef is palloc'ed; the original code leaked it here */
+		pfree(nodeDef);
+		return;
+	}
+
+	elog(LOG, "node (%s:%u) down! Trying ping",
+		 NameStr(nodeDef->nodename), nodeoid);
+	/* snprintf guards against overflow should host strings ever grow */
+	snprintf(connstr, sizeof(connstr),
+			 "host=%s port=%d", NameStr(nodeDef->nodehost),
+			 nodeDef->nodeport);
+	status = PGXCNodePing(connstr);
+	if (status != 0)
+	{
+		/* still down: leave the UNHEALTHY flag as is */
+		pfree(nodeDef);
+		return;
+	}
+
+	elog(DEBUG1, "Node (%s) back online!", NameStr(nodeDef->nodename));
+	if (!PgxcNodeUpdateHealth(nodeoid, true))
+		elog(WARNING, "Could not update health status of node (%s)",
+			 NameStr(nodeDef->nodename));
+	else
+		elog(LOG, "Health map updated to reflect HEALTHY node (%s)",
+			 NameStr(nodeDef->nodename));
+	pfree(nodeDef);
+}
+
+/*
+ * Check if a node is indeed down and if it is update its UNHEALTHY
+ * status
+ *
+ * Pings the node and updates the shared health flag only when the
+ * observed state differs from the recorded one.
+ */
+void
+PoolPingNodeRecheck(Oid nodeoid)
+{
+	int			status;
+	NodeDefinition *nodeDef;
+	char		connstr[MAXPGPATH * 2 + 256];
+	bool		healthy;
+
+	nodeDef = PgxcNodeGetDefinition(nodeoid);
+	if (nodeDef == NULL)
+	{
+		/* No such definition, node dropped? */
+		elog(DEBUG1, "Could not find node (%u) definition,"
+			 " skipping health check", nodeoid);
+		return;
+	}
+
+	/* snprintf guards against overflow should host strings ever grow */
+	snprintf(connstr, sizeof(connstr),
+			 "host=%s port=%d", NameStr(nodeDef->nodehost),
+			 nodeDef->nodeport);
+	status = PGXCNodePing(connstr);
+	healthy = (status == 0);
+
+	/* if no change in health bit, return */
+	if (healthy == nodeDef->nodeishealthy)
+	{
+		pfree(nodeDef);
+		return;
+	}
+
+	if (!PgxcNodeUpdateHealth(nodeoid, healthy))
+		elog(WARNING, "Could not update health status of node (%s)",
+			 NameStr(nodeDef->nodename));
+	else
+		elog(LOG, "Health map updated to reflect (%s) node (%s)",
+			 healthy ? "HEALTHY" : "UNHEALTHY", NameStr(nodeDef->nodename));
+	pfree(nodeDef);
+}
+
+/*
+ * Ping UNHEALTHY nodes as part of the maintenance window
+ *
+ * Reads the shared health map and re-pings every node currently flagged
+ * unhealthy, Datanodes first, then Coordinators.
+ */
+void
+PoolPingNodes()
+{
+	Oid			coOids[MaxCoords];
+	Oid			dnOids[MaxDataNodes];
+	bool		coHealthMap[MaxCoords];
+	bool		dnHealthMap[MaxDataNodes];
+	int			numCo;
+	int			numDn;
+	int			idx;
+
+	PgxcNodeGetHealthMap(coOids, dnOids, &numCo, &numDn,
+						 coHealthMap, dnHealthMap);
+
+	/* Re-ping every Datanode currently marked unhealthy */
+	for (idx = 0; idx < numDn; idx++)
+	{
+		if (!dnHealthMap[idx])
+			TryPingUnhealthyNode(dnOids[idx]);
+	}
+
+	/* Likewise for Coordinators */
+	for (idx = 0; idx < numCo; idx++)
+	{
+		if (!coHealthMap[idx])
+			TryPingUnhealthyNode(coOids[idx]);
+	}
+}
+
+/*
+ * Release handle to pool manager
+ *
+ * Sends a 'd' (disconnect) message, closes the socket and frees the
+ * malloc'ed handle.  No-op when not connected.
+ */
+void
+PoolManagerDisconnect(void)
+{
+	if (poolHandle == NULL)
+		return;					/* not even connected */
+
+	/* Tell the pooler we are going away */
+	pool_putmessage(&poolHandle->port, 'd', NULL, 0);
+	pool_flush(&poolHandle->port);
+
+	close(Socket(poolHandle->port));
+
+	/* The handle was malloc'ed, so free() rather than pfree() */
+	free(poolHandle);
+	poolHandle = NULL;
+}
+
+
+/*
+ * Get pooled connections
+ *
+ * Sends the requested Datanode and Coordinator index lists to the pooler
+ * and receives the matching socket FDs (returned as a palloc'ed array,
+ * Datanodes first) plus the remote backend PIDs in *pids.  Returns NULL
+ * on failure.
+ */
+int *
+PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
+{
+	int			i;
+	ListCell   *nodelist_item;
+	int		   *fds;
+	int			totlen = list_length(datanodelist) + list_length(coordlist);
+	int			nodes[totlen + 2];
+
+	/* Connect on demand */
+	if (poolHandle == NULL)
+		PoolManagerConnect(get_database_name(MyDatabaseId),
+						   GetClusterUserName(), session_options());
+
+	/*
+	 * Prepare end send message to pool manager.
+	 * First with Datanode list.
+	 * This list can be NULL for a query that does not need
+	 * Datanode Connections (Sequence DDLs)
+	 */
+	nodes[0] = htonl(list_length(datanodelist));
+	i = 1;
+	if (list_length(datanodelist) != 0)
+	{
+		foreach(nodelist_item, datanodelist)
+		{
+			nodes[i++] = htonl(lfirst_int(nodelist_item));
+		}
+	}
+	/* Then with Coordinator list (can be nul) */
+	nodes[i++] = htonl(list_length(coordlist));
+	if (list_length(coordlist) != 0)
+	{
+		foreach(nodelist_item, coordlist)
+		{
+			nodes[i++] = htonl(lfirst_int(nodelist_item));
+		}
+	}
+
+	pool_putmessage(&poolHandle->port, 'g', (char *) nodes, sizeof(int) * (totlen + 2));
+	pool_flush(&poolHandle->port);
+
+	/* Receive response (palloc ereports on OOM, no NULL check needed) */
+	fds = (int *) palloc(sizeof(int) * totlen);
+	if (pool_recvfds(&poolHandle->port, fds, totlen))
+	{
+		pfree(fds);
+		fds = NULL;
+	}
+
+	if (pool_recvpids(&poolHandle->port, pids) != totlen)
+	{
+		/* also release fds, which the original code leaked here */
+		if (fds)
+			pfree(fds);
+		if (*pids)
+			pfree(*pids);
+		*pids = NULL;
+		return NULL;
+	}
+
+	return fds;
+}
+
+/*
+ * Abort active transactions using pooler.
+ * Take a lock forbidding access to Pooler for new transactions.
+ *
+ * Returns the number of remote backend PIDs placed in *proc_pids.
+ */
+int
+PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids)
+{
+	int			num_proc_ids = 0;
+	int			n32, msglen;
+	char		msgtype = 'a';
+	/* lengths include the \0 terminator; 0 means "not specified" */
+	int			dblen = dbname ? strlen(dbname) + 1 : 0;
+	int			userlen = username ? strlen(username) + 1 : 0;
+
+	/*
+	 * New connection may be established to clean connections to
+	 * specified nodes and databases.
+	 */
+	if (poolHandle == NULL)
+		PoolManagerConnect(get_database_name(MyDatabaseId),
+				GetClusterUserName(), session_options());
+
+	/* Message type */
+	pool_putbytes(&poolHandle->port, &msgtype, 1);
+
+	/* Message length: self (4) + two length words (8) + both strings */
+	msglen = dblen + userlen + 12;
+	n32 = htonl(msglen);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Length of Database string */
+	n32 = htonl(dblen);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send database name, followed by \0 terminator if necessary */
+	if (dbname)
+		pool_putbytes(&poolHandle->port, dbname, dblen);
+
+	/* Length of Username string */
+	n32 = htonl(userlen);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send user name, followed by \0 terminator if necessary */
+	if (username)
+		pool_putbytes(&poolHandle->port, username, userlen);
+
+	pool_flush(&poolHandle->port);
+
+	/* Then Get back Pids from Pooler */
+	num_proc_ids = pool_recvpids(&poolHandle->port, proc_pids);
+
+	return num_proc_ids;
+}
+
+
+/*
+ * Clean up Pooled connections
+ *
+ * Sends an 'f' message listing node indexes plus optional database and
+ * user name filters, then raises an error unless the pooler confirms
+ * the cleanup completed.
+ */
+void
+PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, char *username)
+{
+	int			totlen = list_length(datanodelist) + list_length(coordlist);
+	int			nodes[totlen + 2];
+	ListCell   *nodelist_item;
+	int			i, n32, msglen;
+	char		msgtype = 'f';
+	/* lengths include the \0 terminator; 0 means "not specified" */
+	int			userlen = username ? strlen(username) + 1 : 0;
+	int			dblen = dbname ? strlen(dbname) + 1 : 0;
+
+	/*
+	 * New connection may be established to clean connections to
+	 * specified nodes and databases.
+	 */
+	if (poolHandle == NULL)
+		PoolManagerConnect(get_database_name(MyDatabaseId),
+				GetClusterUserName(), session_options());
+
+	/* Datanode list first (may be empty); values in network byte order */
+	nodes[0] = htonl(list_length(datanodelist));
+	i = 1;
+	if (list_length(datanodelist) != 0)
+	{
+		foreach(nodelist_item, datanodelist)
+		{
+			nodes[i++] = htonl(lfirst_int(nodelist_item));
+		}
+	}
+	/* Then with Coordinator list (can be nul) */
+	nodes[i++] = htonl(list_length(coordlist));
+	if (list_length(coordlist) != 0)
+	{
+		foreach(nodelist_item, coordlist)
+		{
+			nodes[i++] = htonl(lfirst_int(nodelist_item));
+		}
+	}
+
+	/* Message type */
+	pool_putbytes(&poolHandle->port, &msgtype, 1);
+
+	/* Message length (includes the length word itself) */
+	msglen = sizeof(int) * (totlen + 2) + dblen + userlen + 12;
+	n32 = htonl(msglen);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send list of nodes */
+	pool_putbytes(&poolHandle->port, (char *) nodes, sizeof(int) * (totlen + 2));
+
+	/* Length of Database string */
+	n32 = htonl(dblen);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send database name, followed by \0 terminator if necessary */
+	if (dbname)
+		pool_putbytes(&poolHandle->port, dbname, dblen);
+
+	/* Length of Username string */
+	n32 = htonl(userlen);
+	pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+	/* Send user name, followed by \0 terminator if necessary */
+	if (username)
+		pool_putbytes(&poolHandle->port, username, userlen);
+
+	pool_flush(&poolHandle->port);
+
+	/* Receive result message */
+	if (pool_recvres(&poolHandle->port) != CLEAN_CONNECTION_COMPLETED)
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("Clean connections not completed")));
+}
+
+
+/*
+ * Check connection information consistency cached in pooler with catalog information
+ *
+ * Returns true when the pooler reports its cached node info matches the
+ * catalog, false otherwise.
+ */
+bool
+PoolManagerCheckConnectionInfo(void)
+{
+	int			res;
+
+	/* Establish the pooler connection on demand */
+	if (poolHandle == NULL)
+		PoolManagerConnect(get_database_name(MyDatabaseId),
+						   GetClusterUserName(), session_options());
+
+	PgxcNodeListAndCount();
+
+	/* 'q' asks the pooler to verify its cached node information */
+	pool_putmessage(&poolHandle->port, 'q', NULL, 0);
+	pool_flush(&poolHandle->port);
+
+	res = pool_recvres(&poolHandle->port);
+	return (res == POOL_CHECK_SUCCESS);
+}
+
+
+/*
+ * Reload connection data in pooler and drop all the existing connections of pooler
+ *
+ * NOTE(review): unlike most senders in this file, this one Asserts that
+ * poolHandle is set rather than connecting on demand; callers must
+ * already be connected.
+ */
+void
+PoolManagerReloadConnectionInfo(void)
+{
+	Assert(poolHandle);
+	PgxcNodeListAndCount();
+	/* 'p' asks the pooler to rebuild all pools from the catalog */
+	pool_putmessage(&poolHandle->port, 'p', NULL, 0);
+	pool_flush(&poolHandle->port);
+}
+
+/*
+ * Refresh connection data in pooler and drop connections for those nodes
+ * that have changed. Thus, this operation is less destructive as compared
+ * to PoolManagerReloadConnectionInfo and should typically be called when
+ * NODE ALTER has been performed
+ *
+ * Returns nonzero (true) on success, 0 on failure.
+ */
+int
+PoolManagerRefreshConnectionInfo(void)
+{
+	int			res;
+
+	Assert(poolHandle);
+
+	PgxcNodeListAndCount();
+
+	/* 'R' asks the pooler to refresh, dropping only altered node pools */
+	pool_putmessage(&poolHandle->port, 'R', NULL, 0);
+	pool_flush(&poolHandle->port);
+
+	res = pool_recvres(&poolHandle->port);
+	return (res == POOL_CHECK_SUCCESS) ? true : false;
+}
+
+/*
+ * Handle an 'a' (abort transactions) message.
+ *
+ * Optional database and user name narrow the set of sessions whose
+ * transactions are aborted; the PIDs of the aborted backends are sent
+ * back to the requestor.
+ */
+static void
+handle_abort(PoolAgent * agent, StringInfo s)
+{
+	int		len;
+	int	       *pids;
+	const char *database = NULL;
+	const char *user_name = NULL;
+
+	pool_getmessage(&agent->port, s, 0);
+	/* zero length means "no filter" for both fields */
+	len = pq_getmsgint(s, 4);
+	if (len > 0)
+		database = pq_getmsgbytes(s, len);
+
+	len = pq_getmsgint(s, 4);
+	if (len > 0)
+		user_name = pq_getmsgbytes(s, len);
+
+	pq_getmsgend(s);
+
+	pids = abort_pids(&len, agent->pid, database, user_name);
+
+	/* len was overwritten by abort_pids with the number of PIDs */
+	pool_sendpids(&agent->port, pids, len);
+	if (pids)
+		pfree(pids);
+}
+
+/*
+ * Handle a 'c' (connect) message: bind this agent to the given database,
+ * user and pgoptions by initializing its database pool.
+ */
+static void
+handle_connect(PoolAgent * agent, StringInfo s)
+{
+	int	len;
+	const char *database = NULL;
+	const char *user_name = NULL;
+	const char *pgoptions = NULL;
+
+	pool_getmessage(&agent->port, s, 0);
+	agent->pid = pq_getmsgint(s, 4);
+
+	/* each length includes the \0 terminator appended by the sender */
+	len = pq_getmsgint(s, 4);
+	database = pq_getmsgbytes(s, len);
+
+	len = pq_getmsgint(s, 4);
+	user_name = pq_getmsgbytes(s, len);
+
+	len = pq_getmsgint(s, 4);
+	pgoptions = pq_getmsgbytes(s, len);
+
+	/*
+	 * Coordinator pool is not initialized.
+	 * With that it would be impossible to create a Database by default.
+	 */
+	agent_init(agent, database, user_name, pgoptions);
+	pq_getmsgend(s);
+}
+
+/*
+ * Handle an 'f' (clean connection) message: drop pooled connections for
+ * the listed nodes / database / user and report the result.
+ */
+static void
+handle_clean_connection(PoolAgent * agent, StringInfo s)
+{
+	int		i, len, res;
+	int		datanodecount, coordcount;
+	const char *database = NULL;
+	const char *user_name = NULL;
+	List	       *nodelist = NIL;
+
+	pool_getmessage(&agent->port, s, 0);
+
+	/* It is possible to clean up only datanode connections */
+	datanodecount = pq_getmsgint(s, 4);
+	for (i = 0; i < datanodecount; i++)
+	{
+		/* Translate index to Oid */
+		/* NOTE(review): index is not range-checked against num_dn_connections */
+		int index = pq_getmsgint(s, 4);
+		Oid node = agent->dn_conn_oids[index];
+		nodelist = lappend_oid(nodelist, node);
+	}
+
+	/* It is possible to clean up only coordinator connections */
+	coordcount = pq_getmsgint(s, 4);
+	for (i = 0; i < coordcount; i++)
+	{
+		/* Translate index to Oid */
+		/* NOTE(review): index is not range-checked against num_coord_connections */
+		int index = pq_getmsgint(s, 4);
+		Oid node = agent->coord_conn_oids[index];
+		nodelist = lappend_oid(nodelist, node);
+	}
+
+	/* Optional database and user name filters (0 length = absent) */
+	len = pq_getmsgint(s, 4);
+	if (len > 0)
+		database = pq_getmsgbytes(s, len);
+
+	len = pq_getmsgint(s, 4);
+	if (len > 0)
+		user_name = pq_getmsgbytes(s, len);
+
+	pq_getmsgend(s);
+
+	/* Clean up connections here */
+	res = clean_connection(nodelist, database, user_name);
+
+	list_free(nodelist);
+
+	/* Send success result */
+	pool_sendres(&agent->port, res);
+}
+
+/*
+ * Handle a 'g' (get connections) message: acquire the requested Datanode
+ * and Coordinator connections and send their FDs and backend PIDs back.
+ */
+static void
+handle_get_connections(PoolAgent * agent, StringInfo s)
+{
+	int		i;
+	int	       *fds, *pids = NULL;
+	int		datanodecount, coordcount;
+	List	       *datanodelist = NIL;
+	List	       *coordlist = NIL;
+
+	/*
+	 * Length of message is caused by:
+	 * - Message header = 4bytes
+	 * - List of Datanodes = NumPoolDataNodes * 4bytes (max)
+	 * - List of Coordinators = NumPoolCoords * 4bytes (max)
+	 * - Number of Datanodes sent = 4bytes
+	 * - Number of Coordinators sent = 4bytes
+	 * It is better to send in a same message the list of Co and Dn at the same
+	 * time, this permits to reduce interactions between postmaster and pooler
+	 */
+	pool_getmessage(&agent->port, s, 4 * agent->num_dn_connections + 4 * agent->num_coord_connections + 12);
+
+	datanodecount = pq_getmsgint(s, 4);
+	for (i = 0; i < datanodecount; i++)
+		datanodelist = lappend_int(datanodelist, pq_getmsgint(s, 4));
+
+	/* It is possible that no Coordinators are involved in the transaction */
+	coordcount = pq_getmsgint(s, 4);
+	for (i = 0; i < coordcount; i++)
+		coordlist = lappend_int(coordlist, pq_getmsgint(s, 4));
+
+	pq_getmsgend(s);
+
+	Assert(datanodecount >= 0 && coordcount >= 0);
+
+	/*
+	 * In case of error agent_acquire_connections will log the error and
+	 * return NULL.
+	 */
+	fds = agent_acquire_connections(agent, datanodelist, coordlist, &pids);
+
+	list_free(datanodelist);
+	list_free(coordlist);
+
+	/* A NULL fds array is reported to the backend as zero FDs */
+	pool_sendfds(&agent->port, fds, fds ? datanodecount + coordcount : 0);
+	if (fds)
+		pfree(fds);
+
+	/*
+	 * Also send the PIDs of the remote backend processes serving
+	 * these connections
+	 */
+	pool_sendpids(&agent->port, pids, pids ? datanodecount + coordcount : 0);
+	if (pids)
+		pfree(pids);
+}
+
+/*
+ * Handle an 'h' (query cancel) message: forward a cancel request to the
+ * listed Datanode and Coordinator connections and confirm completion.
+ */
+static void
+handle_query_cancel(PoolAgent * agent, StringInfo s)
+{
+	int		i;
+	int		datanodecount, coordcount;
+	List	       *datanodelist = NIL;
+	List	       *coordlist = NIL;
+
+	/*
+	 * Length of message is caused by:
+	 * - Message header = 4bytes
+	 * - List of Datanodes = NumPoolDataNodes * 4bytes (max)
+	 * - List of Coordinators = NumPoolCoords * 4bytes (max)
+	 * - Number of Datanodes sent = 4bytes
+	 * - Number of Coordinators sent = 4bytes
+	 */
+	pool_getmessage(&agent->port, s, 4 * agent->num_dn_connections + 4 * agent->num_coord_connections + 12);
+
+	datanodecount = pq_getmsgint(s, 4);
+	for (i = 0; i < datanodecount; i++)
+		datanodelist = lappend_int(datanodelist, pq_getmsgint(s, 4));
+
+	coordcount = pq_getmsgint(s, 4);
+	/* It is possible that no Coordinators are involved in the transaction */
+	for (i = 0; i < coordcount; i++)
+		coordlist = lappend_int(coordlist, pq_getmsgint(s, 4));
+
+	pq_getmsgend(s);
+
+	/* Return value (number of cancels sent) is deliberately ignored */
+	cancel_query_on_connections(agent, datanodelist, coordlist);
+	list_free(datanodelist);
+	list_free(coordlist);
+
+	/* Send success result */
+	pool_sendres(&agent->port, QUERY_CANCEL_COMPLETED);
+}
+
+/*
+ * Handle messages to agent
+ *
+ * Dispatches on the one-byte message type, looping while more data is
+ * already buffered.  Whenever the agent is destroyed ('d', EOF, or a
+ * protocol violation) we must return immediately and never touch the
+ * agent again.
+ */
+static void
+agent_handle_input(PoolAgent * agent, StringInfo s)
+{
+	/* read byte from the buffer (and recv if empty) */
+	int			qtype = pool_getbyte(&agent->port);
+
+	/*
+	 * We can have multiple messages, so handle them all
+	 */
+	for (;;)
+	{
+		/*
+		 * During a pool cleaning, Abort, Connect and Get Connections messages
+		 * are not allowed on pooler side.
+		 * It avoids to have new backends taking connections
+		 * while remaining transactions are aborted during FORCE and then
+		 * Pools are being shrinked.
+		 *
+		 * NOTE(review): this only warns -- the message is still processed
+		 * below; confirm whether it should be rejected instead.
+		 */
+		if (is_pool_locked && (qtype == 'a' || qtype == 'c' || qtype == 'g'))
+			elog(WARNING,"Pool operation cannot run during pool lock");
+
+		elog(DEBUG1, "Pooler is handling command %c from %d", (char) qtype, agent->pid);
+
+		switch (qtype)
+		{
+			case 'a':			/* ABORT */
+				handle_abort(agent, s);
+				break;
+			case 'c':			/* CONNECT */
+				handle_connect(agent, s);
+				break;
+			case 'd':			/* DISCONNECT */
+				pool_getmessage(&agent->port, s, 4);
+				pq_getmsgend(s);
+
+				/*
+				 * agent_destroy() frees the agent, so return right away:
+				 * the previous code broke out of the switch and then
+				 * polled agent->port, reading freed memory.
+				 */
+				agent_destroy(agent);
+				return;
+			case 'f':			/* CLEAN CONNECTION */
+				handle_clean_connection(agent, s);
+				break;
+			case 'g':			/* GET CONNECTIONS */
+				handle_get_connections(agent, s);
+				break;
+
+			case 'h':			/* Cancel SQL Command in progress on specified connections */
+				handle_query_cancel(agent, s);
+				break;
+			case 'o':			/* Lock/unlock pooler */
+				pool_getmessage(&agent->port, s, 8);
+				is_pool_locked = pq_getmsgint(s, 4);
+				pq_getmsgend(s);
+				break;
+			case 'p':			/* Reload connection info */
+				pool_getmessage(&agent->port, s, 4);
+				pq_getmsgend(s);
+
+				/* First update all the pools */
+				reload_database_pools(agent);
+				break;
+			case 'R':			/* Refresh connection info */
+				pool_getmessage(&agent->port, s, 4);
+				pq_getmsgend(s);
+
+				/* Refresh pools and report the result to the requestor */
+				pool_sendres(&agent->port, refresh_database_pools(agent));
+				break;
+			case 'P':			/* Ping connection info */
+				/*
+				 * Ping unhealthy nodes in the pools. If any of the
+				 * nodes come up, update SHARED memory to
+				 * indicate the same.
+				 */
+				pool_getmessage(&agent->port, s, 4);
+				pq_getmsgend(s);
+
+				/* Ping all the pools */
+				PoolPingNodes();
+
+				break;
+			case 'q':			/* Check connection info consistency */
+				pool_getmessage(&agent->port, s, 4);
+				pq_getmsgend(s);
+
+				/* Check cached info consistency */
+				pool_sendres(&agent->port, node_info_check(agent));
+				break;
+			case 'r':			/* RELEASE CONNECTIONS */
+				{
+					bool		destroy;
+
+					pool_getmessage(&agent->port, s, 8);
+					destroy = (bool) pq_getmsgint(s, 4);
+					pq_getmsgend(s);
+					agent_release_connections(agent, destroy);
+				}
+				break;
+			case EOF:			/* EOF */
+				agent_destroy(agent);
+				return;
+			default:			/* protocol violation */
+				agent_destroy(agent);
+				ereport(WARNING,
+						(errmsg("agent protocol violation, received byte %c", qtype)));
+				return;
+		}
+
+		/*
+		 * check if there are more data in the buffer (but don't recv
+		 * additional data), to avoid reading from a closed connection
+		 */
+		if ((qtype = pool_pollbyte(&agent->port)) == EOF)
+			break;
+	}
+}
+
+/*
+ * acquire connection
+ *
+ * Returns a palloc'ed array with the socket FDs of the requested Datanode
+ * and Coordinator connections (Datanode fds first), or NULL on failure.
+ * On success *pids receives a parallel palloc'ed array with the PIDs of
+ * the remote backends serving those connections.
+ */
+static int *
+agent_acquire_connections(PoolAgent *agent, List *datanodelist,
+		List *coordlist, int **pids)
+{
+	int			i;
+	int		   *result;
+	ListCell   *nodelist_item;
+	MemoryContext oldcontext;
+
+	Assert(agent);
+
+	/* Check if pooler can accept those requests */
+	if (list_length(datanodelist) > agent->num_dn_connections ||
+			list_length(coordlist) > agent->num_coord_connections)
+	{
+		elog(LOG, "agent_acquire_connections called with invalid arguments -"
+				"list_length(datanodelist) %d, num_dn_connections %d,"
+				"list_length(coordlist) %d, num_coord_connections %d",
+				list_length(datanodelist), agent->num_dn_connections,
+				list_length(coordlist), agent->num_coord_connections);
+		return NULL;
+	}
+
+	/*
+	 * File descriptors of Datanodes and Coordinators are saved in the same
+	 * array, which is sent back to the postmaster: Datanode fds first, then
+	 * Coordinator fds.  palloc ereports on OOM, so no NULL checks needed.
+	 */
+	result = (int *) palloc((list_length(datanodelist) + list_length(coordlist)) * sizeof(int));
+	*pids = (int *) palloc((list_length(datanodelist) + list_length(coordlist)) * sizeof(int));
+
+	/*
+	 * There are possible memory allocations in the core pooler, we want
+	 * these allocations in the contect of the database pool
+	 */
+	oldcontext = MemoryContextSwitchTo(agent->pool->mcxt);
+
+
+	/* Initialize result */
+	i = 0;
+	/* Save in array fds of Datanodes first */
+	foreach(nodelist_item, datanodelist)
+	{
+		int			node = lfirst_int(nodelist_item);
+
+		/* Acquire from the pool if none */
+		if (agent->dn_connections[node] == NULL)
+		{
+			PGXCNodePoolSlot *slot = acquire_connection(agent->pool,
+														agent->dn_conn_oids[node]);
+
+			/* Handle failure: free both output arrays (pids used to leak) */
+			if (slot == NULL)
+			{
+				pfree(result);
+				pfree(*pids);
+				*pids = NULL;
+				MemoryContextSwitchTo(oldcontext);
+				elog(LOG, "Pooler could not open a connection to node %u",
+						agent->dn_conn_oids[node]);
+				return NULL;
+			}
+
+			/* Store in the descriptor */
+			agent->dn_connections[node] = slot;
+
+			/*
+			 * Update newly-acquired slot with session parameters.
+			 * Local parameters are fired only once BEGIN has been launched on
+			 * remote nodes.
+			 */
+		}
+
+		result[i] = PQsocket((PGconn *) agent->dn_connections[node]->conn);
+		(*pids)[i++] = ((PGconn *) agent->dn_connections[node]->conn)->be_pid;
+	}
+
+	/* Save then in the array fds for Coordinators */
+	foreach(nodelist_item, coordlist)
+	{
+		int			node = lfirst_int(nodelist_item);
+
+		/* Acquire from the pool if none */
+		if (agent->coord_connections[node] == NULL)
+		{
+			PGXCNodePoolSlot *slot = acquire_connection(agent->pool, agent->coord_conn_oids[node]);
+
+			/* Handle failure: free both output arrays (pids used to leak) */
+			if (slot == NULL)
+			{
+				pfree(result);
+				pfree(*pids);
+				*pids = NULL;
+				MemoryContextSwitchTo(oldcontext);
+				elog(LOG, "Pooler could not open a connection to node %u",
+						agent->coord_conn_oids[node]);
+				return NULL;
+			}
+
+			/* Store in the descriptor */
+			agent->coord_connections[node] = slot;
+
+			/*
+			 * Update newly-acquired slot with session parameters.
+			 * Local parameters are fired only once BEGIN has been launched on
+			 * remote nodes.
+			 */
+		}
+
+		result[i] = PQsocket((PGconn *) agent->coord_connections[node]->conn);
+		(*pids)[i++] = ((PGconn *) agent->coord_connections[node]->conn)->be_pid;
+	}
+
+	MemoryContextSwitchTo(oldcontext);
+
+	return result;
+}
+
+/*
+ * Send query-cancel requests to the remote backends attached to this agent.
+ *
+ * Walks the given Datanode and Coordinator index lists and fires PQcancel
+ * on every active connection found.  Out-of-range indexes and empty slots
+ * are skipped silently.  Returns the number of backends successfully
+ * signalled.
+ */
+static int
+cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist)
+{
+ ListCell *lc;
+ char errbuf[256];
+ int cancelled = 0;
+ bool ok;
+
+ if (agent == NULL)
+ return 0;
+
+ /* Datanodes first */
+ foreach(lc, datanodelist)
+ {
+ int node = lfirst_int(lc);
+
+ if (node < 0 || node >= agent->num_dn_connections)
+ continue;
+
+ if (agent->dn_connections == NULL)
+ break;
+
+ if (agent->dn_connections[node] == NULL)
+ continue;
+
+ elog(DEBUG1, "Canceling query on connection to remote node %d, remote pid %d",
+ agent->dn_conn_oids[node],
+ ((PGconn *) agent->dn_connections[node]->conn)->be_pid);
+ ok = PQcancel((PGcancel *) agent->dn_connections[node]->xc_cancelConn, errbuf, sizeof(errbuf));
+ if (ok)
+ {
+ elog(DEBUG1, "Cancelled query on connection to remote node %d, remote pid %d",
+ agent->dn_conn_oids[node],
+ ((PGconn *) agent->dn_connections[node]->conn)->be_pid);
+ cancelled++;
+ }
+ }
+
+ /* Coordinators too, e.g. if DDL was in progress */
+ foreach(lc, coordlist)
+ {
+ int node = lfirst_int(lc);
+
+ if (node < 0 || node >= agent->num_coord_connections)
+ continue;
+
+ if (agent->coord_connections == NULL)
+ break;
+
+ if (agent->coord_connections[node] == NULL)
+ continue;
+
+ elog(DEBUG1, "Canceling query on connection to remote node %d, remote pid %d",
+ agent->coord_conn_oids[node],
+ ((PGconn *) agent->coord_connections[node]->conn)->be_pid);
+ ok = PQcancel((PGcancel *) agent->coord_connections[node]->xc_cancelConn, errbuf, sizeof(errbuf));
+ if (ok)
+ {
+ elog(DEBUG1, "Cancelled query on connection to remote node %d, remote pid %d",
+ agent->coord_conn_oids[node],
+ ((PGconn *) agent->coord_connections[node]->conn)->be_pid);
+ cancelled++;
+ }
+ }
+
+ return cancelled;
+}
+
+/*
+ * Return connections back to the pool.
+ *
+ * Sends a 'r' (release) message to the pooler process over the agent
+ * socket.  'force' is forwarded in the payload; the pooler side decides
+ * whether released connections are destroyed or kept for reuse.
+ * The bytes must go out in exactly this order: type, length, payload.
+ */
+void
+PoolManagerReleaseConnections(bool force)
+{
+ char msgtype = 'r';
+ int n32;
+ /* length word covers itself (4 bytes) plus the 4-byte force flag */
+ int msglen = 8;
+
+ /* If disconnected from pooler all the connections already released */
+ if (!poolHandle)
+ return;
+
+ elog(DEBUG1, "Returning connections back to the pool");
+
+ /* Message type */
+ pool_putbytes(&poolHandle->port, &msgtype, 1);
+
+ /* Message length (network byte order) */
+ n32 = htonl(msglen);
+ pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+ /* Lock information: the force-destroy flag */
+ n32 = htonl((int) force);
+ pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+ pool_flush(&poolHandle->port);
+}
+
+/*
+ * Cancel Query
+ *
+ * Ask the pooler ('h' message) to send query-cancel requests to the
+ * listed Datanode and Coordinator backends, then wait for the result.
+ * Payload layout: dn_count, dn_list[], co_count, co_list[], every item
+ * a network-order uint32.
+ */
+void
+PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list)
+{
+ /*
+ * Buffer holds both node lists plus the two list-length words.
+ */
+ uint32 buf[2 + dn_count + co_count];
+ int i;
+
+ if (poolHandle == NULL)
+ return;
+
+ /* Nothing to cancel */
+ if (dn_count == 0 && co_count == 0)
+ return;
+
+ /* A non-zero count must come with an actual list */
+ if (dn_count != 0 && dn_list == NULL)
+ return;
+
+ if (co_count != 0 && co_list == NULL)
+ return;
+
+ /* Datanode section: count followed by the node indexes */
+ buf[0] = htonl((uint32) dn_count);
+ for (i = 0; i < dn_count; i++)
+ buf[1 + i] = htonl((uint32) dn_list[i]);
+
+ /* Coordinator section: count followed by the node indexes */
+ buf[1 + dn_count] = htonl((uint32) co_count);
+ for (i = 0; i < co_count; i++)
+ buf[2 + dn_count + i] = htonl((uint32) co_list[i]);
+
+ pool_putmessage(&poolHandle->port, 'h', (char *) buf, (2 + dn_count + co_count) * sizeof(uint32));
+ pool_flush(&poolHandle->port);
+
+ /* Receive result message */
+ if (pool_recvres(&poolHandle->port) != QUERY_CANCEL_COMPLETED)
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Query cancel not completed")));
+}
+
+/*
+ * Return all Datanode and Coordinator connections held by the agent to
+ * the per-database pool, or destroy them when force_destroy is set.
+ * Does nothing while the cluster-wide lock is held (unless forced).
+ */
+static void
+agent_release_connections(PoolAgent *agent, bool force_destroy)
+{
+ MemoryContext oldcontext;
+ int n;
+
+ if (!agent->dn_connections && !agent->coord_connections)
+ return;
+ if (!force_destroy && cluster_ex_lock_held)
+ {
+ elog(LOG, "Not releasing connection with cluster lock");
+ return;
+ }
+
+ /*
+ * The core pooler may allocate while releasing; make sure any such
+ * allocation lands in the database pool's memory context.
+ */
+ oldcontext = MemoryContextSwitchTo(agent->pool->mcxt);
+
+ /*
+ * Remaining connections are assumed to be clean.
+ * Datanode connections go back first.
+ */
+ for (n = 0; n < agent->num_dn_connections; n++)
+ {
+ PGXCNodePoolSlot *slot = agent->dn_connections[n];
+
+ /* Return the slot; a forced release destroys it instead */
+ if (slot)
+ release_connection(agent->pool, slot, agent->dn_conn_oids[n], force_destroy);
+ agent->dn_connections[n] = NULL;
+ elog(DEBUG1, "Released connection to node %d", agent->dn_conn_oids[n]);
+ }
+ /* Then the Coordinator connections */
+ for (n = 0; n < agent->num_coord_connections; n++)
+ {
+ PGXCNodePoolSlot *slot = agent->coord_connections[n];
+
+ if (slot)
+ release_connection(agent->pool, slot, agent->coord_conn_oids[n], force_destroy);
+ agent->coord_connections[n] = NULL;
+ elog(DEBUG1, "Released connection to node %d", agent->coord_conn_oids[n]);
+ }
+
+ /*
+ * Released connections now sit idle in the pool; remember when the
+ * oldest went idle so maintenance can close them eventually, unless
+ * a timestamp is already recorded.
+ */
+ if (!force_destroy && agent->pool->oldest_idle == (time_t) 0)
+ agent->pool->oldest_idle = time(NULL);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Create new empty pool for a database/user combination.
+ * By default Database Pools have a size null so as to avoid interactions
+ * between PGXC nodes in the cluster (Co/Co, Dn/Dn and Co/Dn).
+ * Pool is increased at the first GET_CONNECTION message received.
+ * Returns the new DatabasePool (errors are raised via ereport(ERROR)).
+ *
+ * The pool and everything it references live in their own memory
+ * context, so destroying the pool is a single MemoryContextDelete().
+ * palloc/pstrdup ereport(ERROR) on out-of-memory and never return NULL,
+ * so the previous NULL checks (and the code after ereport(ERROR), which
+ * was unreachable) have been removed.
+ */
+static DatabasePool *
+create_database_pool(const char *database, const char *user_name, const char *pgoptions)
+{
+ MemoryContext oldcontext;
+ MemoryContext dbcontext;
+ DatabasePool *databasePool;
+ HASHCTL hinfo;
+
+ elog(DEBUG1, "Creating a connection pool for database %s, user %s,"
+ " with pgoptions %s", database, user_name, pgoptions);
+
+ dbcontext = AllocSetContextCreate(PoolerCoreContext,
+ "DB Context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ oldcontext = MemoryContextSwitchTo(dbcontext);
+
+ /* Allocate the pool descriptor and copy its identifying information */
+ databasePool = (DatabasePool *) palloc(sizeof(DatabasePool));
+ databasePool->mcxt = dbcontext;
+ /* Copy the database name */
+ databasePool->database = pstrdup(database);
+ /* Copy the user name */
+ databasePool->user_name = pstrdup(user_name);
+ /* Copy the pgoptions */
+ databasePool->pgoptions = pstrdup(pgoptions);
+ /* Reset the oldest_idle value */
+ databasePool->oldest_idle = (time_t) 0;
+ /* Init next reference */
+ databasePool->next = NULL;
+
+ /* Init node hashtable, keyed by node Oid */
+ MemSet(&hinfo, 0, sizeof(hinfo));
+
+ hinfo.keysize = sizeof(Oid);
+ hinfo.entrysize = sizeof(PGXCNodePool);
+ hinfo.hcxt = dbcontext;
+
+ databasePool->nodePools = hash_create("Node Pool", MaxDataNodes + MaxCoords,
+ &hinfo,
+ HASH_ELEM | HASH_CONTEXT | HASH_BLOBS);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ /* Insert into the list */
+ insert_database_pool(databasePool);
+
+ return databasePool;
+}
+
+
+/*
+ * Destroy the pool for the given database/user and free its memory.
+ * Returns 1 when a matching pool was found and destroyed, 0 otherwise.
+ */
+static int
+destroy_database_pool(const char *database, const char *user_name)
+{
+ DatabasePool *pool;
+ HASH_SEQ_STATUS hseq_status;
+ PGXCNodePool *nodePool;
+
+ elog(DEBUG1, "Destroy a connection pool to database %s, user %s",
+ database, user_name);
+
+ /* Unlink from the global list first */
+ pool = remove_database_pool(database, user_name);
+ if (pool == NULL)
+ return 0;
+
+ /* Close every pooled connection of every node pool */
+ hash_seq_init(&hseq_status, pool->nodePools);
+ while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)) != NULL)
+ destroy_node_pool(nodePool);
+
+ /* Everything was allocated in the pool's own context; drop it all */
+ MemoryContextDelete(pool->mcxt);
+ return 1;
+}
+
+
+/*
+ * Push a newly created database pool onto the head of the global list.
+ */
+static void
+insert_database_pool(DatabasePool *databasePool)
+{
+ Assert(databasePool);
+
+ /*
+ * Link in front of the current head.  When the list is empty,
+ * databasePools is NULL, which correctly terminates the chain.
+ */
+ databasePool->next = databasePools;
+ databasePools = databasePool;
+}
+
+/*
+ * reload_database_pools
+ * rebuild connection information for all database pools
+ *
+ * A database pool is reloaded as follows for each remote node:
+ *
+ * - node pool is deleted if the node has been deleted from catalog.
+ * Subsequently all its connections are dropped.
+ *
+ * - node pool is deleted if its port or host information is changed.
+ * Subsequently all its connections are dropped.
+ *
+ * - node pool is kept unchanged with existing connection information
+ * is not changed. However its index position in node pool is changed
+ * according to the alphabetical order of the node name in new
+ * cluster configuration.
+ *
+ * Backend sessions are responsible to reconnect to the pooler to update
+ * their agent with newest connection information.
+ *
+ * The session invoking connection information reload is reconnected
+ * and uploaded automatically after database pool reload. Other server
+ * sessions are signaled to reconnect to pooler and update their
+ * connection information separately.
+ *
+ * During reload process done internally on pooler, pooler is locked
+ * to forbid new connection requests.
+ */
+static void
+reload_database_pools(PoolAgent *agent)
+{
+ DatabasePool *databasePool;
+
+ elog(DEBUG1, "Reloading database pools");
+
+ /*
+ * Release node connections if any held. It is not guaranteed client session
+ * does the same so don't ever try to return them to pool and reuse.
+ * force_destroy = true: the slots are closed, not pooled.
+ */
+ agent_release_connections(agent, true);
+
+ /* Forget previously allocated node info (oid and connection arrays) */
+ MemoryContextReset(agent->mcxt);
+
+ /* and allocate new */
+ PgxcNodeGetOids(&agent->coord_conn_oids, &agent->dn_conn_oids,
+ &agent->num_coord_connections, &agent->num_dn_connections, false);
+
+ /*
+ * NOTE(review): this assumes CurrentMemoryContext is (or outlives)
+ * agent->mcxt at this point so the arrays survive until the next
+ * reload — confirm against the caller.
+ */
+ agent->coord_connections = (PGXCNodePoolSlot **)
+ palloc0(agent->num_coord_connections * sizeof(PGXCNodePoolSlot *));
+ agent->dn_connections = (PGXCNodePoolSlot **)
+ palloc0(agent->num_dn_connections * sizeof(PGXCNodePoolSlot *));
+
+ /*
+ * Scan the list and destroy any altered pool. They will be recreated
+ * upon subsequent connection acquisition.
+ */
+ databasePool = databasePools;
+ while (databasePool)
+ {
+ /* Update each database pool slot with new connection information */
+ HASH_SEQ_STATUS hseq_status;
+ PGXCNodePool *nodePool;
+
+ hash_seq_init(&hseq_status, databasePool->nodePools);
+ while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)))
+ {
+ /* Rebuild the connection string and compare with the stored one */
+ char *connstr_chk = build_node_conn_str(nodePool->nodeoid, databasePool);
+
+ if (connstr_chk == NULL || strcmp(connstr_chk, nodePool->connstr))
+ {
+ /* Node has been removed (NULL) or altered (string differs) */
+ destroy_node_pool(nodePool);
+ hash_search(databasePool->nodePools, &nodePool->nodeoid,
+ HASH_REMOVE, NULL);
+ }
+
+ if (connstr_chk)
+ pfree(connstr_chk);
+ }
+
+ databasePool = databasePool->next;
+ }
+}
+
+/*
+ * refresh_database_pools
+ * refresh information for all database pools
+ *
+ * Connection information refresh concerns all the database pools.
+ * A database pool is refreshed as follows for each remote node:
+ *
+ * - node pool is deleted if its port or host information is changed.
+ * Subsequently all its connections are dropped.
+ *
+ * If any other type of activity is found, we error out.
+ *
+ * XXX I don't see any cases that would error out. Isn't the comment
+ * simply obsolete?
+ *
+ * Returns POOL_REFRESH_SUCCESS, or POOL_REFRESH_FAILED when the agent's
+ * node set no longer matches the catalog or a node was deleted/unknown.
+ */
+static int
+refresh_database_pools(PoolAgent *agent)
+{
+ DatabasePool *databasePool;
+ Oid *coOids;
+ Oid *dnOids;
+ int numCo;
+ int numDn;
+ int res = POOL_REFRESH_SUCCESS;
+
+ elog(LOG, "Refreshing database pools");
+
+ /*
+ * re-check if agent's node information matches current contents of the
+ * shared memory table.
+ */
+ PgxcNodeGetOids(&coOids, &dnOids, &numCo, &numDn, false);
+
+ if (agent->num_coord_connections != numCo ||
+ agent->num_dn_connections != numDn ||
+ memcmp(agent->coord_conn_oids, coOids, numCo * sizeof(Oid)) ||
+ memcmp(agent->dn_conn_oids, dnOids, numDn * sizeof(Oid)))
+ res = POOL_REFRESH_FAILED;
+
+ /* Release palloc'ed memory */
+ pfree(coOids);
+ pfree(dnOids);
+
+ /*
+ * Scan the list and destroy any altered pool. They will be recreated
+ * upon subsequent connection acquisition.
+ */
+ databasePool = databasePools;
+ while (res == POOL_REFRESH_SUCCESS && databasePool)
+ {
+ HASH_SEQ_STATUS hseq_status;
+ PGXCNodePool *nodePool;
+
+ hash_seq_init(&hseq_status, databasePool->nodePools);
+ while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)))
+ {
+ char *connstr_chk = build_node_conn_str(nodePool->nodeoid, databasePool);
+
+ /*
+ * Since we re-checked the numbers above, we should not get
+ * the case of an ADDED or a DELETED node here..
+ */
+ if (connstr_chk == NULL)
+ {
+ elog(LOG, "Found a deleted node (%u)", nodePool->nodeoid);
+ /* terminate the seq scan before breaking out of it */
+ hash_seq_term(&hseq_status);
+ res = POOL_REFRESH_FAILED;
+ break;
+ }
+
+ if (strcmp(connstr_chk, nodePool->connstr))
+ {
+ elog(LOG, "Found an altered node (%u)", nodePool->nodeoid);
+ /*
+ * Node has been altered. First remove
+ * all references to this node from ALL the
+ * agents before destroying it..
+ *
+ * NOTE: this break leaves the seq scan unterminated;
+ * hash_seq_term is not called on this path — review.
+ */
+ if (!remove_all_agent_references(nodePool->nodeoid))
+ {
+ res = POOL_REFRESH_FAILED;
+ break;
+ }
+
+ destroy_node_pool(nodePool);
+ hash_search(databasePool->nodePools, &nodePool->nodeoid,
+ HASH_REMOVE, NULL);
+ }
+
+ if (connstr_chk)
+ pfree(connstr_chk);
+ }
+
+ databasePool = databasePool->next;
+ }
+ return res;
+}
+
+/*
+ * Drop every agent's reference to the given node, releasing the slot
+ * each agent holds for it.  Returns false when some agent knows neither
+ * a Datanode nor a Coordinator with this oid.
+ */
+static bool
+remove_all_agent_references(Oid nodeoid)
+{
+ int i, j;
+ bool res = true;
+
+ /*
+ * Identify if it's a coordinator or datanode first
+ * and get its index
+ */
+ for (i = 1; i <= agentCount; i++)
+ {
+ bool found = false;
+
+ PoolAgent *agent = poolAgents[i - 1];
+ /* Look for the node among the agent's Datanodes first */
+ for (j = 0; j < agent->num_dn_connections; j++)
+ {
+ if (agent->dn_conn_oids[j] == nodeoid)
+ {
+ found = true;
+ break;
+ }
+ }
+ if (found)
+ {
+ PGXCNodePoolSlot *slot = agent->dn_connections[j];
+ if (slot)
+ release_connection(agent->pool, slot, agent->dn_conn_oids[j], false);
+ agent->dn_connections[j] = NULL;
+ }
+ else
+ {
+ /* Not a Datanode for this agent; try the Coordinators */
+ for (j = 0; j < agent->num_coord_connections; j++)
+ {
+ if (agent->coord_conn_oids[j] == nodeoid)
+ {
+ found = true;
+ break;
+ }
+ }
+ if (found)
+ {
+ PGXCNodePoolSlot *slot = agent->coord_connections[j];
+ /*
+ * NOTE(review): the Datanode branch above releases with
+ * force_destroy = false while this one uses true — the
+ * asymmetry looks unintentional; confirm which is wanted.
+ */
+ if (slot)
+ release_connection(agent->pool, slot, agent->coord_conn_oids[j], true);
+ agent->coord_connections[j] = NULL;
+ }
+ else
+ {
+ elog(LOG, "Node not found! (%u)", nodeoid);
+ res = false;
+ }
+ }
+ }
+ return res;
+}
+
+/*
+ * Find the pool matching the given database, user and pgoptions triple.
+ * Returns NULL when no such pool exists.
+ */
+static DatabasePool *
+find_database_pool(const char *database, const char *user_name, const char *pgoptions)
+{
+ DatabasePool *pool;
+
+ /* Walk the global list until all three identifiers match */
+ for (pool = databasePools; pool != NULL; pool = pool->next)
+ {
+ if (strcmp(database, pool->database) == 0 &&
+ strcmp(user_name, pool->user_name) == 0 &&
+ strcmp(pgoptions, pool->pgoptions) == 0)
+ break;
+ }
+ return pool;
+}
+
+
+/*
+ * Unlink the pool for the given database/user from the global list and
+ * return it; the caller owns the pool afterwards.  Returns NULL when no
+ * matching pool is found.  Note that, unlike find_database_pool(),
+ * pgoptions does not participate in the match.
+ */
+static DatabasePool *
+remove_database_pool(const char *database, const char *user_name)
+{
+ DatabasePool *pool = databasePools;
+ DatabasePool *prev = NULL;
+
+ /* Locate the matching pool, remembering its predecessor */
+ while (pool != NULL &&
+ (strcmp(database, pool->database) != 0 ||
+ strcmp(user_name, pool->user_name) != 0))
+ {
+ prev = pool;
+ pool = pool->next;
+ }
+
+ if (pool != NULL)
+ {
+ /* Unchain the entry, updating the list head when needed */
+ if (prev)
+ prev->next = pool->next;
+ else
+ databasePools = pool->next;
+
+ pool->next = NULL;
+ }
+ return pool;
+}
+
+/*
+ * Acquire connection
+ *
+ * Hand out a free connection slot to the given node from the database
+ * pool, growing the pool when needed.  Each candidate slot is health
+ * checked before being handed out; broken slots are destroyed and the
+ * next candidate is tried.  Updates the shared node health map, and
+ * returns NULL when no usable connection could be obtained.
+ */
+static PGXCNodePoolSlot *
+acquire_connection(DatabasePool *dbPool, Oid node)
+{
+ PGXCNodePool *nodePool;
+ PGXCNodePoolSlot *slot;
+
+ Assert(dbPool);
+
+ nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_FIND,
+ NULL);
+
+ /*
+ * When a Coordinator pool is initialized by a Coordinator Postmaster,
+ * it has a NULL size and is below minimum size that is 1
+ * This is to avoid problems of connections between Coordinators
+ * when creating or dropping Databases.
+ */
+ if (nodePool == NULL || nodePool->freeSize == 0)
+ nodePool = grow_pool(dbPool, node);
+
+ slot = NULL;
+ /* Check available connections */
+ while (nodePool && nodePool->freeSize > 0)
+ {
+ int poll_result;
+
+ /* Pop the last free slot as the candidate */
+ slot = nodePool->slot[--(nodePool->freeSize)];
+
+ retry:
+ if (PQsocket((PGconn *) slot->conn) > 0)
+ {
+ /*
+ * Make sure connection is ok, destroy connection slot if there is a
+ * problem.
+ */
+ poll_result = pqReadReady((PGconn *) slot->conn);
+
+ if (poll_result == 0)
+ break; /* ok, no data — candidate accepted */
+ else if (poll_result < 0)
+ {
+ /* transient poll interruption: re-check the same slot */
+ if (errno == EAGAIN || errno == EINTR)
+ goto retry;
+
+ elog(WARNING, "Error in checking connection, errno = %d", errno);
+ }
+ else
+ elog(WARNING, "Unexpected data on connection, cleaning.");
+ }
+
+ /* Candidate failed the health check: drop it and try to refill */
+ destroy_slot(slot);
+ slot = NULL;
+
+ /* Decrement current max pool size */
+ (nodePool->size)--;
+ /* Ensure we are not below minimum size */
+ nodePool = grow_pool(dbPool, node);
+ }
+
+ if (slot == NULL)
+ {
+ elog(WARNING, "can not connect to node %u", node);
+
+ /*
+ * before returning, also update the shared health
+ * status field to indicate that this node is down
+ */
+ if (!PgxcNodeUpdateHealth(node, false))
+ elog(WARNING, "Could not update health status of node %u", node);
+ else
+ elog(WARNING, "Health map updated to reflect DOWN node (%u)", node);
+ }
+ else
+ PgxcNodeUpdateHealth(node, true);
+
+ return slot;
+}
+
+
+/*
+ * Give a connection slot back to the node pool it came from, or close
+ * it outright when force_destroy is set or the node pool is gone.
+ */
+static void
+release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
+ Oid node, bool force_destroy)
+{
+ PGXCNodePool *nodePool;
+
+ Assert(dbPool);
+ Assert(slot);
+
+ nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_FIND,
+ NULL);
+ if (nodePool == NULL)
+ {
+ /* Node was altered or dropped; the slot can no longer be reused */
+ destroy_slot(slot);
+ return;
+ }
+
+ if (force_destroy)
+ {
+ elog(DEBUG1, "Cleaning up connection from pool %s, closing", nodePool->connstr);
+ destroy_slot(slot);
+ /* Decrement pool size */
+ (nodePool->size)--;
+ /* Ensure we are not below minimum size */
+ grow_pool(dbPool, node);
+ }
+ else
+ {
+ /* Put the slot back into the free array and stamp its idle time */
+ nodePool->slot[(nodePool->freeSize)++] = slot;
+ slot->released = time(NULL);
+ }
+}
+
+
+/*
+ * Increase database pool size, create new if does not exist.
+ *
+ * Ensures the node pool for the given node exists and, while no free
+ * connection is available and the pool is below MaxPoolSize, opens new
+ * connections.  Returns the node pool; errors out on failure to build
+ * the connection string.
+ */
+static PGXCNodePool *
+grow_pool(DatabasePool *dbPool, Oid node)
+{
+ /* if error try to release idle connections and try again */
+ bool tryagain = true;
+ PGXCNodePool *nodePool;
+ bool found;
+ char *connstr;
+
+ Assert(dbPool);
+
+ nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node,
+ HASH_ENTER, &found);
+
+ /*
+ * Rebuild the connection string so it reflects the current catalog
+ * state.  Free the previous copy first: the old code overwrote the
+ * pointer unconditionally, leaking the old string every time this was
+ * called for an existing pool.
+ */
+ connstr = build_node_conn_str(node, dbPool);
+ if (!connstr)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("could not build connection string for node %u", node)));
+ }
+ if (found && nodePool->connstr)
+ pfree(nodePool->connstr);
+ nodePool->connstr = connstr;
+
+ if (!found)
+ {
+ /*
+ * Fresh hash entry: set up the (initially empty) slot array.
+ * palloc0 ereports on out-of-memory, so no NULL check is needed.
+ */
+ nodePool->slot = (PGXCNodePoolSlot **) palloc0(MaxPoolSize * sizeof(PGXCNodePoolSlot *));
+ nodePool->freeSize = 0;
+ nodePool->size = 0;
+ }
+
+ while (nodePool->freeSize == 0 && nodePool->size < MaxPoolSize)
+ {
+ PGXCNodePoolSlot *slot;
+
+ /* Allocate new slot (palloc ereports on out-of-memory) */
+ slot = (PGXCNodePoolSlot *) palloc(sizeof(PGXCNodePoolSlot));
+
+ /* If connection fails, be sure that slot is destroyed cleanly */
+ slot->xc_cancelConn = NULL;
+
+ /* Establish connection */
+ slot->conn = PGXCNodeConnect(nodePool->connstr);
+ if (!PGXCNodeConnected(slot->conn))
+ {
+ ereport(LOG,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("failed to connect to node, connection string (%s),"
+ " connection error (%s)",
+ nodePool->connstr,
+ PQerrorMessage((PGconn*) slot->conn))));
+ destroy_slot(slot);
+ /*
+ * If we failed to connect probably number of connections on the
+ * target node reached max_connections. Try and release idle
+ * connections and try again.
+ * We do not want to enter endless loop here and run maintenance
+ * procedure only once.
+ * It is not safe to run the maintenance procedure if no connections
+ * from that pool currently in use - the node pool may be destroyed
+ * in that case.
+ */
+ if (tryagain && nodePool->size > nodePool->freeSize)
+ {
+ pools_maintenance();
+ tryagain = false;
+ continue;
+ }
+ break;
+ }
+
+ /* Connection is up: arm the cancel handle and stamp the idle time */
+ slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn);
+ slot->released = time(NULL);
+ if (dbPool->oldest_idle == (time_t) 0)
+ dbPool->oldest_idle = slot->released;
+
+ /* Insert at the end of the pool */
+ nodePool->slot[(nodePool->freeSize)++] = slot;
+
+ /* Increase count of pool size */
+ (nodePool->size)++;
+ elog(DEBUG1, "Pooler: increased pool size to %d for pool %s",
+ nodePool->size,
+ nodePool->connstr);
+ }
+ return nodePool;
+}
+
+
+/*
+ * Close and free a single pool slot; safe to call with NULL.
+ */
+static void
+destroy_slot(PGXCNodePoolSlot *slot)
+{
+ if (slot == NULL)
+ return;
+
+ /* PQfreeCancel ignores a NULL cancel handle */
+ PQfreeCancel((PGcancel *)slot->xc_cancelConn);
+ PGXCNodeClose(slot->conn);
+ pfree(slot);
+}
+
+
+/*
+ * Close all free connections of a node pool and release its memory.
+ * The PGXCNodePool struct itself is a hash-table entry and is not
+ * freed here; the caller removes it from the hash.
+ */
+static void
+destroy_node_pool(PGXCNodePool *node_pool)
+{
+ int n;
+
+ if (!node_pool)
+ return;
+
+ /*
+ * At this point all agents using connections from this pool should
+ * already be closed.  If not, the Datanode connections assigned to
+ * them stay open and keep consuming Datanode resources.
+ */
+ elog(DEBUG1, "About to destroy node pool %s, current size is %d, %d connections are in use",
+ node_pool->connstr, node_pool->freeSize, node_pool->size - node_pool->freeSize);
+ if (node_pool->connstr)
+ pfree(node_pool->connstr);
+
+ if (node_pool->slot)
+ {
+ /* Only the free slots can be closed; busy ones are with agents */
+ for (n = 0; n < node_pool->freeSize; n++)
+ destroy_slot(node_pool->slot[n]);
+ pfree(node_pool->slot);
+ }
+}
+
+
+/*
+ * Main handling loop
+ *
+ * Sets up the pooler's Unix-domain listening socket(s), then loops:
+ * polls the listen socket plus one socket per agent, dispatches agent
+ * requests, accepts new agents, and runs pool maintenance when the
+ * poll times out.  Exits on postmaster death or shutdown request.
+ */
+static void
+PoolerLoop(void)
+{
+ StringInfoData input_message;
+ time_t last_maintenance = (time_t) 0;
+ int maintenance_timeout;
+ struct pollfd *pool_fd;
+
+#ifdef HAVE_UNIX_SOCKETS
+ if (Unix_socket_directories)
+ {
+ char *rawstring;
+ List *elemlist;
+ ListCell *l;
+ int success = 0;
+
+ /* Need a modifiable copy of Unix_socket_directories */
+ rawstring = pstrdup(Unix_socket_directories);
+
+ /* Parse string into list of directories */
+ if (!SplitDirectoriesString(rawstring, ',', &elemlist))
+ {
+ /* syntax error in list */
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid list syntax in parameter \"%s\"",
+ "unix_socket_directories")));
+ }
+
+ /* Try to listen in every configured socket directory */
+ foreach(l, elemlist)
+ {
+ char *socketdir = (char *) lfirst(l);
+ int saved_errno;
+
+ /* Connect to the pooler */
+ server_fd = pool_listen(PoolerPort, socketdir);
+ if (server_fd < 0)
+ {
+ saved_errno = errno;
+ ereport(WARNING,
+ (errmsg("could not create Unix-domain socket in directory \"%s\", errno %d, server_fd %d",
+ socketdir, saved_errno, server_fd)));
+ }
+ else
+ {
+ success++;
+ }
+ }
+
+ /*
+ * NOTE(review): %m here reports the current errno, which may be
+ * stale by the time this ereport runs — confirm intent.
+ */
+ if (!success && elemlist != NIL)
+ ereport(ERROR,
+ (errmsg("failed to start listening on Unix-domain socket for pooler: %m")));
+
+ list_free_deep(elemlist);
+ pfree(rawstring);
+ }
+#endif
+
+ /* One pollfd per possible agent, plus one for the listen socket */
+ pool_fd = (struct pollfd *) palloc((MaxConnections + 1) * sizeof(struct pollfd));
+
+ if (server_fd == -1)
+ {
+ /* log error */
+ return;
+ }
+
+ initStringInfo(&input_message);
+
+ /* Slot 0 always watches the listen socket for new agents */
+ pool_fd[0].fd = server_fd;
+ pool_fd[0].events = POLLIN;
+
+ for (;;)
+ {
+
+ int retval;
+ int i;
+
+ /*
+ * Emergency bailout if postmaster has died. This is to avoid the
+ * necessity for manual cleanup of all postmaster children.
+ */
+ if (!PostmasterIsAlive())
+ exit(1);
+
+ /* watch for incoming messages on every agent socket */
+ for (i = 1; i <= agentCount; i++)
+ {
+ PoolAgent *agent = poolAgents[i - 1];
+ int sockfd = Socket(agent->port);
+ pool_fd[i].fd = sockfd;
+ pool_fd[i].events = POLLIN;
+ }
+
+ if (PoolMaintenanceTimeout > 0)
+ {
+ int timeout_val;
+ double timediff;
+
+ /*
+ * Decide the timeout value based on when the last
+ * maintenance activity was carried out. If the last
+ * maintenance was done quite a while ago schedule the select
+ * with no timeout. It will serve any incoming activity
+ * and if there's none it will cause the maintenance
+ * to be scheduled as soon as possible
+ */
+ timediff = difftime(time(NULL), last_maintenance);
+
+ if (timediff > PoolMaintenanceTimeout)
+ timeout_val = 0;
+ else
+ timeout_val = PoolMaintenanceTimeout - rint(timediff);
+
+ /* poll() takes milliseconds */
+ maintenance_timeout = timeout_val * 1000;
+ }
+ else
+ maintenance_timeout = -1;
+ /*
+ * Emergency bailout if postmaster has died. This is to avoid the
+ * necessity for manual cleanup of all postmaster children.
+ *
+ * NOTE(review): duplicate of the check at the top of the loop;
+ * looks redundant — confirm before removing.
+ */
+ if (!PostmasterIsAlive())
+ exit(1);
+
+ /*
+ * Process any requests or signals received recently.
+ */
+ if (got_SIGHUP)
+ {
+ got_SIGHUP = false;
+ ProcessConfigFile(PGC_SIGHUP);
+ }
+
+ if (shutdown_requested)
+ {
+ /* Destroy agents back-to-front; agent_destroy shrinks the array */
+ for (i = agentCount - 1; agentCount > 0 && i >= 0; i--)
+ {
+ PoolAgent *agent = poolAgents[i];
+ agent_destroy(agent);
+ }
+
+ /* Drop every database pool; stop if one fails to be destroyed */
+ while (databasePools)
+ if (destroy_database_pool(databasePools->database,
+ databasePools->user_name) == 0)
+ break;
+
+ close(server_fd);
+ exit(0);
+ }
+
+ /* wait for event */
+ retval = poll(pool_fd, agentCount + 1, maintenance_timeout);
+ if (retval < 0)
+ {
+ if (errno == EINTR || errno == EAGAIN)
+ continue;
+ elog(FATAL, "poll returned with error %d", retval);
+ }
+
+ if (retval > 0)
+ {
+ /*
+ * Agent may be removed from the array while processing
+ * and trailing items are shifted, so scroll downward
+ * to avoid problem
+ */
+ for (i = agentCount - 1; agentCount > 0 && i >= 0; i--)
+ {
+ PoolAgent *agent = poolAgents[i];
+ int sockfd = Socket(agent->port);
+
+ /* Only dispatch if this pollfd still maps to the same socket */
+ if ((sockfd == pool_fd[i + 1].fd) &&
+ (pool_fd[i + 1].revents & POLLIN))
+ agent_handle_input(agent, &input_message);
+ }
+
+ /* New agent knocking on the listen socket */
+ if (pool_fd[0].revents & POLLIN)
+ agent_create();
+ }
+ else if (retval == 0)
+ {
+ /* maintenance timeout: shrink pools and ping nodes */
+ pools_maintenance();
+ PoolPingNodes();
+ last_maintenance = time(NULL);
+ }
+ }
+}
+
+/*
+ * Clean Connection in all Database Pools for given Datanode and Coordinator list.
+ *
+ * Drops the idle pooled connections to each listed node in every pool
+ * that matches the optional database/user filters, and unlocks the
+ * pooler.  Returns CLEAN_CONNECTION_COMPLETED, or
+ * CLEAN_CONNECTION_NOT_COMPLETED when connections to a listed node were
+ * still handed out to agents.
+ */
+int
+clean_connection(List *node_discard, const char *database, const char *user_name)
+{
+ DatabasePool *pool;
+ int res = CLEAN_CONNECTION_COMPLETED;
+
+ for (pool = databasePools; pool != NULL; pool = pool->next)
+ {
+ ListCell *lc;
+
+ /* Skip pools that do not match the requested database/user */
+ if ((database && strcmp(database, pool->database)) ||
+ (user_name && strcmp(user_name, pool->user_name)))
+ continue;
+
+ /*
+ * Clean each requested node pool
+ */
+ foreach(lc, node_discard)
+ {
+ PGXCNodePool *nodePool;
+ Oid node = lfirst_oid(lc);
+
+ nodePool = hash_search(pool->nodePools, &node, HASH_FIND,
+ NULL);
+ if (nodePool == NULL)
+ continue;
+
+ /* Connections currently handed out to agents cannot be dropped */
+ if (nodePool->freeSize < nodePool->size)
+ {
+ elog(WARNING, "Pool of Database %s is using Datanode %u connections",
+ pool->database, node);
+ res = CLEAN_CONNECTION_NOT_COMPLETED;
+ }
+
+ /* Destroy the idle connections currently in the node pool */
+ if (nodePool->slot)
+ {
+ int n;
+
+ for (n = 0; n < nodePool->freeSize; n++)
+ destroy_slot(nodePool->slot[n]);
+ }
+ nodePool->size -= nodePool->freeSize;
+ nodePool->freeSize = 0;
+ }
+ }
+
+ /* Release lock on Pooler, to allow transactions to connect again. */
+ is_pool_locked = false;
+ return res;
+}
+
+/*
+ * Take a Lock on Pooler.
+ * Abort PIDs registered with the agents for the given database.
+ * Send back to client list of PIDs signaled to watch them.
+ *
+ * 'pid' is the caller's own backend PID and is excluded from the kill
+ * list; 'database' and 'user_name' further filter the targeted agents
+ * when non-NULL.  The returned array is palloc'ed and *len receives the
+ * number of PIDs actually signalled.
+ */
+int *
+abort_pids(int *len, int pid, const char *database, const char *user_name)
+{
+ int *pids = NULL;
+ int i = 0;
+ int count;
+
+ Assert(!is_pool_locked);
+ Assert(agentCount > 0);
+
+ is_pool_locked = true;
+
+ /*
+ * Allocate room for one entry per agent: when the caller's PID does
+ * not belong to any agent, every agent may be signalled, so the
+ * previous (agentCount - 1) allocation could be overrun by one entry.
+ */
+ pids = (int *) palloc(agentCount * sizeof(int));
+
+ /* Send a SIGTERM signal to all processes of Pooler agents except this one */
+ for (count = 0; count < agentCount; count++)
+ {
+ if (poolAgents[count]->pid == pid)
+ continue;
+
+ if (database && strcmp(poolAgents[count]->pool->database, database) != 0)
+ continue;
+
+ if (user_name && strcmp(poolAgents[count]->pool->user_name, user_name) != 0)
+ continue;
+
+ if (kill(poolAgents[count]->pid, SIGTERM) < 0)
+ elog(ERROR, "kill(%ld,%d) failed: %m",
+ (long) poolAgents[count]->pid, SIGTERM);
+
+ pids[i++] = poolAgents[count]->pid;
+ }
+
+ *len = i;
+
+ return pids;
+}
+
+/*
+ * Graceful-shutdown signal handler (presumably SIGTERM — the handler
+ * registration is not visible in this file chunk).  Only sets a flag;
+ * PoolerLoop notices shutdown_requested and tears down agents and
+ * database pools before exiting.
+ */
+static void
+pooler_die(SIGNAL_ARGS)
+{
+ shutdown_requested = true;
+}
+
+
+/*
+ * Emergency-exit signal handler: block any further signals and
+ * terminate immediately with status 2, performing no cleanup at all.
+ */
+static void
+pooler_quickdie(SIGNAL_ARGS)
+{
+ PG_SETMASK(&BlockSig);
+ exit(2);
+}
+
+
+/*
+ * SIGHUP handler: flag a pending configuration reload; PoolerLoop
+ * calls ProcessConfigFile(PGC_SIGHUP) when it sees got_SIGHUP set.
+ */
+static void
+pooler_sighup(SIGNAL_ARGS)
+{
+ got_SIGHUP = true;
+}
+
+/*
+ * Given node identifier, dbname and user name build connection string.
+ * Get node connection details from the shared memory node table.
+ * Returns a palloc'ed string, or NULL when the node has no definition
+ * any more (e.g. it was dropped).
+ */
+static char *
+build_node_conn_str(Oid node, DatabasePool *dbPool)
+{
+ NodeDefinition *nodeDef;
+ char *connstr;
+
+ nodeDef = PgxcNodeGetDefinition(node);
+ if (nodeDef == NULL)
+ {
+ /* No such definition, node is dropped? */
+ return NULL;
+ }
+
+ /* Assemble the libpq connection string from the node definition */
+ connstr = PGXCNodeConnStr(NameStr(nodeDef->nodehost),
+ nodeDef->nodeport,
+ dbPool->database,
+ dbPool->user_name,
+ dbPool->pgoptions,
+ IS_PGXC_COORDINATOR ? "coordinator" : "datanode",
+ PGXCNodeName);
+ pfree(nodeDef);
+
+ return connstr;
+}
+
+/*
+ * Check all pooled connections, and close those that have been released
+ * more than PooledConnKeepAlive seconds ago.
+ * Return true if the shrink operation closed all the connections and the
+ * pool can be destroyed, false if there are still connections or the
+ * pool is in use.
+ */
+static bool
+shrink_pool(DatabasePool *pool)
+{
+ time_t now = time(NULL);
+ HASH_SEQ_STATUS hseq_status;
+ PGXCNodePool *nodePool;
+ int i;
+ bool empty = true;
+
+ /* Negative PooledConnKeepAlive disables automatic connection cleanup */
+ if (PoolConnKeepAlive < 0)
+ return false;
+
+ /* Recomputed below from the slots that survive the shrink */
+ pool->oldest_idle = (time_t) 0;
+ hash_seq_init(&hseq_status, pool->nodePools);
+ while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)))
+ {
+ /* Go thru the free slots and destroy those that are free too long */
+ for (i = 0; i < nodePool->freeSize; )
+ {
+ PGXCNodePoolSlot *slot = nodePool->slot[i];
+
+ if (difftime(now, slot->released) > PoolConnKeepAlive)
+ {
+ /* connection is idle for long, close it */
+ destroy_slot(slot);
+ /* reduce pool size and total number of connections */
+ (nodePool->freeSize)--;
+ (nodePool->size)--;
+ /*
+ * Swap-remove: move the last free connection into the freed
+ * position (if not already last).  i is deliberately NOT
+ * advanced so the moved slot gets examined on the next pass.
+ */
+ if (i < nodePool->freeSize)
+ nodePool->slot[i] = nodePool->slot[nodePool->freeSize];
+ }
+ else
+ {
+ /* Keeper: track the earliest release time seen so far */
+ if (pool->oldest_idle == (time_t) 0 ||
+ difftime(pool->oldest_idle, slot->released) > 0)
+ pool->oldest_idle = slot->released;
+
+ i++;
+ }
+ }
+ if (nodePool->size > 0)
+ empty = false;
+ else
+ {
+ /* Node pool fully drained: remove it from the hash */
+ destroy_node_pool(nodePool);
+ hash_search(pool->nodePools, &nodePool->nodeoid, HASH_REMOVE, NULL);
+ }
+ }
+
+ /*
+ * Last check, if any active agent is referencing the pool do not allow to
+ * destroy it, because there will be a problem if session wakes up and try
+ * to get a connection from non existing pool.
+ * If all such sessions will eventually disconnect the pool will be
+ * destroyed during next maintenance procedure.
+ */
+ if (empty)
+ {
+ for (i = 0; i < agentCount; i++)
+ {
+ if (poolAgents[i]->pool == pool)
+ return false;
+ }
+ }
+
+ return empty;
+}
+
+
+/*
+ * Scan connection pools and release connections which are idle for long.
+ * If a pool gets empty after releasing connections it is destroyed.
+ */
+static void
+pools_maintenance(void)
+{
+	DatabasePool *prev = NULL;
+	DatabasePool *curr = databasePools;
+	time_t		now = time(NULL);
+	int			count = 0;
+
+	/* Iterate over the pools */
+	while (curr)
+	{
+		/*
+		 * If current pool has connections to close and it is emptied after
+		 * shrink remove the pool and free memory.
+		 * Otherwise move to next pool.
+		 */
+		if (curr->oldest_idle != (time_t) 0 &&
+				difftime(now, curr->oldest_idle) > PoolConnKeepAlive &&
+				shrink_pool(curr))
+		{
+			/* unlink the pool from the list before deleting its memory */
+			MemoryContext mem = curr->mcxt;
+			curr = curr->next;
+			if (prev)
+				prev->next = curr;
+			else
+				databasePools = curr;
+			MemoryContextDelete(mem);
+			count++;
+		}
+		else
+		{
+			prev = curr;
+			curr = curr->next;
+		}
+	}
+	elog(DEBUG1, "Pool maintenance, done in %f seconds, removed %d pools",
+			difftime(time(NULL), now), count);
+}
+
+/*
+ * check_persistent_connections
+ *	GUC check hook for persistent_datanode_connections: the setting is not
+ *	supported on datanodes, so silently force it to off there (with a
+ *	warning).  Always accepts the value, hence always returns true.
+ */
+bool
+check_persistent_connections(bool *newval, void **extra, GucSource source)
+{
+	if (*newval && IS_PGXC_DATANODE)
+	{
+		elog(WARNING, "persistent_datanode_connections = ON is currently not "
+				"supported on datanodes - ignoring");
+		*newval = false;
+	}
+	return true;
+}
--- /dev/null
- static LWLockTranche SharedQueueLocksTranche;
+/*-------------------------------------------------------------------------
+ *
+ * squeue.c
+ *
+ * Shared queue is for data exchange in shared memory between sessions,
+ * one of which is a producer, providing data rows. Others are consumer agents -
+ * sessions initiated from other datanodes, the main purpose of them is to read
+ * rows from the shared queue and send them to the parent data node.
+ * The producer is usually a consumer at the same time, it sends back tuples
+ * to the parent node without putting it to the queue.
+ *
+ * Copyright (c) 2012-2014, TransLattice, Inc.
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <sys/time.h>
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "access/gtm.h"
+#include "catalog/pgxc_node.h"
+#include "commands/prepare.h"
+#include "executor/executor.h"
+#include "nodes/pg_list.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/pgxcnode.h"
+#include "pgxc/squeue.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "utils/hsearch.h"
+#include "utils/resowner.h"
++#include "pgstat.h"
+
+
+/* Number of shared queues and per-queue size; defaults overridable via GUCs */
+int			NSQueues = 64;
+int			SQueueSize = 64;
+
+/*
+ * Special tuple-length marker; presumably used by the long-tuple transfer
+ * protocol (sq_push_long_tuple/sq_pull_long_tuple) — usage not visible in
+ * this chunk, confirm against the rest of the file.
+ */
+#define LONG_TUPLE -42
+
+/* Per-consumer synchronization objects */
+typedef struct ConsumerSync
+{
+	LWLock	   *cs_lwlock;		/* Synchronize access to the consumer queue */
+	Latch		cs_latch;		/* The latch consumer is waiting on */
+} ConsumerSync;
+
+
+/*
+ * Shared memory structure to store synchronization info to access shared queues
+ */
+typedef struct SQueueSync
+{
+	void	   *queue;			/* NULL if not assigned to any queue */
+	Latch		sqs_producer_latch;	/* the latch producer is waiting on */
+	ConsumerSync sqs_consumer_sync[0];	/* actual length is MaxDataNodes-1,
+										 * which is not known at compile time */
+} SQueueSync;
+
+/* Consumer queue states (stored in ConsState.cs_status) */
+/* Both producer and consumer are working */
+#define CONSUMER_ACTIVE 0
+/* Producer has finished work successfully and waits for consumer */
+#define CONSUMER_EOF 1
+/* Producer encountered error and waits for consumer to disconnect */
+#define CONSUMER_ERROR 2
+/* Consumer is finished with the query, OK to unbind */
+#define CONSUMER_DONE 3
+
+
+/* State of a single consumer */
+typedef struct
+{
+	int			cs_pid;			/* Process id of the consumer session */
+	int			cs_node;		/* Node id of the consumer parent */
+	/*
+	 * Queue state.  The queue is a cyclic buffer storing tuples in the
+	 * DataRow format: each entry is the tuple length in host byte order
+	 * (the length is never sent over the network) followed by the tuple
+	 * bytes.
+	 */
+	int			cs_ntuples;		/* Number of tuples in the queue */
+	int			cs_status;		/* See CONSUMER_* defines above */
+	char	   *cs_qstart;		/* Where consumer queue begins */
+	int			cs_qlength;		/* The size of the consumer queue */
+	int			cs_qreadpos;	/* The read position in the consumer queue */
+	int			cs_qwritepos;	/* The write position in the consumer queue */
+#ifdef SQUEUE_STAT
+	long		stat_writes;
+	long		stat_reads;
+	long		stat_buff_writes;
+	long		stat_buff_reads;
+	long		stat_buff_returns;
+#endif
+} ConsState;
+
+/* Shared queue header */
+typedef struct SQueueHeader
+{
+	char		sq_key[SQUEUE_KEYSIZE]; /* Hash entry key should be at the
+								 * beginning of the hash entry */
+	int			sq_pid; 		/* Process id of the producer session */
+	int			sq_nodeid;		/* Node id of the producer parent */
+	SQueueSync *sq_sync;		/* Associated synchronization objects */
+	int			sq_refcnt;		/* Reference count to this entry */
+#ifdef SQUEUE_STAT
+	bool		stat_finish;
+	long		stat_paused;
+#endif
+	int			sq_nconsumers;	/* Number of consumers */
+	ConsState	sq_consumers[0];/* variable length array */
+} SQueueHeader;
+
+
+/*
+ * Hash table where all shared queues are stored. Key is the queue name, value
+ * is SharedQueue
+ */
+static HTAB *SharedQueues = NULL;
+/* LWLock array backing the per-consumer locks; allocated in shared memory */
+static LWLockPadded *SQueueLocks = NULL;
-	SharedQueueLocksTranche.name = "Shared Queue Locks";
-	SharedQueueLocksTranche.array_base = SQueueLocks;
-	SharedQueueLocksTranche.array_stride = sizeof(LWLockPadded);
-
+
+/*
+ * Pool of synchronization items
+ */
+static void *SQueueSyncs;
+
+/* Bytes occupied by one SQueueSync entry (header plus consumer array) */
+#define SQUEUE_SYNC_SIZE \
+	(sizeof(SQueueSync) + (MaxDataNodes-1) * sizeof(ConsumerSync))
+
+/* Address of the idx'th entry in the SQueueSyncs pool */
+#define GET_SQUEUE_SYNC(idx) \
+	((SQueueSync *) (((char *) SQueueSyncs) + (idx) * SQUEUE_SYNC_SIZE))
+
+/* Size of a SharedQueue header for the given number of consumers */
+#define SQUEUE_HDR_SIZE(nconsumers) \
+	(sizeof(SQueueHeader) + (nconsumers) * sizeof(ConsState))
+
+/*
+ * Free bytes remaining in a consumer's cyclic queue: the gap between the
+ * write and read positions (accounting for wrap-around) when the queue
+ * holds tuples, or the whole buffer when it is empty.
+ */
+#define QUEUE_FREE_SPACE(cstate) \
+	((cstate)->cs_ntuples > 0 ? \
+		((cstate)->cs_qreadpos >= (cstate)->cs_qwritepos ? \
+			(cstate)->cs_qreadpos - (cstate)->cs_qwritepos : \
+			(cstate)->cs_qlength + (cstate)->cs_qreadpos \
+				- (cstate)->cs_qwritepos) \
+		: (cstate)->cs_qlength)
+
+/* Copy len bytes from buf into the cyclic queue, wrapping at the buffer end */
+#define QUEUE_WRITE(cstate, len, buf) \
+	do \
+	{ \
+		if ((cstate)->cs_qwritepos + (len) <= (cstate)->cs_qlength) \
+		{ \
+			memcpy((cstate)->cs_qstart + (cstate)->cs_qwritepos, buf, len); \
+			(cstate)->cs_qwritepos += (len); \
+			if ((cstate)->cs_qwritepos == (cstate)->cs_qlength) \
+				(cstate)->cs_qwritepos = 0; \
+		} \
+		else \
+		{ \
+			int part = (cstate)->cs_qlength - (cstate)->cs_qwritepos; \
+			memcpy((cstate)->cs_qstart + (cstate)->cs_qwritepos, buf, part); \
+			(cstate)->cs_qwritepos = (len) - part; \
+			memcpy((cstate)->cs_qstart, (buf) + part, (cstate)->cs_qwritepos); \
+		} \
+	} while(0)
+
+
+/* Copy len bytes out of the cyclic queue into buf, wrapping at the buffer end */
+#define QUEUE_READ(cstate, len, buf) \
+	do \
+	{ \
+		if ((cstate)->cs_qreadpos + (len) <= (cstate)->cs_qlength) \
+		{ \
+			memcpy(buf, (cstate)->cs_qstart + (cstate)->cs_qreadpos, len); \
+			(cstate)->cs_qreadpos += (len); \
+			if ((cstate)->cs_qreadpos == (cstate)->cs_qlength) \
+				(cstate)->cs_qreadpos = 0; \
+		} \
+		else \
+		{ \
+			int part = (cstate)->cs_qlength - (cstate)->cs_qreadpos; \
+			memcpy(buf, (cstate)->cs_qstart + (cstate)->cs_qreadpos, part); \
+			(cstate)->cs_qreadpos = (len) - part; \
+			memcpy((buf) + part, (cstate)->cs_qstart, (cstate)->cs_qreadpos); \
+		} \
+	} while(0)
+
+
+/* Helpers for transferring tuples larger than the consumer queue */
+static bool sq_push_long_tuple(ConsState *cstate, RemoteDataRow datarow);
+static void sq_pull_long_tuple(ConsState *cstate, RemoteDataRow datarow,
+					ConsumerSync *sync);
+
+/*
+ * SharedQueuesInit
+ *	Initialize the reference on the shared memory hash table where all shared
+ *	queues are stored. Invoked during postmaster initialization.
+ */
+void
+SharedQueuesInit(void)
+{
+	HASHCTL		info;
+	int			hash_flags;
+	bool		found;
+
+	info.keysize = SQUEUE_KEYSIZE;
+	info.entrysize = SQUEUE_SIZE;
+
+	/*
+	 * Create hash table of fixed size to avoid running out of
+	 * SQueueSyncs
+	 */
+	hash_flags = HASH_ELEM | HASH_FIXED_SIZE;
+
+	SharedQueues = ShmemInitHash("Shared Queues", NUM_SQUEUES,
+								 NUM_SQUEUES, &info, hash_flags);
+
+	/*
+	 * Synchronization stuff is in separate structure because we need to
+	 * initialize all items now while in the postmaster.
+	 * The structure is actually an array, each array entry is assigned to
+	 * each instance of SharedQueue in use.
+	 */
+	SQueueSyncs = ShmemInitStruct("Shared Queues Sync",
+								  SQUEUE_SYNC_SIZE * NUM_SQUEUES,
+								  &found);
+	if (!found)
+	{
+		int	i, l;
+		int	nlocks = (NUM_SQUEUES * (MaxDataNodes-1));
+		bool	foundLocks;
+
+		/* Initialize LWLocks for queues */
+		SQueueLocks = (LWLockPadded *) ShmemInitStruct("Shared Queue Locks",
+								sizeof(LWLockPadded) * nlocks, &foundLocks);
+
+		/* either both syncs and locks, or none of them */
+		Assert(! foundLocks);
+
-		LWLockRegisterTranche(LWTRANCHE_SHARED_QUEUES, &SharedQueueLocksTranche);
+		/* Register the tranche in the main tranches array */
-		WaitLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1);
++		LWLockRegisterTranche(LWTRANCHE_SHARED_QUEUES, "Shared Queue Locks");
+
+		/* Hand one lock and one latch to every (queue, consumer) pair */
+		l = 0;
+		for (i = 0; i < NUM_SQUEUES; i++)
+		{
+			SQueueSync *sqs = GET_SQUEUE_SYNC(i);
+			int			j;
+
+			sqs->queue = NULL;
+			InitSharedLatch(&sqs->sqs_producer_latch);
+			for (j = 0; j < MaxDataNodes-1; j++)
+			{
+				InitSharedLatch(&sqs->sqs_consumer_sync[j].cs_latch);
+
+				LWLockInitialize(&(SQueueLocks[l]).lock,
+								 LWTRANCHE_SHARED_QUEUES);
+
+				sqs->sqs_consumer_sync[j].cs_lwlock = &(SQueueLocks[l++]).lock;
+			}
+		}
+	}
+}
+
+
+/*
+ * SharedQueueShmemSize
+ *	Estimate the shared-memory space needed by this module: the pool of
+ *	SQueueSync entries plus the shared queue hash table.
+ */
+Size
+SharedQueueShmemSize(void)
+{
+	Size		sqs_size;
+
+	sqs_size = mul_size(NUM_SQUEUES, SQUEUE_SYNC_SIZE);
+	return add_size(sqs_size, hash_estimate_size(NUM_SQUEUES, SQUEUE_SIZE));
+}
+
+/*
+ * SharedQueueAcquire
+ *	Reserve a named shared queue for future data exchange between processes
+ * supplying tuples to remote Datanodes. Invoked when a remote query plan is
+ * registered on the Datanode. The number of consumers is known at this point,
+ * so shared queue may be formatted during reservation. The first process that
+ * is acquiring the shared queue on the Datanode does the formatting.
+ */
+void
+SharedQueueAcquire(const char *sqname, int ncons)
+{
+	bool		found;
+	SharedQueue sq;
+	int			trycount = 0;
+
+	Assert(IsConnFromDatanode());
+	Assert(ncons > 0);
+
+/*
+ * Retry point: we come back here (after releasing SQueuesLock and sleeping)
+ * when the queue name is still bound by the producer of a previous execution.
+ */
+tryagain:
+	LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
+
+	sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_ENTER, &found);
+	if (!sq)
+		ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
+				errmsg("out of shared queue, please increase shared_queues")));
+
+	/* First process acquiring queue should format it */
+	if (!found)
+	{
+		int		qsize;   /* Size of one queue */
+		int		i;
+		char   *heapPtr;
+
+		elog(DEBUG1, "Create a new SQueue %s and format it for %d consumers", sqname, ncons);
+
+		/* Initialize the shared queue */
+		sq->sq_pid = 0;
+		sq->sq_nodeid = -1;
+		sq->sq_refcnt = 1;
+#ifdef SQUEUE_STAT
+		sq->stat_finish = false;
+		sq->stat_paused = 0;
+#endif
+		/*
+		 * Assign sync object (latches to wait on)
+		 * XXX We may want to optimize this and do smart search instead of
+		 * iterating the array.
+		 */
+		for (i = 0; i < NUM_SQUEUES; i++)
+		{
+			SQueueSync *sqs = GET_SQUEUE_SYNC(i);
+			if (sqs->queue == NULL)
+			{
+				sqs->queue = (void *) sq;
+				sq->sq_sync = sqs;
+				break;
+			}
+		}
+
+		/* hash table and sync pool have equal capacity, so one must be free */
+		Assert(sq->sq_sync != NULL);
+
+		sq->sq_nconsumers = ncons;
+		/* Determine queue size for a single consumer */
+		qsize = (SQUEUE_SIZE - SQUEUE_HDR_SIZE(sq->sq_nconsumers)) / sq->sq_nconsumers;
+
+		heapPtr = (char *) sq;
+		/* Skip header */
+		heapPtr += SQUEUE_HDR_SIZE(sq->sq_nconsumers);
+		/* Set up consumer queues */
+		for (i = 0; i < ncons; i++)
+		{
+			ConsState  *cstate = &(sq->sq_consumers[i]);
+
+			cstate->cs_pid = 0;
+			cstate->cs_node = -1;
+			cstate->cs_ntuples = 0;
+			cstate->cs_status = CONSUMER_ACTIVE;
+			cstate->cs_qstart = heapPtr;
+			cstate->cs_qlength = qsize;
+			cstate->cs_qreadpos = 0;
+			cstate->cs_qwritepos = 0;
+			heapPtr += qsize;
+		}
+		Assert(heapPtr <= ((char *) sq) + SQUEUE_SIZE);
+	}
+	else
+	{
+		int		i;
+
+		elog(DEBUG1, "Found an existing SQueue %s - (sq_pid:%d, sq_nodeid:%d,"
+				" sq_nconsumers:%d",
+				sqname, sq->sq_pid, sq->sq_nodeid, sq->sq_nconsumers);
+
+		for (i = 0; i < sq->sq_nconsumers; i++)
+		{
+			elog(DEBUG1, "SQueue %s, consumer (%d) information (cs_pid:%d,"
+					" cs_node:%d, cs_ntuples:%d, cs_status: %d",
+					sqname, i,
+					sq->sq_consumers[i].cs_pid,
+					sq->sq_consumers[i].cs_node,
+					sq->sq_consumers[i].cs_ntuples,
+					sq->sq_consumers[i].cs_status);
+		}
+
+		/*
+		 * A race condition is possible here. The previous operation might use
+		 * the same Shared Queue name if that was different execution of the
+		 * same Portal. So here we should try to determine if that Shared Queue
+		 * belongs to this execution or that is not-yet-released Shared Queue
+		 * of previous operation.
+		 * Though at the moment I am not sure, but I believe the BIND stage is
+		 * only happening after completion of ACQUIRE stage, so it is enough
+		 * to verify the producer (the very first node that binds) is not bound
+		 * yet. If it is bound, sleep for a moment and try again. No reason to
+		 * sleep longer, the producer needs just a quantum of CPU time to UNBIND
+		 * itself.
+		 */
+		if (sq->sq_pid != 0)
+		{
+			int		i;
+			bool	old_squeue = true;
+
+			PGXC_PARENT_NODE_ID = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE,
+					&PGXC_PARENT_NODE_TYPE);
+			for (i = 0; i < sq->sq_nconsumers; i++)
+			{
+				ConsState *cstate = &(sq->sq_consumers[i]);
+				if (cstate->cs_node == PGXC_PARENT_NODE_ID)
+				{
+					SQueueSync *sqsync = sq->sq_sync;
+
+					LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
+								  LW_EXCLUSIVE);
+					/* verify status */
+					if (cstate->cs_status != CONSUMER_DONE)
+						old_squeue = false;
+
+					LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+					break;
+				}
+			}
+			if (old_squeue)
+			{
+				LWLockRelease(SQueuesLock);
+				pg_usleep(1000000L);
+				elog(DEBUG1, "SQueue race condition, give the old producer to "
+						"finish the work and retry again");
+				trycount++;
+				if (trycount >= 10)
+					elog(ERROR, "Couldn't resolve SQueue race condition after"
+							" %d tries", trycount);
+				goto tryagain;
+			}
+		}
+		sq->sq_refcnt++;
+	}
+	LWLockRelease(SQueuesLock);
+}
+
+
+/*
+ * SharedQueueBind
+ *	Bind to the shared queue specified by sqname either as a consumer or as a
+ * producer. The first process that binds to the shared queue becomes a producer
+ * and receives the consumer map, others become consumers and receive queue
+ * indexes to read tuples from.
+ * The consNodes int list identifies the nodes involved in the current step.
+ * The distNodes int list describes result distribution of the current step.
+ * The consNodes should be a subset of distNodes.
+ * The myindex and consMap parameters are binding results. If caller process
+ * is bound to the query as a producer myindex is set to -1 and index of the
+ * each consumer (order number in the consNodes) is stored to the consMap array
+ * at the position of the node in the distNodes. For the producer node
+ * SQ_CONS_SELF is stored, nodes from distNodes list which are not members of
+ * consNodes or if it was reported they won't read results, they are represented
+ * as SQ_CONS_NONE.
+ */
+SharedQueue
+SharedQueueBind(const char *sqname, List *consNodes,
+				List *distNodes, int *myindex, int *consMap)
+{
+	bool		found;
+	SharedQueue sq;
+
+	LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
+
+	PGXC_PARENT_NODE_ID = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE,
+			&PGXC_PARENT_NODE_TYPE);
+	sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_FIND, &found);
+	if (!found)
+		elog(PANIC, "Shared queue %s not found", sqname);
+	if (sq->sq_pid == 0)
+	{
+		/* Producer */
+		int			i;
+		ListCell   *lc;
+
+		Assert(consMap);
+
+		elog(DEBUG1, "Bind node %s to squeue of step %s as a producer",
+			 PGXC_PARENT_NODE, sqname);
+
+		/* Initialize the shared queue */
+		sq->sq_pid = MyProcPid;
+		sq->sq_nodeid = PGXC_PARENT_NODE_ID;
+		OwnLatch(&sq->sq_sync->sqs_producer_latch);
+
+		i = 0;
+		foreach(lc, distNodes)
+		{
+			int			nodeid = lfirst_int(lc);
+
+			/*
+			 * Producer won't go to shared queue to hand off tuple to itself,
+			 * so we do not need to create queue for that entry.
+			 */
+			if (nodeid == PGXC_PARENT_NODE_ID)
+			{
+				/* Producer must be in the consNodes list */
+				Assert(list_member_int(consNodes, nodeid));
+				elog(DEBUG1, "SQueue %s consumer @%d is set to self",
+					 sqname, i);
+				consMap[i++] = SQ_CONS_SELF;
+			}
+			/*
+			 * This node may connect as a consumer, store consumer id to the map
+			 * and initialize consumer queue
+			 */
+			else if (list_member_int(consNodes, nodeid))
+			{
+				ConsState  *cstate;
+				int			j;
+
+				for (j = 0; j < sq->sq_nconsumers; j++)
+				{
+					cstate = &(sq->sq_consumers[j]);
+					if (cstate->cs_node == nodeid)
+					{
+						/* The process already reported that queue won't read */
+						elog(DEBUG1, "Node %d of SQueue %s is released already "
+								"at consumer %d, cs_status %d",
+								nodeid, sqname, j, cstate->cs_status);
+						consMap[i++] = SQ_CONS_NONE;
+						break;
+					}
+					else if (cstate->cs_node == -1)
+					{
+						/* found unused slot, assign the consumer to it */
+						elog(DEBUG1, "Node %d of SQueue %s is bound at consumer "
+								"%d, cs_status %d",
+								nodeid, sqname, j, cstate->cs_status);
+						consMap[i++] = j;
+						cstate->cs_node = nodeid;
+						break;
+					}
+				}
+			}
+			/*
+			 * Consumer from this node won't ever connect as upper level step
+			 * is not executed on the node. Discard results that may go to that
+			 * node, if any.
+			 */
+			else
+			{
+				elog(DEBUG1, "Node %d of SQueue %s is not in the "
+						"redistribution list and hence would never connect",
+						nodeid, sqname);
+				consMap[i++] = SQ_CONS_NONE;
+			}
+		}
+
+		if (myindex)
+			*myindex = -1;
+
+		/*
+		 * Increment the refcnt only when producer binds. This is a bit
+		 * asymmetrical, but the way things are currently setup, a consumer
+		 * though calls SharedQueueBind, never calls SharedQueueUnBind. The
+		 * unbinding is done only by the producer after it waits for all
+		 * consumers to finish.
+		 *
+		 * XXX This ought to be fixed someday to simplify things in Shared
+		 * Queue handling
+		 */
+		sq->sq_refcnt++;
+	}
+	else
+	{
+		int			nconsumers;
+		ListCell   *lc;
+
+		/* Producer should be different process */
+		Assert(sq->sq_pid != MyProcPid);
+
+		elog(DEBUG1, "SQueue %s has a bound producer from node %d, pid %d",
+			 sqname, sq->sq_nodeid, sq->sq_pid);
+		elog(DEBUG1, "Bind node %s to SQueue %s as a consumer %d", PGXC_PARENT_NODE, sqname, sq->sq_pid);
+
+		/* Sanity checks */
+		Assert(myindex);
+		*myindex = -1;
+		/* Ensure the passed in consumer list matches the queue */
+		nconsumers = 0;
+		foreach (lc, consNodes)
+		{
+			int			nodeid = lfirst_int(lc);
+			int			i;
+
+			if (nodeid == sq->sq_nodeid)
+			{
+				/*
+				 * This node is a producer, it should be in the consumer list,
+				 * but no consumer queue for it
+				 */
+				continue;
+			}
+
+			/* find consumer queue for the node */
+			for (i = 0; i < sq->sq_nconsumers; i++)
+			{
+				ConsState *cstate = &(sq->sq_consumers[i]);
+				if (cstate->cs_node == nodeid)
+				{
+					nconsumers++;
+					if (nodeid == PGXC_PARENT_NODE_ID)
+					{
+						/*
+						 * Current consumer queue is that from which current
+						 * session will be sending out data rows.
+						 * Initialize the queue to let producer know we are
+						 * here and running.
+						 */
+						SQueueSync *sqsync = sq->sq_sync;
+
+						elog(DEBUG1, "SQueue %s, consumer node %d is same as "
+								"the parent node", sqname, nodeid);
+						LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
+									  LW_EXCLUSIVE);
+						/* Make sure no consumer bound to the queue already */
+						Assert(cstate->cs_pid == 0);
+						/* make sure the queue is ready to read */
+						Assert(cstate->cs_qlength > 0);
+						/* verify status */
+						if (cstate->cs_status == CONSUMER_ERROR ||
+								cstate->cs_status == CONSUMER_DONE)
+						{
+							int status = cstate->cs_status;
+							/*
+							 * Producer failed by the time the consumer connect.
+							 * Change status to "Done" to allow producer unbind
+							 * and report problem to the parent.
+							 */
+							cstate->cs_status = CONSUMER_DONE;
+							/* Producer may be waiting for status change */
+							SetLatch(&sqsync->sqs_producer_latch);
+							LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+							LWLockRelease(SQueuesLock);
+							ereport(ERROR,
+									(errcode(ERRCODE_PRODUCER_ERROR),
+									 errmsg("Producer failed while we were waiting - status was %d", status)));
+						}
+						/*
+						 * Any other status is acceptable. Normally it would be
+						 * ACTIVE. If producer have had only few rows to emit
+						 * and it is already done the status would be EOF.
+						 */
+
+						/* Set up the consumer */
+						cstate->cs_pid = MyProcPid;
+
+						elog(DEBUG1, "SQueue %s, consumer at %d, status %d - "
+								"setting up consumer node %d, pid %d",
+								sqname, i, cstate->cs_status, cstate->cs_node,
+								cstate->cs_pid);
+						/* return found index */
+						*myindex = i;
+						OwnLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+						LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+					}
+					else
+						elog(DEBUG1, "SQueue %s, consumer node %d is not same as "
+								"the parent node %d", sqname, nodeid,
+								PGXC_PARENT_NODE_ID);
+					break;
+				}
+			}
+			/* Check if entry was found and therefore loop was broken */
+			Assert(i < sq->sq_nconsumers);
+		}
+		/* Check the consumer is found */
+		Assert(*myindex != -1);
+		Assert(sq->sq_nconsumers == nconsumers);
+	}
+	LWLockRelease(SQueuesLock);
+	return sq;
+}
+
+
+/*
+ * Push data from the local tuplestore to the queue for specified consumer.
+ * Return true if succeeded and the tuplestore is now empty. Return false
+ * if specified queue has not enough room for the next tuple.
+ */
+static bool
+SharedQueueDump(SharedQueue squeue, int consumerIdx,
+				TupleTableSlot *tmpslot, Tuplestorestate *tuplestore)
+{
+	ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);
+
+	elog(DEBUG3, "Dumping SQueue %s data for consumer at %d, "
+			"producer - node %d, pid %d, "
+			"consumer - node %d, pid %d, status %d",
+			squeue->sq_key, consumerIdx,
+			squeue->sq_nodeid, squeue->sq_pid,
+			cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+
+	/* discard stored data if consumer is not active */
+	if (cstate->cs_status != CONSUMER_ACTIVE)
+	{
+		elog(DEBUG3, "Discarding SQueue %s data for consumer at %d not active",
+				squeue->sq_key, consumerIdx);
+		tuplestore_clear(tuplestore);
+		return true;
+	}
+
+	/*
+	 * Tuplestore does not clear eof flag on the active read pointer, causing
+	 * the store to remain in EOF state once reached when there is a single
+	 * read pointer. We do not want behavior like this and workaround by using
+	 * secondary read pointer. Primary read pointer (0) is active when we are
+	 * writing to the tuple store, also it is used to bookmark current position
+	 * when reading to be able to roll back and return just read tuple back to
+	 * the store if we failed to write it out to the queue.
+	 * Secondary read pointer is for reading, and its eof flag is cleared if a
+	 * tuple is written to the store.
+	 */
+	tuplestore_select_read_pointer(tuplestore, 1);
+
+	/* If we have something in the tuplestore try to push this to the queue */
+	while (!tuplestore_ateof(tuplestore))
+	{
+		/* save position */
+		tuplestore_copy_read_pointer(tuplestore, 1, 0);
+
+		/* Try to get next tuple to the temporary slot */
+		if (!tuplestore_gettupleslot(tuplestore, true, false, tmpslot))
+		{
+			/* false means the tuplestore in EOF state */
+			elog(DEBUG3, "Tuplestore for SQueue %s returned EOF",
+					squeue->sq_key);
+			break;
+		}
+#ifdef SQUEUE_STAT
+		cstate->stat_buff_reads++;
+#endif
+
+		/* The slot should contain a data row */
+		Assert(tmpslot->tts_datarow);
+
+		/* check if queue has enough room for the data */
+		if (QUEUE_FREE_SPACE(cstate) < sizeof(int) + tmpslot->tts_datarow->msglen)
+		{
+			/*
+			 * If stored tuple does not fit empty queue we are entering special
+			 * procedure of pushing it through.
+			 */
+			if (cstate->cs_ntuples <= 0)
+			{
+				/*
+				 * If pushing through is completed wake up and proceed to next
+				 * tuple, there could be enough space in the consumer queue to
+				 * fit more.
+				 */
+				bool done = sq_push_long_tuple(cstate, tmpslot->tts_datarow);
+
+				/*
+				 * sq_push_long_tuple writes some data anyway, so wake up
+				 * the consumer.
+				 */
+				SetLatch(&squeue->sq_sync->sqs_consumer_sync[consumerIdx].cs_latch);
+
+				if (done)
+					continue;
+			}
+
+			/* Restore read position to get same tuple next time */
+			tuplestore_copy_read_pointer(tuplestore, 0, 1);
+#ifdef SQUEUE_STAT
+			cstate->stat_buff_returns++;
+#endif
+
+			/* We might advance the mark, try to truncate */
+			tuplestore_trim(tuplestore);
+
+			/* Prepare for writing, set proper read pointer */
+			tuplestore_select_read_pointer(tuplestore, 0);
+
+			/* ... and exit */
+			return false;
+		}
+		else
+		{
+			/* Enqueue data */
+			QUEUE_WRITE(cstate, sizeof(int), (char *) &tmpslot->tts_datarow->msglen);
+			QUEUE_WRITE(cstate, tmpslot->tts_datarow->msglen, tmpslot->tts_datarow->msg);
+
+			/* Increment tuple counter. If it was 0 consumer may be waiting for
+			 * data so try to wake it up */
+			if ((cstate->cs_ntuples)++ == 0)
+				SetLatch(&squeue->sq_sync->sqs_consumer_sync[consumerIdx].cs_latch);
+		}
+	}
+
+	/* Remove rows we have just read */
+	tuplestore_trim(tuplestore);
+
+	/* prepare for writes, set read pointer 0 as active */
+	tuplestore_select_read_pointer(tuplestore, 0);
+
+	return true;
+}
+
+
+/*
+ * SharedQueueWrite
+ *	  Write data from the specified slot to the specified queue. If the
+ *	  tuplestore passed in has tuples try and write them first.
+ *	  If specified queue is full the tuple is put into the tuplestore which is
+ *	  created if necessary
+ */
+void
+SharedQueueWrite(SharedQueue squeue, int consumerIdx,
+				 TupleTableSlot *slot, Tuplestorestate **tuplestore,
+				 MemoryContext tmpcxt)
+{
+	ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);
+	SQueueSync *sqsync = squeue->sq_sync;
+	LWLockId	clwlock = sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock;
+	RemoteDataRow datarow;
+	bool		free_datarow;
+
+	Assert(cstate->cs_qlength > 0);
+
+	LWLockAcquire(clwlock, LW_EXCLUSIVE);
+
+#ifdef SQUEUE_STAT
+	cstate->stat_writes++;
+#endif
+
+	/*
+	 * If we have anything in the local storage try to dump this first,
+	 * but do not try to dump often to avoid overhead of creating temporary
+	 * tuple slot. It should be OK to dump if queue is half empty.
+	 */
+	if (*tuplestore)
+	{
+		bool		dumped = false;
+
+		if (QUEUE_FREE_SPACE(cstate) > cstate->cs_qlength / 2)
+		{
+			TupleTableSlot *tmpslot;
+
+			tmpslot = MakeSingleTupleTableSlot(slot->tts_tupleDescriptor);
+			dumped = SharedQueueDump(squeue, consumerIdx, tmpslot, *tuplestore);
+			ExecDropSingleTupleTableSlot(tmpslot);
+		}
+		if (!dumped)
+		{
+			/* No room to even dump local store, append the tuple to the store
+			 * and exit */
+#ifdef SQUEUE_STAT
+			cstate->stat_buff_writes++;
+#endif
+			LWLockRelease(clwlock);
+			tuplestore_puttupleslot(*tuplestore, slot);
+			return;
+		}
+	}
+
+	/* Get datarow from the tuple slot */
+	if (slot->tts_datarow)
+	{
+		/*
+		 * The function ExecCopySlotDatarow always make a copy, but here we
+		 * can optimize and avoid copying the data, so we just get the reference
+		 */
+		datarow = slot->tts_datarow;
+		free_datarow = false;
+	}
+	else
+	{
+		datarow = ExecCopySlotDatarow(slot, tmpcxt);
+		free_datarow = true;
+	}
+	if (QUEUE_FREE_SPACE(cstate) < sizeof(int) + datarow->msglen)
+	{
+		/* Not enough room, store tuple locally */
+		LWLockRelease(clwlock);
+
+		/* clean up */
+		if (free_datarow)
+			pfree(datarow);
+
+		/* Create tuplestore if does not exist */
+		if (*tuplestore == NULL)
+		{
+			int			ptrno;
+			char		storename[64];
+
+#ifdef SQUEUE_STAT
+			elog(DEBUG1, "Start buffering %s node %d, %d tuples in queue, %ld writes and %ld reads so far",
+					squeue->sq_key, cstate->cs_node, cstate->cs_ntuples, cstate->stat_writes, cstate->stat_reads);
+#endif
+			*tuplestore = tuplestore_begin_datarow(false, work_mem, tmpcxt);
+			/* We need it to be able to remember/restore the read position */
+			snprintf(storename, 64, "%s node %d", squeue->sq_key, cstate->cs_node);
+			tuplestore_collect_stat(*tuplestore, storename);
+			/*
+			 * Allocate a second read pointer to read from the store. We know
+			 * it must have index 1, so needn't store that.
+			 */
+			ptrno = tuplestore_alloc_read_pointer(*tuplestore, 0);
+			Assert(ptrno == 1);
+		}
+
+#ifdef SQUEUE_STAT
+		cstate->stat_buff_writes++;
+#endif
+		/* Append the slot to the store... */
+		tuplestore_puttupleslot(*tuplestore, slot);
+
+		/* ... and exit */
+		return;
+	}
+	else
+	{
+		/* do not supply data to closed consumer */
+		if (cstate->cs_status == CONSUMER_ACTIVE)
+		{
+			elog(DEBUG3, "SQueue %s, consumer is active, writing data",
+					squeue->sq_key);
+			/* write out the data */
+			QUEUE_WRITE(cstate, sizeof(int), (char *) &datarow->msglen);
+			QUEUE_WRITE(cstate, datarow->msglen, datarow->msg);
+			/* Increment tuple counter. If it was 0 consumer may be waiting for
+			 * data so try to wake it up */
+			if ((cstate->cs_ntuples)++ == 0)
+				SetLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
+		}
+		else
+			elog(DEBUG2, "SQueue %s, consumer is not active, no need to supply data",
+					squeue->sq_key);
+
+		/* clean up */
+		if (free_datarow)
+			pfree(datarow);
+	}
+	LWLockRelease(clwlock);
+}
+
+
+/*
+ * SharedQueueRead
+ * Read one data row from the specified queue into the provided tupleslot.
+ * Returns true if EOF is reached on the specified consumer queue.
+ * If the queue is empty, behavior is controlled by the canwait parameter.
+ * If canwait is true it is waiting while row is available or EOF or error is
+ * reported, if it is false, the slot is emptied and false is returned.
+ */
+bool
+SharedQueueRead(SharedQueue squeue, int consumerIdx,
+ TupleTableSlot *slot, bool canwait)
+{
+ ConsState *cstate = &(squeue->sq_consumers[consumerIdx]);
+ SQueueSync *sqsync = squeue->sq_sync;
+ RemoteDataRow datarow;
+ int datalen;
+
+ Assert(cstate->cs_qlength > 0);
+
+ LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE);
+
+ Assert(cstate->cs_status != CONSUMER_DONE);
+ while (cstate->cs_ntuples <= 0)
+ {
+ elog(DEBUG3, "SQueue %s, consumer node %d, pid %d, status %d - "
+ "no tuples in the queue", squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+
+ if (cstate->cs_status == CONSUMER_EOF)
+ {
+ elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, status %d - "
+ "EOF marked. Informing produer by setting CONSUMER_DONE",
+ squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+
+ /* Inform producer the consumer have done the job */
+ cstate->cs_status = CONSUMER_DONE;
+ /* no need to receive notifications */
+ DisownLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
+ /* producer done the job and no more rows expected, clean up */
+ LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+ ExecClearTuple(slot);
+ /*
+ * notify the producer, it may be waiting while consumers
+ * are finishing
+ */
+ SetLatch(&sqsync->sqs_producer_latch);
+ return true;
+ }
+ else if (cstate->cs_status == CONSUMER_ERROR)
+ {
+ elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, status %d - "
+ "CONSUMER_ERROR set",
+ squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+ /*
+ * There was a producer error while waiting.
+ * Release all the locks and report problem to the caller.
+ */
+ LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+ /*
+ * Reporting error will cause transaction rollback and clean up of
+ * all portals. We can not mark the portal so it does not access
+ * the queue so we should hold it for now. We should prevent queue
+ * unbound in between.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_PRODUCER_ERROR),
+ errmsg("Failed to read from SQueue %s, "
+ "consumer (node %d, pid %d, status %d) - "
+ "CONSUMER_ERROR set",
+ squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status)));
+ }
+ if (canwait)
+ {
+ /* Prepare waiting on empty buffer */
+ ResetLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
+ LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+
+ elog(DEBUG3, "SQueue %s, consumer (node %d, pid %d, status %d) - "
+ "no queued tuples to read, waiting "
+ "for producer to produce more data",
+ squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+
+ /* Wait for notification about available info */
- 10000L);
++ WaitLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch,
++ WL_LATCH_SET | WL_POSTMASTER_DEATH, -1,
++ WAIT_EVENT_MQ_INTERNAL);
+ /* got the notification, restore lock and try again */
+ LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE);
+ }
+ else
+ {
+ LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+
+ elog(DEBUG3, "SQueue %s, consumer (node %d, pid %d, status %d) - "
+ "no queued tuples to read, caller can't wait ",
+ squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+ ExecClearTuple(slot);
+ return false;
+ }
+ }
+
+ elog(DEBUG3, "SQueue %s, consumer (node %d, pid %d, status %d) - "
+ "%d queued tuples to read",
+ squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status,
+ cstate->cs_ntuples);
+
+ /* have at least one row, read it in and store to slot */
+ QUEUE_READ(cstate, sizeof(int), (char *) (&datalen));
+ datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + datalen);
+ datarow->msgnode = InvalidOid;
+ datarow->msglen = datalen;
+ if (datalen > cstate->cs_qlength - sizeof(int))
+ sq_pull_long_tuple(cstate, datarow,
+ &sqsync->sqs_consumer_sync[consumerIdx]);
+ else
+ QUEUE_READ(cstate, datalen, datarow->msg);
+ ExecStoreDataRowTuple(datarow, slot, true);
+ (cstate->cs_ntuples)--;
+#ifdef SQUEUE_STAT
+ cstate->stat_reads++;
+#endif
+ /* sanity check */
+ Assert((cstate->cs_ntuples == 0) == (cstate->cs_qreadpos == cstate->cs_qwritepos));
+ LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+ return false;
+}
+
+
+/*
+ * Mark specified consumer as closed discarding all input which may already be
+ * in the queue.
+ * If consumerIdx is -1 the producer is cleaned up. Producer need to wait for
+ * consumers before releasing the queue, so if there are yet active consumers,
+ * they are notified about the problem and they should disconnect from the
+ * queue as soon as possible.
+ */
+void
+SharedQueueReset(SharedQueue squeue, int consumerIdx)
+{
+ SQueueSync *sqsync = squeue->sq_sync;
+
+ /*
+ * We may have already cleaned up, but then an abort signalled us to clean up.
+ * Avoid segmentation fault on abort
+ */
+ if (!sqsync)
+ return;
+
+ if (consumerIdx == -1)
+ {
+ int i;
+
+ elog(DEBUG1, "SQueue %s, requested to reset producer node %d, pid %d - "
+ "Now also resetting all consumers",
+ squeue->sq_key, squeue->sq_nodeid, squeue->sq_pid);
+
+ /* check queue states */
+ for (i = 0; i < squeue->sq_nconsumers; i++)
+ {
+ ConsState *cstate = &squeue->sq_consumers[i];
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+
+ /*
+ * If producer being reset before it is reached the end of the
+ * result set, that means consumer probably would not get all
+ * the rows and it should report error if the consumer's parent ever
+ * try to read. No need to raise error if consumer is just closed.
+ * If consumer is done already we do not need to change the status.
+ */
+ if (cstate->cs_status != CONSUMER_EOF &&
+ cstate->cs_status != CONSUMER_DONE)
+ {
+ elog(DEBUG1, "SQueue %s, reset consumer at %d, "
+ "consumer node %d, pid %d, status %d - marking CONSUMER_ERROR",
+ squeue->sq_key, i, cstate->cs_node, cstate->cs_pid,
+ cstate->cs_status);
+
+ cstate->cs_status = CONSUMER_ERROR;
+ /* discard tuples which may already be in the queue */
+ cstate->cs_ntuples = 0;
+ /* keep consistent with cs_ntuples*/
+ cstate->cs_qreadpos = cstate->cs_qwritepos = 0;
+
+ /* wake up consumer if it is sleeping */
+ SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+ }
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ }
+ }
+ else
+ {
+ ConsState *cstate = &(squeue->sq_consumers[consumerIdx]);
+
+ elog(DEBUG1, "SQueue %s, requested to reset consumer at %d, "
+ "consumer node %d, pid %d, status %d",
+ squeue->sq_key, consumerIdx, cstate->cs_node, cstate->cs_pid,
+ cstate->cs_status);
+
+ LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock,
+ LW_EXCLUSIVE);
+
+ if (cstate->cs_status != CONSUMER_DONE)
+ {
+ elog(DEBUG1, "SQueue %s, consumer at %d, "
+ "consumer node %d, pid %d, status %d - marking CONSUMER_DONE",
+ squeue->sq_key, consumerIdx, cstate->cs_node, cstate->cs_pid,
+ cstate->cs_status);
+
+ /* Inform producer the consumer have done the job */
+ cstate->cs_status = CONSUMER_DONE;
+ /*
+ * No longer need to receive notifications. If consumer has not
+ * connected the latch is not owned
+ */
+ if (cstate->cs_pid > 0)
+ DisownLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
+ /*
+ * notify the producer, it may be waiting while consumers
+ * are finishing
+ */
+ SetLatch(&sqsync->sqs_producer_latch);
+ }
+
+ LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+ }
+}
+
+
+/*
+ * Assume that not yet connected consumers won't connect and reset them.
+ * That should allow to Finish/UnBind the queue gracefully and prevent
+ * producer hanging.
+ */
+void
+SharedQueueResetNotConnected(SharedQueue squeue)
+{
+ SQueueSync *sqsync = squeue->sq_sync;
+ int result = 0;
+ int i;
+
+ elog(DEBUG1, "SQueue %s, resetting all unconnected consumers",
+ squeue->sq_key);
+
+ /* check queue states */
+ for (i = 0; i < squeue->sq_nconsumers; i++)
+ {
+ ConsState *cstate = &squeue->sq_consumers[i];
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+
+ if (cstate->cs_pid == 0 &&
+ cstate->cs_status != CONSUMER_EOF &&
+ cstate->cs_status != CONSUMER_DONE)
+ {
+ result++;
+ elog(DEBUG1, "SQueue %s, consumer at %d, consumer node %d, pid %d, "
+ "status %d is cancelled - marking CONSUMER_ERROR", squeue->sq_key, i,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+ cstate->cs_status = CONSUMER_ERROR;
+ /* discard tuples which may already be in the queue */
+ cstate->cs_ntuples = 0;
+ /* keep consistent with cs_ntuples*/
+ cstate->cs_qreadpos = cstate->cs_qwritepos = 0;
+
+ /* wake up consumer if it is sleeping */
+ SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+ }
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ }
+}
+
+
+/*
+ * Determine if producer can safely pause work.
+ * The producer can pause if all consumers have enough data to read while
+ * producer is sleeping.
+ * Obvoius case when the producer can not pause if at least one queue is empty.
+ */
+bool
+SharedQueueCanPause(SharedQueue squeue)
+{
+ SQueueSync *sqsync = squeue->sq_sync;
+ bool result = true;
+ int usedspace;
+ int ncons;
+ int i;
+
+ usedspace = 0;
+ ncons = 0;
+ for (i = 0; result && (i < squeue->sq_nconsumers); i++)
+ {
+ ConsState *cstate = &(squeue->sq_consumers[i]);
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_SHARED);
+ /*
+ * Count only consumers that may be blocked.
+ * If producer has finished scanning and pushing local buffers some
+ * consumers may be finished already.
+ */
+ if (cstate->cs_status == CONSUMER_ACTIVE)
+ {
+ /* can not pause if some queue is empty */
+ result = (cstate->cs_ntuples > 0);
+ usedspace += (cstate->cs_qwritepos > cstate->cs_qreadpos ?
+ cstate->cs_qwritepos - cstate->cs_qreadpos :
+ cstate->cs_qlength + cstate->cs_qwritepos
+ - cstate->cs_qreadpos);
+ ncons++;
+ }
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ }
+
+ if (!ncons)
+ return false;
+
+ /*
+ * Pause only if average consumer queue is full more then on half.
+ */
+ if (result)
+ result = (usedspace / ncons > squeue->sq_consumers[0].cs_qlength / 2);
+#ifdef SQUEUE_STAT
+ if (result)
+ squeue->stat_paused++;
+#endif
+ return result;
+}
+
+
+int
+SharedQueueFinish(SharedQueue squeue, TupleDesc tupDesc,
+ Tuplestorestate **tuplestore)
+{
+ SQueueSync *sqsync = squeue->sq_sync;
+ TupleTableSlot *tmpslot = NULL;
+ int i;
+ int nstores = 0;
+
+ elog(DEBUG1, "SQueue %s, finishing the SQueue - producer node %d, "
+ "pid %d, nconsumers %d", squeue->sq_key, squeue->sq_nodeid,
+ squeue->sq_pid, squeue->sq_nconsumers);
+
+ for (i = 0; i < squeue->sq_nconsumers; i++)
+ {
+ ConsState *cstate = &squeue->sq_consumers[i];
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+#ifdef SQUEUE_STAT
+ if (!squeue->stat_finish)
+ elog(DEBUG1, "Finishing %s node %d, %ld writes and %ld reads so far, %ld buffer writes, %ld buffer reads, %ld tuples returned to buffer",
+ squeue->sq_key, cstate->cs_node, cstate->stat_writes, cstate->stat_reads, cstate->stat_buff_writes, cstate->stat_buff_reads, cstate->stat_buff_returns);
+#endif
+ elog(DEBUG1, "SQueue %s finishing, consumer at %d, consumer node %d, pid %d, "
+ "status %d", squeue->sq_key, i,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+ /*
+ * if the tuplestore has data and consumer queue has space for some
+ * try to push rows to the queue. We do not want to do that often
+ * to avoid overhead of temp tuple slot allocation.
+ */
+ if (tuplestore[i])
+ {
+ /* If the consumer is not reading just destroy the tuplestore */
+ if (cstate->cs_status != CONSUMER_ACTIVE)
+ {
+ tuplestore_end(tuplestore[i]);
+ tuplestore[i] = NULL;
+ }
+ else
+ {
+ nstores++;
+ /*
+ * Attempt to dump tuples from the store require tuple slot
+ * allocation, that is not a cheap operation, so proceed if
+ * target queue has enough space.
+ */
+ if (QUEUE_FREE_SPACE(cstate) > cstate->cs_qlength / 2)
+ {
+ if (tmpslot == NULL)
+ tmpslot = MakeSingleTupleTableSlot(tupDesc);
+ if (SharedQueueDump(squeue, i, tmpslot, tuplestore[i]))
+ {
+ tuplestore_end(tuplestore[i]);
+ tuplestore[i] = NULL;
+ cstate->cs_status = CONSUMER_EOF;
+ nstores--;
+ }
+ /* Consumer may be sleeping, wake it up */
+ SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+ }
+ }
+ }
+ else
+ {
+ /* it set eof if not yet set */
+ if (cstate->cs_status == CONSUMER_ACTIVE)
+ {
+ cstate->cs_status = CONSUMER_EOF;
+ SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+ }
+ }
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ }
+ if (tmpslot)
+ ExecDropSingleTupleTableSlot(tmpslot);
+
+#ifdef SQUEUE_STAT
+ squeue->stat_finish = true;
+#endif
+
+ return nstores;
+}
+
+
+/*
+ * SharedQueueUnBind
+ * Cancel binding of current process to the shared queue. If the process
+ * was a producer it should pass in the array of tuplestores where tuples were
+ * queueed when it was unsafe to block. If any of the tuplestores holds data
+ * rows they are written to the queue. The length of the array of the
+ * tuplestores should be the same as the count of consumers. It is OK if some
+ * entries are NULL. When a consumer unbinds from the shared queue it should
+ * set the tuplestore parameter to NULL.
+ */
+void
+SharedQueueUnBind(SharedQueue squeue, bool failed)
+{
+ SQueueSync *sqsync = squeue->sq_sync;
+ int wait_result = 0;
+ int i = 0;
+ int consumer_running = 0;
+
+ elog(DEBUG1, "SQueue %s, unbinding the SQueue (failed: %c) - producer node %d, "
+ "pid %d, nconsumers %d", squeue->sq_key, failed ? 'T' : 'F',
+ squeue->sq_nodeid, squeue->sq_pid, squeue->sq_nconsumers);
+
+CHECK:
+
+ /* loop while there are active consumers */
+ for (;;)
+ {
+ int i;
+ int c_count = 0;
+ int unbound_count = 0;
+
+ /* check queue states */
+ for (i = 0; i < squeue->sq_nconsumers; i++)
+ {
+ ConsState *cstate = &squeue->sq_consumers[i];
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+
+ elog(DEBUG1, "SQueue %s unbinding, check consumer at %d, consumer node %d, pid %d, "
+ "status %d", squeue->sq_key, i,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+
+ /* is consumer working yet ? */
+ if (cstate->cs_status == CONSUMER_ACTIVE && failed)
+ {
+ elog(DEBUG1, "SQueue %s, consumer status CONSUMER_ACTIVE, but "
+ "the operation has failed - marking CONSUMER_ERROR",
+ squeue->sq_key);
+
+ cstate->cs_status = CONSUMER_ERROR;
+ }
+
+ if (cstate->cs_status != CONSUMER_DONE)
+ {
+ elog(DEBUG1, "SQueue %s, consumer not yet done, wake it up and "
+ "wait for it to finish reading", squeue->sq_key);
+ c_count++;
+ /* Wake up consumer if it is sleeping */
+ SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+ /* producer will continue waiting */
+ ResetLatch(&sqsync->sqs_producer_latch);
+
+ if (cstate->cs_pid == 0)
+ unbound_count++;
+ }
+
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ }
+ if (c_count == 0)
+ break;
+ elog(DEBUG1, "SQueue %s, wait while %d consumers finish, %d consumers"
+ "not yet bound", squeue->sq_key, c_count, unbound_count);
+ /* wait for a notification */
+ wait_result = WaitLatch(&sqsync->sqs_producer_latch,
+ WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_TIMEOUT,
- WaitLatch(&sync->cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1);
++ 10000L, WAIT_EVENT_MQ_INTERNAL);
+ if (wait_result & WL_TIMEOUT)
+ {
+ elog(WARNING, "SQueue %s, timeout while waiting for Consumers "
+ "finishing", squeue->sq_key);
+ break;
+ }
+ /* got notification, continue loop */
+ }
+#ifdef SQUEUE_STAT
+ elog(DEBUG1, "Producer %s is done, there were %ld pauses", squeue->sq_key, squeue->stat_paused);
+#endif
+ elog(DEBUG1, "SQueue %s, producer node %d, pid %d - unbound successfully",
+ squeue->sq_key, squeue->sq_nodeid, squeue->sq_pid);
+
+ LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
+
+ /*
+ * In rear situation, after consumers just bind to the shared queue, the producer timeout and remove the shared queue.
+ * This will cause a SEGV in the consumer. So here recheck if there are some consumers binded to the queue, if so, we need to wait them to
+ * finish.
+ */
+ consumer_running = 0;
+ for (i = 0; i < squeue->sq_nconsumers; i++)
+ {
+ ConsState *cstate = &squeue->sq_consumers[i];
+
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+
+ /* found a consumer running */
+ if (CONSUMER_ACTIVE == cstate->cs_status && cstate->cs_pid != 0)
+ {
+ elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, status %d, "
+ "started running after we finished unbind", squeue->sq_key,
+ cstate->cs_node, cstate->cs_pid, cstate->cs_status);
+ consumer_running++;
+ }
+
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ }
+
+ if (consumer_running)
+ {
+ elog(DEBUG1, "SQueue %s have %d consumers started running after we "
+ "unbound, recheck now", squeue->sq_key, consumer_running);
+ LWLockRelease(SQueuesLock);
+ goto CHECK;
+ }
+
+ /* All is done, clean up */
+ DisownLatch(&sqsync->sqs_producer_latch);
+
+ if (--squeue->sq_refcnt == 0)
+ {
+ /* Now it is OK to remove hash table entry */
+ squeue->sq_sync = NULL;
+ sqsync->queue = NULL;
+ if (hash_search(SharedQueues, squeue->sq_key, HASH_REMOVE, NULL) != squeue)
+ elog(PANIC, "Shared queue data corruption");
+ }
+
+ LWLockRelease(SQueuesLock);
+}
+
+
+/*
+ * If queue with specified name still exists set mark respective consumer as
+ * "Done". Due to executor optimization consumer may never connect the queue,
+ * and should allow producer to finish it up if it is known the consumer will
+ * never connect.
+ */
+void
+SharedQueueRelease(const char *sqname)
+{
+ bool found;
+ volatile SharedQueue sq;
+
+ LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
+
+ sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_FIND, &found);
+ if (found)
+ {
+ volatile SQueueSync *sqsync = sq->sq_sync;
+ int i;
+
+ Assert(sqsync && sqsync->queue == sq);
+
+ elog(DEBUG1, "SQueue %s producer node %d, pid %d - requested to release",
+ sqname, sq->sq_nodeid, sq->sq_pid);
+
+ /*
+ * If the SharedQ is not bound, we can't just remove it because
+ * somebody might have just created a fresh entry and is going to bind
+ * to it soon. We assume that the future producer will eventually
+ * release the SharedQ
+ */
+ if (sq->sq_nodeid == -1)
+ {
+ elog(DEBUG1, "SQueue %s, producer not bound ", sqname);
+ goto done;
+ }
+
+ /*
+ * Do not bother releasing producer, all necessary work will be
+ * done upon UnBind.
+ */
+ if (sq->sq_nodeid != PGXC_PARENT_NODE_ID)
+ {
+ elog(DEBUG1, "SQueue %s, we are consumer from node %d", sqname,
+ PGXC_PARENT_NODE_ID);
+ /* find specified node in the consumer lists */
+ for (i = 0; i < sq->sq_nconsumers; i++)
+ {
+ ConsState *cstate = &(sq->sq_consumers[i]);
+ if (cstate->cs_node == PGXC_PARENT_NODE_ID)
+ {
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
+ LW_EXCLUSIVE);
+ elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, "
+ "status %d", sq->sq_key, cstate->cs_node,
+ cstate->cs_pid, cstate->cs_status);
+
+ /*
+ * If the consumer pid is not set, we are looking at a race
+ * condition where the old producer (which supplied the
+ * tuples to this remote datanode) may have finished and
+ * marked all consumers as CONSUMER_EOF, the consumers
+ * themeselves consumed all the tuples and marked
+ * themselves as CONSUMER_DONE. The old producer in that
+ * case may have actually removed the SharedQ from shared
+ * memory. But if a new execution for this same portal
+ * comes before the consumer sends a "Close Portal" message
+ * (which subsequently calls this function), we may end up
+ * corrupting state for the upcoming consumer for this new
+ * execution of the portal.
+ *
+ * It seems best to just ignore the release call in such
+ * cases.
+ */
+ if (cstate->cs_pid == 0)
+ {
+ elog(DEBUG1, "SQueue %s, consumer node %d, already released",
+ sq->sq_key, cstate->cs_node);
+ }
+ else if (cstate->cs_status != CONSUMER_DONE)
+ {
+ /* Inform producer the consumer have done the job */
+ cstate->cs_status = CONSUMER_DONE;
+ /* no need to receive notifications */
+ if (cstate->cs_pid > 0)
+ {
+ DisownLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+ cstate->cs_pid = 0;
+ }
+ /*
+ * notify the producer, it may be waiting while
+ * consumers are finishing
+ */
+ SetLatch(&sqsync->sqs_producer_latch);
+ elog(DEBUG1, "SQueue %s, release consumer at %d, node "
+ "%d, pid %d, status %d ", sqname, i,
+ cstate->cs_node, cstate->cs_pid,
+ cstate->cs_status);
+ }
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ /* exit */
+ goto done;
+ }
+ }
+
+ elog(DEBUG1, "SQueue %s, consumer from node %d never bound",
+ sqname, PGXC_PARENT_NODE_ID);
+ /*
+ * The consumer was never bound. Find empty consumer slot and
+ * register node here to let producer know that the node will never
+ * be consuming.
+ */
+ for (i = 0; i < sq->sq_nconsumers; i++)
+ {
+ ConsState *cstate = &(sq->sq_consumers[i]);
+ if (cstate->cs_node == -1)
+ {
+ LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
+ LW_EXCLUSIVE);
+ /* Inform producer the consumer have done the job */
+ cstate->cs_status = CONSUMER_DONE;
+ SetLatch(&sqsync->sqs_producer_latch);
+ elog(DEBUG1, "SQueue %s, consumer at %d marking as "
+ "CONSUMER_DONE", sqname, i);
+ LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+ }
+ }
+ }
+ }
+done:
+ /*
+ * If we are the last holder of the SQueue, remove it from the hash table
+ * to avoid any leak
+ */
+ if (sq && --sq->sq_refcnt == 0)
+ {
+ /* Now it is OK to remove hash table entry */
+ sq->sq_sync->queue = NULL;
+ sq->sq_sync = NULL;
+ if (hash_search(SharedQueues, sq->sq_key, HASH_REMOVE, NULL) != sq)
+ elog(PANIC, "Shared queue data corruption");
+ }
+ LWLockRelease(SQueuesLock);
+}
+
+
+/*
+ * Called when the backend is ending.
+ */
+void
+SharedQueuesCleanup(int code, Datum arg)
+{
+ /* Need to be able to look into catalogs */
+ CurrentResourceOwner = ResourceOwnerCreate(NULL, "SharedQueuesCleanup");
+
+ /*
+ * Release all registered prepared statements.
+ * If a shared queue name is associated with the statement this queue will
+ * be released.
+ */
+ DropAllPreparedStatements();
+
+ /* Release everything */
+ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, true, true);
+ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_LOCKS, true, true);
+ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_AFTER_LOCKS, true, true);
+ CurrentResourceOwner = NULL;
+}
+
+
+/*
+ * sq_push_long_tuple
+ * Routine to push through the consumer state tuple longer the the consumer
+ * queue. Long tuple is written by a producer partially, and only when the
+ * consumer queue is empty.
+ * The consumer can determine that the tuple being read is long if the length
+ * of the tuple which is read before data is exceeding queue length.
+ * Consumers is switching to the long tuple mode and read in the portion of
+ * data which is already in the queue. After reading in each portion of data
+ * consumer sets cs_ntuples to LONG_TUPLE to indicate it is in long tuple
+ * mode, and writes out number of already read bytes to the beginning of the
+ * queue.
+ * While Consumer is reading in tuple data Producer may work on other task:
+ * execute query and send tuples to other Customers. If Producer sees the
+ * LONG_TUPLE indicator it may write out next portion. The tuple remains
+ * current in the tuplestore, and Producer just needs to read offset from
+ * the buffer to know what part of data to write next.
+ * After tuple is completely written the Producer is advancing to next tuple
+ * and continue operation in normal mode.
+ */
+static bool
+sq_push_long_tuple(ConsState *cstate, RemoteDataRow datarow)
+{
+ if (cstate->cs_ntuples == 0)
+ {
+ /* the tuple is too big to fit the queue, start pushing it through */
+ int len;
+ /*
+ * Output actual message size, to prepare consumer:
+ * allocate memory and set up transmission.
+ */
+ QUEUE_WRITE(cstate, sizeof(int), (char *) &datarow->msglen);
+ /* Output as much as possible */
+ len = cstate->cs_qlength - sizeof(int);
+ Assert(datarow->msglen > len);
+ QUEUE_WRITE(cstate, len, datarow->msg);
+ cstate->cs_ntuples = 1;
+ return false;
+ }
+ else
+ {
+ int offset;
+ int len;
+
+ /* Continue pushing through long tuple */
+ Assert(cstate->cs_ntuples == LONG_TUPLE);
+ /*
+ * Consumer outputs number of bytes already read at the beginning of
+ * the queue.
+ */
+ memcpy(&offset, cstate->cs_qstart, sizeof(int));
+
+ Assert(offset > 0 && offset < datarow->msglen);
+
+ /* remaining data */
+ len = datarow->msglen - offset;
+ /*
+ * We are sending remaining lengs just for sanity check at the consumer
+ * side
+ */
+ QUEUE_WRITE(cstate, sizeof(int), (char *) &len);
+ if (len > cstate->cs_qlength - sizeof(int))
+ {
+ /* does not fit yet */
+ len = cstate->cs_qlength - sizeof(int);
+ QUEUE_WRITE(cstate, len, datarow->msg + offset);
+ cstate->cs_ntuples = 1;
+ return false;
+ }
+ else
+ {
+ /* now we are done */
+ QUEUE_WRITE(cstate, len, datarow->msg + offset);
+ cstate->cs_ntuples = 1;
+ return true;
+ }
+ }
+}
+
+
+/*
+ * sq_pull_long_tuple
+ * Read in from the queue data of a long tuple which does not the queue.
+ * See sq_push_long_tuple for more details
+ */
+static void
+sq_pull_long_tuple(ConsState *cstate, RemoteDataRow datarow,
+ ConsumerSync *sync)
+{
+ int offset = 0;
+ int len = datarow->msglen;
+
+ for (;;)
+ {
+ /* determine how many bytes to read */
+ if (len > cstate->cs_qlength - sizeof(int))
+ len = cstate->cs_qlength - sizeof(int);
+
+ /* read data */
+ QUEUE_READ(cstate, len, datarow->msg + offset);
+
+ /* remember how many we read already */
+ offset += len;
+
+ /* check if we are done */
+ if (offset == datarow->msglen)
+ return;
+
+ /* need more, set up queue to accept data from the producer */
+ Assert(cstate->cs_ntuples == 1); /* allow exactly one incomplete tuple */
+ cstate->cs_ntuples = LONG_TUPLE; /* long tuple mode marker */
+ /* Inform producer how many bytes we have already */
+ memcpy(cstate->cs_qstart, &offset, sizeof(int));
+ /* Release locks and wait until producer supply more data */
+ while (cstate->cs_ntuples == LONG_TUPLE)
+ {
+ /* prepare wait */
+ ResetLatch(&sync->cs_latch);
+ LWLockRelease(sync->cs_lwlock);
+ /* Wait for notification about available info */
++ WaitLatch(&sync->cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1,
++ WAIT_EVENT_MQ_INTERNAL);
+ /* got the notification, restore lock and try again */
+ LWLockAcquire(sync->cs_lwlock, LW_EXCLUSIVE);
+ }
+ /* Read length of remaining data */
+ QUEUE_READ(cstate, sizeof(int), (char *) &len);
+
+ /* Make sure we are doing the same tuple */
+ Assert(offset + len == datarow->msglen);
+
+ /* next iteration */
+ }
+}
* there is a window (caused by pgstat delay) on which a worker may choose a
* table that was already vacuumed; this is a bug in the current design.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
heap_endscan(relScan);
heap_close(classRel, AccessShareLock);
+#ifdef XCP
+ /*
+ * Coordinator needs to access Datanodes to process distributed table.
+ */
+ if (IS_PGXC_COORDINATOR)
+ {
+ InitMultinodeExecutor(false);
+ }
+#endif
+
+ /*
+ * Recheck orphan temporary tables, and if they still seem orphaned, drop
+ * them. We'll eat a transaction per dropped table, which might seem
+ * excessive, but we should only need to do anything as a result of a
+ * previous backend crash, so this should not happen often enough to
+ * justify "optimizing". Using separate transactions ensures that we
+ * don't bloat the lock table if there are many temp tables to be dropped,
+ * and it ensures that we don't lose work if a deletion attempt fails.
+ */
+ foreach(cell, orphan_oids)
+ {
+ Oid relid = lfirst_oid(cell);
+ Form_pg_class classForm;
+ int backendID;
+ ObjectAddress object;
+
+ /*
+ * Check for user-requested abort.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Try to lock the table. If we can't get the lock immediately,
+ * somebody else is using (or dropping) the table, so it's not our
+ * concern anymore. Having the lock prevents race conditions below.
+ */
+ if (!ConditionalLockRelationOid(relid, AccessExclusiveLock))
+ continue;
+
+ /*
+ * Re-fetch the pg_class tuple and re-check whether it still seems to
+ * be an orphaned temp table. If it's not there or no longer the same
+ * relation, ignore it.
+ */
+ tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
+ if (!HeapTupleIsValid(tuple))
+ {
+ /* be sure to drop useless lock so we don't bloat lock table */
+ UnlockRelationOid(relid, AccessExclusiveLock);
+ continue;
+ }
+ classForm = (Form_pg_class) GETSTRUCT(tuple);
+
+ /*
+ * Make all the same tests made in the loop above. In event of OID
+ * counter wraparound, the pg_class entry we have now might be
+ * completely unrelated to the one we saw before.
+ */
+ if (!((classForm->relkind == RELKIND_RELATION ||
+ classForm->relkind == RELKIND_MATVIEW) &&
+ classForm->relpersistence == RELPERSISTENCE_TEMP))
+ {
+ UnlockRelationOid(relid, AccessExclusiveLock);
+ continue;
+ }
+ backendID = GetTempNamespaceBackendId(classForm->relnamespace);
+ if (!(backendID != InvalidBackendId &&
+ (backendID == MyBackendId ||
+ BackendIdGetProc(backendID) == NULL)))
+ {
+ UnlockRelationOid(relid, AccessExclusiveLock);
+ continue;
+ }
+
+ /* OK, let's delete it */
+ ereport(LOG,
+ (errmsg("autovacuum: dropping orphan temp table \"%s.%s.%s\"",
+ get_database_name(MyDatabaseId),
+ get_namespace_name(classForm->relnamespace),
+ NameStr(classForm->relname))));
+
+ object.classId = RelationRelationId;
+ object.objectId = relid;
+ object.objectSubId = 0;
+ performDeletion(&object, DROP_CASCADE,
+ PERFORM_DELETION_INTERNAL |
+ PERFORM_DELETION_QUIETLY |
+ PERFORM_DELETION_SKIP_EXTENSIONS);
+
+ /*
+ * To commit the deletion, end current transaction and start a new
+ * one. Note this also releases the lock we took.
+ */
+ CommitTransactionCommand();
+ StartTransactionCommand();
+
+ /* StartTransactionCommand changed current memory context */
+ MemoryContextSwitchTo(AutovacMemCxt);
+ }
+
/*
* Create a buffer access strategy object for VACUUM to use. We want to
* use the same one across all the vacuum operations we perform, since the
--- /dev/null
- (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L));
+/*-------------------------------------------------------------------------
+ *
+ * clustermon.c
+ *
+ * Postgres-XL Cluster Monitor
+ *
+ * Portions Copyright (c) 2015, 2ndQuadrant Ltd
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/postmaster/clustermon.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "access/gtm.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "gtm/gtm_c.h"
+#include "gtm/gtm_gxid.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgxc/pgxc.h"
+#include "postmaster/clustermon.h"
+#include "postmaster/fork_process.h"
+#include "postmaster/postmaster.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/spin.h"
+#include "tcop/tcopprot.h"
+#include "utils/memutils.h"
+#include "utils/ps_status.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
++#include "pgstat.h"
+
+/* Flags to tell if we are in a clustermon process */
+static bool am_clustermon = false;
+
+/* Flags set by signal handlers */
+static volatile sig_atomic_t got_SIGHUP = false;
+static volatile sig_atomic_t got_SIGTERM = false;
+
+/* Memory context for long-lived data */
+static MemoryContext ClusterMonitorMemCxt;
+static ClusterMonitorCtlData *ClusterMonitorCtl = NULL;
+
+static void cm_sighup_handler(SIGNAL_ARGS);
+static void cm_sigterm_handler(SIGNAL_ARGS);
+static void ClusterMonitorSetReportedGlobalXmin(GlobalTransactionId xmin);
+static void ClusterMonitorSetReportingGlobalXmin(GlobalTransactionId xmin);
+
+/* PID of clustser monitoring process */
+int ClusterMonitorPid = 0;
+
+#define CLUSTER_MONITOR_NAPTIME 5
+
+/*
+ * Main loop for the cluster monitor process.
+ */
+int
+ClusterMonitorInit(void)
+{
+ sigjmp_buf local_sigjmp_buf;
+ GTM_PGXCNodeType nodetype = IS_PGXC_DATANODE ?
+ GTM_NODE_DATANODE :
+ GTM_NODE_COORDINATOR;
+ GlobalTransactionId oldestXmin;
+ GlobalTransactionId newOldestXmin;
+ GlobalTransactionId lastGlobalXmin;
+ GlobalTransactionId latestCompletedXid;
+ int status;
+
+ am_clustermon = true;
+
+ /* Identify myself via ps */
+ init_ps_display("cluster monitor process", "", "", "");
+
+ ereport(LOG,
+ (errmsg("cluster monitor started")));
+
+ if (PostAuthDelay)
+ pg_usleep(PostAuthDelay * 1000000L);
+
+ /*
+ * Set up signal handlers. We operate on databases much like a regular
+ * backend, so we use the same signal handling. See equivalent code in
+ * tcop/postgres.c.
+ */
+ pqsignal(SIGHUP, cm_sighup_handler);
+ pqsignal(SIGINT, StatementCancelHandler);
+ pqsignal(SIGTERM, cm_sigterm_handler);
+
+ pqsignal(SIGQUIT, quickdie);
+ InitializeTimeouts(); /* establishes SIGALRM handler */
+
+ pqsignal(SIGPIPE, SIG_IGN);
+ pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+ pqsignal(SIGFPE, FloatExceptionHandler);
+ pqsignal(SIGCHLD, SIG_DFL);
+
+ /*
+ * Create a memory context that we will do all our work in. We do this so
+ * that we can reset the context during error recovery and thereby avoid
+ * possible memory leaks.
+ */
+ ClusterMonitorMemCxt = AllocSetContextCreate(TopMemoryContext,
+ "Cluster Monitor",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ MemoryContextSwitchTo(ClusterMonitorMemCxt);
+
+ SetProcessingMode(NormalProcessing);
+
+ if (RegisterGTM(nodetype) < 0)
+ {
+ UnregisterGTM(nodetype);
+ if (RegisterGTM(nodetype) < 0)
+ {
+ ereport(LOG,
+ (errcode(ERRCODE_IO_ERROR),
+ errmsg("Can not register node on GTM")));
+ }
+ }
+
+ /*
+ * If an exception is encountered, processing resumes here.
+ *
+ * This code is a stripped down version of PostgresMain error recovery.
+ */
+ if (sigsetjmp(local_sigjmp_buf, 1) != 0)
+ {
+ /* since not using PG_TRY, must reset error stack by hand */
+ error_context_stack = NULL;
+
+ /* Prevents interrupts while cleaning up */
+ HOLD_INTERRUPTS();
+
+ /* Forget any pending QueryCancel or timeout request */
+ disable_all_timeouts(false);
+ QueryCancelPending = false; /* second to avoid race condition */
+
+ /* Report the error to the server log */
+ EmitErrorReport();
+
+ /*
+ * Now return to normal top-level context and clear ErrorContext for
+ * next time.
+ */
+ MemoryContextSwitchTo(ClusterMonitorMemCxt);
+ FlushErrorState();
+
+ /* Flush any leaked data in the top-level context */
+ MemoryContextResetAndDeleteChildren(ClusterMonitorMemCxt);
+
+ /* Now we can allow interrupts again */
+ RESUME_INTERRUPTS();
+
+ /* if in shutdown mode, no need for anything further; just go away */
+ if (got_SIGTERM)
+ goto shutdown;
+
+ /*
+ * Sleep at least 1 second after any error. We don't want to be
+ * filling the error logs as fast as we can.
+ */
+ pg_usleep(1000000L);
+ }
+
+ /* We can now handle ereport(ERROR) */
+ PG_exception_stack = &local_sigjmp_buf;
+
+ /* must unblock signals before calling rebuild_database_list */
+ PG_SETMASK(&UnBlockSig);
+
+ /*
+ * Force statement_timeout and lock_timeout to zero to avoid letting these
+ * settings prevent regular maintenance from being executed.
+ */
+ SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
+ SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
+
+ /* loop until shutdown request */
+ while (!got_SIGTERM)
+ {
+ struct timeval nap;
+ int rc;
+
+ /*
+ * Repeat at CLUSTER_MONITOR_NAPTIME seconds interval
+ */
+ nap.tv_sec = CLUSTER_MONITOR_NAPTIME;
+ nap.tv_usec = 0;
+
+ /*
+ * Wait until naptime expires or we get some type of signal (all the
+ * signal handlers will wake us by calling SetLatch).
+ */
+ rc = WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
- oldestXmin = GetOldestXminInternal(NULL, false, true, lastGlobalXmin);
++ (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
++ WAIT_EVENT_CLUSTER_MONITOR_MAIN);
+
+ ResetLatch(MyLatch);
+
+ /* Process sinval catchup interrupts that happened while sleeping */
+ ProcessCatchupInterrupt();
+
+ /*
+ * Emergency bailout if postmaster has died. This is to avoid the
+ * necessity for manual cleanup of all postmaster children.
+ */
+ if (rc & WL_POSTMASTER_DEATH)
+ proc_exit(1);
+
+ /* the normal shutdown case */
+ if (got_SIGTERM)
+ break;
+
+ if (got_SIGHUP)
+ {
+ got_SIGHUP = false;
+ ProcessConfigFile(PGC_SIGHUP);
+ }
+
+ /*
+ * Compute RecentGlobalXmin, report it to the GTM and sleep for the set
+ * interval. Keep doing this forever
+ */
+ lastGlobalXmin = ClusterMonitorGetGlobalXmin();
+ LWLockAcquire(ClusterMonitorLock, LW_EXCLUSIVE);
++ oldestXmin = GetOldestXminInternal(NULL, 0, true, lastGlobalXmin);
+ ClusterMonitorSetReportingGlobalXmin(oldestXmin);
+ LWLockRelease(ClusterMonitorLock);
+
+ if ((status = ReportGlobalXmin(oldestXmin, &newOldestXmin,
+ &latestCompletedXid)))
+ {
+ elog(DEBUG1, "Failed (status %d) to report RecentGlobalXmin "
+ "- reported RecentGlobalXmin %d, received "
+ "RecentGlobalXmin %d, " "received latestCompletedXid %d",
+ status, oldestXmin, newOldestXmin,
+ latestCompletedXid);
+ if (status == GTM_ERRCODE_TOO_OLD_XMIN ||
+ status == GTM_ERRCODE_NODE_EXCLUDED)
+ {
+ /*
+ * If we haven't seen a new transaction for a very long time or
+ * were disconnected for a while or excluded from the xmin
+ * computation for any reason, our xmin calculation could be
+ * well in the past, especially because its capped by the
+ * latestCompletedXid which may not advance on an idle server.
+ * In such cases, use the value of latestCompletedXid as
+ * returned by GTM and then recompute local xmin.
+ *
+ * If the GTM's global xmin advances even further while we are
+ * ready with a new xmin, just repeat the entire exercise as
+ * long as GTM keeps returning us a more current value of
+ * latestCompletedXid and thus pushing forward our local xmin
+ * calculation
+ */
+ if (GlobalTransactionIdIsValid(latestCompletedXid) &&
+ TransactionIdPrecedes(oldestXmin, latestCompletedXid))
+ {
+ SetLatestCompletedXid(latestCompletedXid);
+ continue;
+ }
+ }
+ }
+ else
+ {
+ elog(DEBUG1, "Successfully reported xmin to GTM - reported_xmin %d,"
+ "received RecentGlobalXmin %d, "
+ "received latestCompletedXid %d", oldestXmin,
+ newOldestXmin, latestCompletedXid);
+
+ SetLatestCompletedXid(latestCompletedXid);
+ ClusterMonitorSetReportedGlobalXmin(oldestXmin);
+ if (GlobalTransactionIdIsValid(newOldestXmin))
+ ClusterMonitorSetGlobalXmin(newOldestXmin);
+ }
+
+ ClusterMonitorSetReportingGlobalXmin(InvalidGlobalTransactionId);
+
+ }
+
+ /* Normal exit from the cluster monitor is here */
+shutdown:
+ UnregisterGTM(nodetype);
+ ereport(LOG,
+ (errmsg("cluster monitor shutting down")));
+
+ proc_exit(0); /* done */
+}
+
+/* SIGHUP: set flag to re-read config file at next convenient time */
+static void
+cm_sighup_handler(SIGNAL_ARGS)
+{
+ int save_errno = errno;
+
+ got_SIGHUP = true;
+ SetLatch(MyLatch);
+
+ errno = save_errno;
+}
+
+/* SIGTERM: time to die */
+static void
+cm_sigterm_handler(SIGNAL_ARGS)
+{
+ int save_errno = errno;
+
+ got_SIGTERM = true;
+ SetLatch(MyLatch);
+
+ errno = save_errno;
+}
+
+
+/*
+ * IsClusterMonitorProcess
+ * Return whether the current process is the cluster monitor
+ * process.
+ */
+bool
+IsClusterMonitorProcess(void)
+{
+ return am_clustermon;
+}
+
+/* Report shared-memory space needed by ClusterMonitor */
+Size
+ClusterMonitorShmemSize(void)
+{
+ return sizeof (ClusterMonitorCtlData);
+}
+
+void
+ClusterMonitorShmemInit(void)
+{
+ bool found;
+
+ ClusterMonitorCtl = (ClusterMonitorCtlData *)
+ ShmemInitStruct("Cluster Monitor Ctl", ClusterMonitorShmemSize(), &found);
+
+ if (!found)
+ {
+ /* First time through, so initialize */
+ MemSet(ClusterMonitorCtl, 0, ClusterMonitorShmemSize());
+ SpinLockInit(&ClusterMonitorCtl->mutex);
+ }
+}
+
+GlobalTransactionId
+ClusterMonitorGetGlobalXmin(void)
+{
+ GlobalTransactionId xmin;
+
+ SpinLockAcquire(&ClusterMonitorCtl->mutex);
+ xmin = ClusterMonitorCtl->gtm_recent_global_xmin;
+ SpinLockRelease(&ClusterMonitorCtl->mutex);
+
+ return xmin;
+}
+
+void
+ClusterMonitorSetGlobalXmin(GlobalTransactionId xmin)
+{
+ /*
+ * First extend the commit logs. Even though we may not have actually
+ * started any transactions in the new range, we must still extend the logs
+ * so that later operations which rely on the RecentGlobalXmin to truncate
+ * the logs work correctly.
+ */
+ ExtendLogs(xmin);
+
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * Do a consistency check to ensure that we NEVER have running transactions
+ * with xmin less than what the GTM has already computed. While during
+ * normal execution this should never happen, if we have ever been excluded
+ * from the xmin calculation by the GTM while we are still running old
+ * transactions, PANIC is our best bet to avoid corruption
+ */
+ ProcArrayCheckXminConsistency(xmin);
+
+ SpinLockAcquire(&ClusterMonitorCtl->mutex);
+ ClusterMonitorCtl->gtm_recent_global_xmin = xmin;
+ SpinLockRelease(&ClusterMonitorCtl->mutex);
+
+ LWLockRelease(ProcArrayLock);
+}
+
+static void
+ClusterMonitorSetReportedGlobalXmin(GlobalTransactionId xmin)
+{
+ elog(DEBUG2, "ClusterMonitorSetReportedGlobalXmin - old %d, new %d",
+ ClusterMonitorCtl->reported_recent_global_xmin,
+ xmin);
+ SpinLockAcquire(&ClusterMonitorCtl->mutex);
+ ClusterMonitorCtl->reported_recent_global_xmin = xmin;
+ SpinLockRelease(&ClusterMonitorCtl->mutex);
+}
+
+static void
+ClusterMonitorSetReportingGlobalXmin(GlobalTransactionId xmin)
+{
+ elog(DEBUG2, "ClusterMonitorSetReportingGlobalXmin - old %d, new %d",
+ ClusterMonitorCtl->reporting_recent_global_xmin,
+ xmin);
+ SpinLockAcquire(&ClusterMonitorCtl->mutex);
+ ClusterMonitorCtl->reporting_recent_global_xmin = xmin;
+ SpinLockRelease(&ClusterMonitorCtl->mutex);
+}
+
+GlobalTransactionId
+ClusterMonitorGetReportingGlobalXmin(void)
+{
+ GlobalTransactionId reporting_xmin;
+
+ SpinLockAcquire(&ClusterMonitorCtl->mutex);
+ reporting_xmin = ClusterMonitorCtl->reporting_recent_global_xmin;
+ SpinLockRelease(&ClusterMonitorCtl->mutex);
+
+ return reporting_xmin;
+}
* - Add a pgstat config column to pg_database, so this
* entire thing can be enabled/disabled on a per db basis.
*
- * Copyright (c) 2001-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Copyright (c) 2001-2017, PostgreSQL Global Development Group
*
* src/backend/postmaster/pgstat.c
* ----------
return event_name;
}
+ /* ----------
+ * pgstat_get_wait_activity() -
+ *
+ * Convert WaitEventActivity to string.
+ * ----------
+ */
+ static const char *
+ pgstat_get_wait_activity(WaitEventActivity w)
+ {
+ const char *event_name = "unknown wait event";
+
+ switch (w)
+ {
+ case WAIT_EVENT_ARCHIVER_MAIN:
+ event_name = "ArchiverMain";
+ break;
+ case WAIT_EVENT_AUTOVACUUM_MAIN:
+ event_name = "AutoVacuumMain";
+ break;
+ case WAIT_EVENT_BGWRITER_HIBERNATE:
+ event_name = "BgWriterHibernate";
+ break;
+ case WAIT_EVENT_BGWRITER_MAIN:
+ event_name = "BgWriterMain";
+ break;
+ case WAIT_EVENT_CHECKPOINTER_MAIN:
+ event_name = "CheckpointerMain";
+ break;
+ case WAIT_EVENT_PGSTAT_MAIN:
+ event_name = "PgStatMain";
+ break;
+ case WAIT_EVENT_RECOVERY_WAL_ALL:
+ event_name = "RecoveryWalAll";
+ break;
+ case WAIT_EVENT_RECOVERY_WAL_STREAM:
+ event_name = "RecoveryWalStream";
+ break;
+ case WAIT_EVENT_SYSLOGGER_MAIN:
+ event_name = "SysLoggerMain";
+ break;
+ case WAIT_EVENT_WAL_RECEIVER_MAIN:
+ event_name = "WalReceiverMain";
+ break;
+ case WAIT_EVENT_WAL_SENDER_MAIN:
+ event_name = "WalSenderMain";
+ break;
+ case WAIT_EVENT_WAL_WRITER_MAIN:
+ event_name = "WalWriterMain";
+ break;
+ case WAIT_EVENT_LOGICAL_LAUNCHER_MAIN:
+ event_name = "LogicalLauncherMain";
+ break;
+ case WAIT_EVENT_LOGICAL_APPLY_MAIN:
+ event_name = "LogicalApplyMain";
+ break;
++ case WAIT_EVENT_CLUSTER_MONITOR_MAIN:
++ event_name = "ClusterMonitorMain";
++ break;
+ /* no default case, so that compiler will warn */
+ }
+
+ return event_name;
+ }
+
+ /* ----------
+ * pgstat_get_wait_client() -
+ *
+ * Convert WaitEventClient to string.
+ * ----------
+ */
+ static const char *
+ pgstat_get_wait_client(WaitEventClient w)
+ {
+ const char *event_name = "unknown wait event";
+
+ switch (w)
+ {
+ case WAIT_EVENT_CLIENT_READ:
+ event_name = "ClientRead";
+ break;
+ case WAIT_EVENT_CLIENT_WRITE:
+ event_name = "ClientWrite";
+ break;
+ case WAIT_EVENT_SSL_OPEN_SERVER:
+ event_name = "SSLOpenServer";
+ break;
+ case WAIT_EVENT_WAL_RECEIVER_WAIT_START:
+ event_name = "WalReceiverWaitStart";
+ break;
+ case WAIT_EVENT_LIBPQWALRECEIVER:
+ event_name = "LibPQWalReceiver";
+ break;
+ case WAIT_EVENT_WAL_SENDER_WAIT_WAL:
+ event_name = "WalSenderWaitForWAL";
+ break;
+ case WAIT_EVENT_WAL_SENDER_WRITE_DATA:
+ event_name = "WalSenderWriteData";
+ break;
+ /* no default case, so that compiler will warn */
+ }
+
+ return event_name;
+ }
+
+ /* ----------
+ * pgstat_get_wait_ipc() -
+ *
+ * Convert WaitEventIPC to string.
+ * ----------
+ */
+ static const char *
+ pgstat_get_wait_ipc(WaitEventIPC w)
+ {
+ const char *event_name = "unknown wait event";
+
+ switch (w)
+ {
+ case WAIT_EVENT_BGWORKER_SHUTDOWN:
+ event_name = "BgWorkerShutdown";
+ break;
+ case WAIT_EVENT_BGWORKER_STARTUP:
+ event_name = "BgWorkerStartup";
+ break;
+ case WAIT_EVENT_BTREE_PAGE:
+ event_name = "BtreePage";
+ break;
+ case WAIT_EVENT_EXECUTE_GATHER:
+ event_name = "ExecuteGather";
+ break;
+ case WAIT_EVENT_MQ_INTERNAL:
+ event_name = "MessageQueueInternal";
+ break;
+ case WAIT_EVENT_MQ_PUT_MESSAGE:
+ event_name = "MessageQueuePutMessage";
+ break;
+ case WAIT_EVENT_MQ_RECEIVE:
+ event_name = "MessageQueueReceive";
+ break;
+ case WAIT_EVENT_MQ_SEND:
+ event_name = "MessageQueueSend";
+ break;
+ case WAIT_EVENT_PARALLEL_FINISH:
+ event_name = "ParallelFinish";
+ break;
+ case WAIT_EVENT_PARALLEL_BITMAP_SCAN:
+ event_name = "ParallelBitmapScan";
+ break;
+ case WAIT_EVENT_PROCARRAY_GROUP_UPDATE:
+ event_name = "ProcArrayGroupUpdate";
+ break;
+ case WAIT_EVENT_SAFE_SNAPSHOT:
+ event_name = "SafeSnapshot";
+ break;
+ case WAIT_EVENT_SYNC_REP:
+ event_name = "SyncRep";
+ break;
+ case WAIT_EVENT_LOGICAL_SYNC_DATA:
+ event_name = "LogicalSyncData";
+ break;
+ case WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE:
+ event_name = "LogicalSyncStateChange";
+ break;
+ /* no default case, so that compiler will warn */
+ }
+
+ return event_name;
+ }
+
+ /* ----------
+ * pgstat_get_wait_timeout() -
+ *
+ * Convert WaitEventTimeout to string.
+ * ----------
+ */
+ static const char *
+ pgstat_get_wait_timeout(WaitEventTimeout w)
+ {
+ const char *event_name = "unknown wait event";
+
+ switch (w)
+ {
+ case WAIT_EVENT_BASE_BACKUP_THROTTLE:
+ event_name = "BaseBackupThrottle";
+ break;
+ case WAIT_EVENT_PG_SLEEP:
+ event_name = "PgSleep";
+ break;
+ case WAIT_EVENT_RECOVERY_APPLY_DELAY:
+ event_name = "RecoveryApplyDelay";
+ break;
+ /* no default case, so that compiler will warn */
+ }
+
+ return event_name;
+ }
+
+ /* ----------
+ * pgstat_get_wait_io() -
+ *
+ * Convert WaitEventIO to string.
+ * ----------
+ */
+ static const char *
+ pgstat_get_wait_io(WaitEventIO w)
+ {
+ const char *event_name = "unknown wait event";
+
+ switch (w)
+ {
+ case WAIT_EVENT_BUFFILE_READ:
+ event_name = "BufFileRead";
+ break;
+ case WAIT_EVENT_BUFFILE_WRITE:
+ event_name = "BufFileWrite";
+ break;
+ case WAIT_EVENT_CONTROL_FILE_READ:
+ event_name = "ControlFileRead";
+ break;
+ case WAIT_EVENT_CONTROL_FILE_SYNC:
+ event_name = "ControlFileSync";
+ break;
+ case WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE:
+ event_name = "ControlFileSyncUpdate";
+ break;
+ case WAIT_EVENT_CONTROL_FILE_WRITE:
+ event_name = "ControlFileWrite";
+ break;
+ case WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE:
+ event_name = "ControlFileWriteUpdate";
+ break;
+ case WAIT_EVENT_COPY_FILE_READ:
+ event_name = "CopyFileRead";
+ break;
+ case WAIT_EVENT_COPY_FILE_WRITE:
+ event_name = "CopyFileWrite";
+ break;
+ case WAIT_EVENT_DATA_FILE_EXTEND:
+ event_name = "DataFileExtend";
+ break;
+ case WAIT_EVENT_DATA_FILE_FLUSH:
+ event_name = "DataFileFlush";
+ break;
+ case WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC:
+ event_name = "DataFileImmediateSync";
+ break;
+ case WAIT_EVENT_DATA_FILE_PREFETCH:
+ event_name = "DataFilePrefetch";
+ break;
+ case WAIT_EVENT_DATA_FILE_READ:
+ event_name = "DataFileRead";
+ break;
+ case WAIT_EVENT_DATA_FILE_SYNC:
+ event_name = "DataFileSync";
+ break;
+ case WAIT_EVENT_DATA_FILE_TRUNCATE:
+ event_name = "DataFileTruncate";
+ break;
+ case WAIT_EVENT_DATA_FILE_WRITE:
+ event_name = "DataFileWrite";
+ break;
+ case WAIT_EVENT_DSM_FILL_ZERO_WRITE:
+ event_name = "DSMFillZeroWrite";
+ break;
+ case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ:
+ event_name = "LockFileAddToDataDirRead";
+ break;
+ case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_SYNC:
+ event_name = "LockFileAddToDataDirSync";
+ break;
+ case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE:
+ event_name = "LockFileAddToDataDirWrite";
+ break;
+ case WAIT_EVENT_LOCK_FILE_CREATE_READ:
+ event_name = "LockFileCreateRead";
+ break;
+ case WAIT_EVENT_LOCK_FILE_CREATE_SYNC:
+ event_name = "LockFileCreateSync";
+ break;
+ case WAIT_EVENT_LOCK_FILE_CREATE_WRITE:
+ event_name = "LockFileCreateWRITE";
+ break;
+ case WAIT_EVENT_LOCK_FILE_RECHECKDATADIR_READ:
+ event_name = "LockFileReCheckDataDirRead";
+ break;
+ case WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC:
+ event_name = "LogicalRewriteCheckpointSync";
+ break;
+ case WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC:
+ event_name = "LogicalRewriteMappingSync";
+ break;
+ case WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE:
+ event_name = "LogicalRewriteMappingWrite";
+ break;
+ case WAIT_EVENT_LOGICAL_REWRITE_SYNC:
+ event_name = "LogicalRewriteSync";
+ break;
+ case WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE:
+ event_name = "LogicalRewriteTruncate";
+ break;
+ case WAIT_EVENT_LOGICAL_REWRITE_WRITE:
+ event_name = "LogicalRewriteWrite";
+ break;
+ case WAIT_EVENT_RELATION_MAP_READ:
+ event_name = "RelationMapRead";
+ break;
+ case WAIT_EVENT_RELATION_MAP_SYNC:
+ event_name = "RelationMapSync";
+ break;
+ case WAIT_EVENT_RELATION_MAP_WRITE:
+ event_name = "RelationMapWrite";
+ break;
+ case WAIT_EVENT_REORDER_BUFFER_READ:
+ event_name = "ReorderBufferRead";
+ break;
+ case WAIT_EVENT_REORDER_BUFFER_WRITE:
+ event_name = "ReorderBufferWrite";
+ break;
+ case WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ:
+ event_name = "ReorderLogicalMappingRead";
+ break;
+ case WAIT_EVENT_REPLICATION_SLOT_READ:
+ event_name = "ReplicationSlotRead";
+ break;
+ case WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC:
+ event_name = "ReplicationSlotRestoreSync";
+ break;
+ case WAIT_EVENT_REPLICATION_SLOT_SYNC:
+ event_name = "ReplicationSlotSync";
+ break;
+ case WAIT_EVENT_REPLICATION_SLOT_WRITE:
+ event_name = "ReplicationSlotWrite";
+ break;
+ case WAIT_EVENT_SLRU_FLUSH_SYNC:
+ event_name = "SLRUFlushSync";
+ break;
+ case WAIT_EVENT_SLRU_READ:
+ event_name = "SLRURead";
+ break;
+ case WAIT_EVENT_SLRU_SYNC:
+ event_name = "SLRUSync";
+ break;
+ case WAIT_EVENT_SLRU_WRITE:
+ event_name = "SLRUWrite";
+ break;
+ case WAIT_EVENT_SNAPBUILD_READ:
+ event_name = "SnapbuildRead";
+ break;
+ case WAIT_EVENT_SNAPBUILD_SYNC:
+ event_name = "SnapbuildSync";
+ break;
+ case WAIT_EVENT_SNAPBUILD_WRITE:
+ event_name = "SnapbuildWrite";
+ break;
+ case WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC:
+ event_name = "TimelineHistoryFileSync";
+ break;
+ case WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE:
+ event_name = "TimelineHistoryFileWrite";
+ break;
+ case WAIT_EVENT_TIMELINE_HISTORY_READ:
+ event_name = "TimelineHistoryRead";
+ break;
+ case WAIT_EVENT_TIMELINE_HISTORY_SYNC:
+ event_name = "TimelineHistorySync";
+ break;
+ case WAIT_EVENT_TIMELINE_HISTORY_WRITE:
+ event_name = "TimelineHistoryWrite";
+ break;
+ case WAIT_EVENT_TWOPHASE_FILE_READ:
+ event_name = "TwophaseFileRead";
+ break;
+ case WAIT_EVENT_TWOPHASE_FILE_SYNC:
+ event_name = "TwophaseFileSync";
+ break;
+ case WAIT_EVENT_TWOPHASE_FILE_WRITE:
+ event_name = "TwophaseFileWrite";
+ break;
+ case WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ:
+ event_name = "WALSenderTimelineHistoryRead";
+ break;
+ case WAIT_EVENT_WAL_BOOTSTRAP_SYNC:
+ event_name = "WALBootstrapSync";
+ break;
+ case WAIT_EVENT_WAL_BOOTSTRAP_WRITE:
+ event_name = "WALBootstrapWrite";
+ break;
+ case WAIT_EVENT_WAL_COPY_READ:
+ event_name = "WALCopyRead";
+ break;
+ case WAIT_EVENT_WAL_COPY_SYNC:
+ event_name = "WALCopySync";
+ break;
+ case WAIT_EVENT_WAL_COPY_WRITE:
+ event_name = "WALCopyWrite";
+ break;
+ case WAIT_EVENT_WAL_INIT_SYNC:
+ event_name = "WALInitSync";
+ break;
+ case WAIT_EVENT_WAL_INIT_WRITE:
+ event_name = "WALInitWrite";
+ break;
+ case WAIT_EVENT_WAL_READ:
+ event_name = "WALRead";
+ break;
+ case WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN:
+ event_name = "WALSyncMethodAssign";
+ break;
+ case WAIT_EVENT_WAL_WRITE:
+ event_name = "WALWrite";
+ break;
+
+ /* no default case, so that compiler will warn */
+ }
+
+ return event_name;
+ }
+
+
/* ----------
* pgstat_get_backend_current_activity() -
*
* clients.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
#include "utils/dynamic_loader.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
+#ifdef PGXC
+#include "utils/resowner.h"
+#endif
#include "utils/timeout.h"
+ #include "utils/varlena.h"
#ifdef EXEC_BACKEND
#include "storage/spin.h"
StartupStatus = STARTUP_RUNNING;
pmState = PM_STARTUP;
+#ifdef PGXC /* PGXC_COORD */
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+
+ /*
+ * Initialize the Data Node connection pool
+ */
+ PgPoolerPID = StartPoolManager();
+
+ MemoryContextSwitchTo(oldcontext);
+#endif /* PGXC */
+
/* Some workers may be scheduled to start now */
- maybe_start_bgworker();
+ maybe_start_bgworkers();
status = ServerLoop();
}
/* If we have lost the stats collector, try to start a new one */
- if (PgStatPID == 0 && pmState == PM_RUN)
+ if (PgStatPID == 0 &&
+ (pmState == PM_RUN || pmState == PM_HOT_STANDBY))
PgStatPID = pgstat_start();
+#ifdef PGXC
+ /* If we have lost the pooler, try to start a new one */
+ if (PgPoolerPID == 0 && pmState == PM_RUN)
+ PgPoolerPID = StartPoolManager();
+#endif /* PGXC */
+
+#ifdef XCP
+ /* If we have lost the cluster monitor, try to start a new one */
+ if (ClusterMonPID == 0 && pmState == PM_RUN)
+ ClusterMonPID = StartClusterMonitor();
+#endif
+
/* If we have lost the archiver, try to start a new one. */
if (PgArchPID == 0 && PgArchStartupAllowed())
PgArchPID = pgarch_start();
PgArchPID = pgarch_start();
if (PgStatPID == 0)
PgStatPID = pgstat_start();
+#ifdef PGXC
+ if (PgPoolerPID == 0)
+ PgPoolerPID = StartPoolManager();
+#endif /* PGXC */
+
+#ifdef XCP
+ if (ClusterMonPID == 0)
+ ClusterMonPID = StartClusterMonitor();
+#endif
/* workers may be scheduled to start now */
- maybe_start_bgworker();
+ maybe_start_bgworkers();
/* at this point we are really open for business */
ereport(LOG,
#include "postgres.h"
#include "access/sysattr.h"
+ #include "catalog/dependency.h"
+#include "catalog/namespace.h"
#include "catalog/pg_type.h"
#include "commands/trigger.h"
#include "foreign/fdwapi.h"
const char *attrname;
TargetEntry *tle;
+#ifdef PGXC
+ List *var_list = NIL;
+ ListCell *elt;
+
+ /*
+ * In Postgres-XC, we need to evaluate quals of the parse tree and determine
+ * if they are Coordinator quals. If they are, their attributes need to be
+ * added to the target list for evaluation. In case some are found, add them as
+ * junks in the target list. The junk status will be used by remote UPDATE
+ * planning to associate correct element to a clause.
+ * For DELETE, having such columns in target list helps to evaluate Quals
+ * correctly on Coordinator.
+ * PGXCTODO: This list could be reduced to keep only in target list the
+ * vars using Coordinator Quals.
+ */
+ if (IS_PGXC_COORDINATOR && parsetree->jointree)
+ var_list = pull_qual_vars((Node *) parsetree->jointree, parsetree->resultRelation);
+
+ foreach(elt, var_list)
+ {
+ Form_pg_attribute att_tup;
+ int numattrs = RelationGetNumberOfAttributes(target_relation);
+
+ var = (Var *) lfirst(elt);
+ /* Bypass in case of extra target items like ctid */
+ if (var->varattno < 1 || var->varattno > numattrs)
+ continue;
+
+
+ att_tup = target_relation->rd_att->attrs[var->varattno - 1];
+ tle = makeTargetEntry((Expr *) var,
+ list_length(parsetree->targetList) + 1,
+ pstrdup(NameStr(att_tup->attname)),
+ true);
+
+ parsetree->targetList = lappend(parsetree->targetList, tle);
+ }
+#endif
+
+#ifdef PGXC
+ /*
+ * If relation is non-replicated, we need also to identify the Datanode
+ * from where tuple is fetched.
+ */
+ if (IS_PGXC_COORDINATOR &&
+ !IsConnFromCoord() &&
+ !IsLocatorReplicated(GetRelationLocType(RelationGetRelid(target_relation))) &&
+ (target_relation->rd_rel->relkind == RELKIND_RELATION ||
+ target_relation->rd_rel->relkind == RELKIND_MATVIEW))
+ {
+ var = makeVar(parsetree->resultRelation,
+ XC_NodeIdAttributeNumber,
+ INT4OID,
+ -1,
+ InvalidOid,
+ 0);
+
+ attrname = "xc_node_id";
+
+ tle = makeTargetEntry((Expr *) var,
+ list_length(parsetree->targetList) + 1,
+ pstrdup(attrname),
+ true);
+
+ parsetree->targetList = lappend(parsetree->targetList, tle);
+ }
+#endif
+
if (target_relation->rd_rel->relkind == RELKIND_RELATION ||
- target_relation->rd_rel->relkind == RELKIND_MATVIEW)
+ target_relation->rd_rel->relkind == RELKIND_MATVIEW ||
+ target_relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
{
/*
* Emit CTID so that executor can find the row to update or delete.
return results;
}
- ProcessUtility(cparsetree->utilityStmt, cquery.data, PROCESS_UTILITY_QUERY,
- NULL, NULL, false, NULL);
+
+#ifdef PGXC
+/*
+ * Rewrite the CREATE TABLE AS and SELECT INTO queries as a
+ * INSERT INTO .. SELECT query. The target table must be created first using
+ * utility command processing. This takes care of creating the target table on
+ * all the Coordinators and the Datanodes.
+ */
+List *
+QueryRewriteCTAS(Query *parsetree)
+{
+ RangeVar *relation;
+ CreateStmt *create_stmt;
++ PlannedStmt *wrapper;
+ List *tableElts = NIL;
+ StringInfoData cquery;
+ ListCell *col;
+ Query *cparsetree;
+ List *raw_parsetree_list, *tlist;
+ char *selectstr;
+ CreateTableAsStmt *stmt;
+ IntoClause *into;
+ ListCell *lc;
+
+ if (parsetree->commandType != CMD_UTILITY ||
+ !IsA(parsetree->utilityStmt, CreateTableAsStmt))
+ elog(ERROR, "Unexpected commandType or intoClause is not set properly");
+
+ /* Get the target table */
+ stmt = (CreateTableAsStmt *) parsetree->utilityStmt;
+
+ if (stmt->relkind == OBJECT_MATVIEW)
+ return list_make1(parsetree);
+
+ relation = stmt->into->rel;
+
+ if (stmt->if_not_exists)
+ {
+ Oid nspid;
+
+ nspid = RangeVarGetCreationNamespace(stmt->into->rel);
+
+ if (get_relname_relid(stmt->into->rel->relname, nspid))
+ {
+ ereport(NOTICE,
+ (errcode(ERRCODE_DUPLICATE_TABLE),
+ errmsg("relation \"%s\" already exists, skipping",
+ stmt->into->rel->relname)));
+ return NIL;
+ }
+ }
+
+ /* Start building a CreateStmt for creating the target table */
+ create_stmt = makeNode(CreateStmt);
+ create_stmt->relation = relation;
+ create_stmt->islocal = stmt->islocal;
+ create_stmt->if_not_exists = stmt->if_not_exists;
+ into = stmt->into;
+
+ /* Obtain the target list of new table */
+ Assert(IsA(stmt->query, Query));
+ cparsetree = (Query *) stmt->query;
+ tlist = cparsetree->targetList;
+
+ /*
+ * Based on the targetList, populate the column information for the target
+ * table. If a column name list was specified in CREATE TABLE AS, override
+ * the column names derived from the query. (Too few column names are OK, too
+ * many are not.).
+ */
+ lc = list_head(into->colNames);
+ foreach(col, tlist)
+ {
+ TargetEntry *tle = (TargetEntry *)lfirst(col);
+ ColumnDef *coldef;
+ TypeName *typename;
+
+ /* Ignore junk columns from the targetlist */
+ if (tle->resjunk)
+ continue;
+
+ coldef = makeNode(ColumnDef);
+ typename = makeNode(TypeName);
+
+ /* Take the column name specified if any */
+ if (lc)
+ {
+ coldef->colname = strVal(lfirst(lc));
+ lc = lnext(lc);
+ }
+ else
+ coldef->colname = pstrdup(tle->resname);
+
+ coldef->inhcount = 0;
+ coldef->is_local = true;
+ coldef->is_not_null = false;
+ coldef->raw_default = NULL;
+ coldef->cooked_default = NULL;
+ coldef->constraints = NIL;
+
+ /*
+ * Set typeOid and typemod. The name of the type is derived while
+ * generating query
+ */
+ typename->typeOid = exprType((Node *)tle->expr);
+ typename->typemod = exprTypmod((Node *)tle->expr);
+
+ coldef->typeName = typename;
+
+ tableElts = lappend(tableElts, coldef);
+ }
+
+ if (lc != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("CREATE TABLE AS specifies too many column names")));
+
+ /*
+ * Set column information and the distribution mechanism (which will be
+ * NULL for SELECT INTO and the default mechanism will be picked)
+ */
+ create_stmt->tableElts = tableElts;
+ create_stmt->distributeby = stmt->into->distributeby;
+ create_stmt->subcluster = stmt->into->subcluster;
+
+ create_stmt->tablespacename = stmt->into->tableSpaceName;
+ create_stmt->oncommit = stmt->into->onCommit;
+ create_stmt->options = stmt->into->options;
+
+ /*
+ * Check consistency of arguments
+ */
+ if (create_stmt->oncommit != ONCOMMIT_NOOP
+ && create_stmt->relation->relpersistence != RELPERSISTENCE_TEMP)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+ errmsg("ON COMMIT can only be used on temporary tables")));
+
+ /* Get a copy of the parsetree which we can freely modify */
+ cparsetree = copyObject(parsetree);
+
+ /*
+ * Now build a utility statement in order to run the CREATE TABLE DDL on
+ * the local and remote nodes. We keep others fields as it is since they
+ * are ignored anyways by deparse_query.
+ */
+ cparsetree->commandType = CMD_UTILITY;
+ cparsetree->utilityStmt = (Node *) create_stmt;
+
+ initStringInfo(&cquery);
+ deparse_query(cparsetree, &cquery, NIL, false, false);
+
++
++ /* finally, wrap it in a dummy PlannedStmt */
++ wrapper = makeNode(PlannedStmt);
++ wrapper->commandType = CMD_UTILITY;
++ wrapper->canSetTag = false;
++ wrapper->utilityStmt = (Node *) create_stmt;
++ wrapper->stmt_location = -1;
++ wrapper->stmt_len = -1;
++
+ /* Finally, fire off the query to run the DDL */
- NULL, 0);
++ ProcessUtility(wrapper, cquery.data, PROCESS_UTILITY_QUERY,
++ NULL, NULL, NULL, false, NULL);
+
+ /*
+ * Now fold the CTAS statement into an INSERT INTO statement. The
+ * utility is no more required.
+ */
+ parsetree->utilityStmt = NULL;
+
+ /* Get the SELECT query string */
+ initStringInfo(&cquery);
+ deparse_query((Query *)stmt->query, &cquery, NIL, false, false);
+ selectstr = pstrdup(cquery.data);
+
+ /* Now, finally build the INSERT INTO statement */
+ initStringInfo(&cquery);
+
+ appendStringInfo(&cquery, "INSERT INTO %s.%s",
+ quote_identifier(get_namespace_name(RangeVarGetCreationNamespace(relation))),
+ quote_identifier(relation->relname));
+
+ appendStringInfo(&cquery, " %s %s", selectstr,
+ into->skipData ? "LIMIT 0" : "");
+
+ raw_parsetree_list = pg_parse_query(cquery.data);
+ return pg_analyze_and_rewrite(linitial(raw_parsetree_list), cquery.data,
++ NULL, 0, NULL);
+}
+#endif
* bufmgr.c
* buffer manager interface routines
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* ipci.c
* POSTGRES inter-process communication initialization code.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "storage/procsignal.h"
#include "storage/sinvaladt.h"
#include "storage/spin.h"
+#ifdef XCP
+#include "pgxc/pgxc.h"
+#include "pgxc/squeue.h"
+#include "pgxc/pause.h"
+#endif
+ #include "utils/backend_random.h"
#include "utils/snapmgr.h"
-
shmem_startup_hook_type shmem_startup_hook = NULL;
static Size total_addin_request = 0;
size = add_size(size, ReplicationOriginShmemSize());
size = add_size(size, WalSndShmemSize());
size = add_size(size, WalRcvShmemSize());
+#ifdef XCP
+ if (IS_PGXC_DATANODE)
+ size = add_size(size, SharedQueueShmemSize());
+ if (IS_PGXC_COORDINATOR)
+ size = add_size(size, ClusterLockShmemSize());
+ size = add_size(size, ClusterMonitorShmemSize());
+#endif
+ size = add_size(size, ApplyLauncherShmemSize());
size = add_size(size, SnapMgrShmemSize());
size = add_size(size, BTreeShmemSize());
size = add_size(size, SyncScanShmemSize());
size = add_size(size, AsyncShmemSize());
+#ifdef PGXC
+ size = add_size(size, NodeTablesShmemSize());
+#endif
+
+ size = add_size(size, BackendRandomShmemSize());
#ifdef EXEC_BACKEND
size = add_size(size, ShmemBackendArraySize());
#endif
ReplicationOriginShmemInit();
WalSndShmemInit();
WalRcvShmemInit();
+ ApplyLauncherShmemInit();
+#ifdef XCP
+ /*
+ * Set up distributed executor's shared queues
+ */
+ if (IS_PGXC_DATANODE)
+ SharedQueuesInit();
+ if (IS_PGXC_COORDINATOR)
+ ClusterLockShmemInit();
+ ClusterMonitorShmemInit();
+#endif
+
/*
* Set up other modules that need some shared memory space
*/
BTreeShmemInit();
SyncScanShmemInit();
AsyncShmemInit();
+ BackendRandomShmemInit();
+#ifdef PGXC
+ NodeTablesShmemInit();
+#endif
+
+
#ifdef EXEC_BACKEND
/*
* happen, it would tie up KnownAssignedXids indefinitely, so we protect
* ourselves by pruning the array when a valid list of running XIDs arrives.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
+#include "postmaster/clustermon.h"
+ #include "pgstat.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/spin.h"
* GetOldestXmin() move backwards, with no consequences for data integrity.
*/
TransactionId
- GetOldestXmin(Relation rel, bool ignoreVacuum)
+ GetOldestXmin(Relation rel, int flags)
+{
- return GetOldestXminInternal(rel, ignoreVacuum, false,
++ return GetOldestXminInternal(rel, flags, false,
+ InvalidTransactionId);
+}
+
+/*
+ * This implements most of the logic that GetOldestXmin needs. In XL, we don't
+ * actually compute OldestXmin unless specifically told to do so by the
+ * computeLocal argument being set to true, which GetOldestXmin never does.
+ * So we just return the
+ * value from the shared memory. The OldestXmin itself is always computed by
+ * the Cluster Monitor process by sending local state information to the GTM,
+ * which then aggregates information from all the nodes and gives out final
+ * OldestXmin or GlobalXmin which is consistent across the entire cluster.
+ *
+ * In addition, Cluster Monitor also passes the last reported xmin (or the one
+ * sent back by GTM in case we were idle) and the last received GlobalXmin. We
+ * must ensure that we don't see an XID or xmin which is beyond these horizons.
+ * Otherwise it signals problems with the GlobalXmin calculation. This can
+ * happen because of network disconnects or extreme load on the machine
+ * (unlikely). In any case, we must restart ourselves to avoid any data
+ * consistency problem. A more careful approach could involve killing only
+ * those backends which are running with old xid or xmin. We can consider
+ * implementing it that way in future
+ */
+TransactionId
- GetOldestXminInternal(Relation rel, bool ignoreVacuum, bool computeLocal,
++GetOldestXminInternal(Relation rel, int flags, bool computeLocal,
+ TransactionId lastGlobalXmin)
{
ProcArrayStruct *arrayP = procArray;
TransactionId result;
* Routines for interprocess signalling
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* lock.c
* POSTGRES primary lock mechanism
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* appropriate value for a free lock. The meaning of the variable is up to
* the caller, the lightweight lock code just assigns and compares it.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
#include "storage/ipc.h"
#include "storage/predicate.h"
#include "storage/proc.h"
+ #include "storage/proclist.h"
#include "storage/spin.h"
+#ifdef XCP
+#include "pgxc/nodemgr.h"
+#include "pgxc/squeue.h"
+#endif
#include "utils/memutils.h"
#ifdef LWLOCK_STATS
if (LWLockTrancheArray == NULL)
{
- LWLockTranchesAllocated = 32;
- LWLockTrancheArray = (LWLockTranche **)
- LWLockTranchesAllocated = 64;
++ LWLockTranchesAllocated = 128; /* XXX PG10MERGE: Not sure why 64 is
++ hardcoded in the PG10 branch. That
++ causes assertion failure */
+ LWLockTrancheArray = (char **)
MemoryContextAllocZero(TopMemoryContext,
- LWLockTranchesAllocated * sizeof(LWLockTranche *));
+ LWLockTranchesAllocated * sizeof(char *));
Assert(LWLockTranchesAllocated >= LWTRANCHE_FIRST_USER_DEFINED);
}
ReplicationOriginLock 40
MultiXactTruncationLock 41
OldSnapshotTimeMapLock 42
-BackendRandomLock 43
-LogicalRepWorkerLock 44
-CLogTruncationLock 45
+BarrierLock 43
+NodeTableLock 44
+SQueuesLock 45
+ClusterMonitorLock 46
++BackendRandomLock 47
++LogicalRepWorkerLock 48
++CLogTruncationLock 49
* proc.c
* routines to manage per-process shared memory data structure
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "access/twophase.h"
#include "access/xact.h"
#include "miscadmin.h"
+ #include "pgstat.h"
#include "postmaster/autovacuum.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "pgxc/poolmgr.h"
+#endif
#include "replication/slot.h"
#include "replication/syncrep.h"
+ #include "storage/condition_variable.h"
#include "storage/standby.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
MyProc->backendId = InvalidBackendId;
MyProc->databaseId = InvalidOid;
MyProc->roleId = InvalidOid;
+#ifdef XCP
+ MyProc->coordId = InvalidOid;
+ MyProc->coordPid = 0;
+#endif
+ MyProc->isBackgroundWorker = IsBackgroundWorker;
MyPgXact->delayChkpt = false;
MyPgXact->vacuumFlags = 0;
+#ifdef PGXC
+ MyProc->isPooler = false;
+#endif
/* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
if (IsAutoVacuumWorkerProcess())
MyPgXact->vacuumFlags |= PROC_IS_AUTOVACUUM;
MyProc->backendId = InvalidBackendId;
MyProc->databaseId = InvalidOid;
MyProc->roleId = InvalidOid;
+#ifdef XCP
+ MyProc->coordId = InvalidOid;
+ MyProc->coordPid = 0;
+#endif
+#ifdef PGXC
+ MyProc->isPooler = false;
+ if (IsPGXCPoolerProcess())
+ MyProc->isPooler = true;
+#endif
+ MyProc->isBackgroundWorker = IsBackgroundWorker;
MyPgXact->delayChkpt = false;
MyPgXact->vacuumFlags = 0;
MyProc->lwWaiting = false;
* support for communication destinations
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* postgres.c
* POSTGRES C Backend Interface
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
/*
* Run through the raw parsetree(s) and process each one.
*/
- foreach(parsetree_item, parsetree_list)
+ forboth(parsetree_item, parsetree_list, querysource_item, querysource_list)
{
- Node *parsetree = (Node *) lfirst(parsetree_item);
+ RawStmt *parsetree = lfirst_node(RawStmt, parsetree_item);
+ char *querysource = (char *) lfirst(querysource_item);
bool snapshot_set = false;
const char *commandTag;
char completionTag[COMPLETION_TAG_BUFSIZE];
appendStringInfoString(&str, "! system usage stats:\n");
appendStringInfo(&str,
- "!\t%ld.%06ld elapsed %ld.%06ld user %ld.%06ld system sec\n",
- (long) (elapse_t.tv_sec - save_t->tv_sec),
- (long) (elapse_t.tv_usec - save_t->tv_usec),
+ "!\t%ld.%06ld s user, %ld.%06ld s system, %ld.%06ld s elapsed\n",
- (long) (r.ru_utime.tv_sec - Save_r.ru_utime.tv_sec),
- (long) (r.ru_utime.tv_usec - Save_r.ru_utime.tv_usec),
- (long) (r.ru_stime.tv_sec - Save_r.ru_stime.tv_sec),
- (long) (r.ru_stime.tv_usec - Save_r.ru_stime.tv_usec),
- (long) (elapse_t.tv_sec - Save_t.tv_sec),
- (long) (elapse_t.tv_usec - Save_t.tv_usec));
+ (long) (r.ru_utime.tv_sec - save_r->ru_utime.tv_sec),
+ (long) (r.ru_utime.tv_usec - save_r->ru_utime.tv_usec),
+ (long) (r.ru_stime.tv_sec - save_r->ru_stime.tv_sec),
- (long) (r.ru_stime.tv_usec - save_r->ru_stime.tv_usec));
++ (long) (r.ru_stime.tv_usec - save_r->ru_stime.tv_usec),
++ (long) (elapse_t.tv_sec - save_t->tv_sec),
++ (long) (elapse_t.tv_usec - save_t->tv_usec));
appendStringInfo(&str,
- "!\t[%ld.%06ld user %ld.%06ld sys total]\n",
+ "!\t[%ld.%06ld s user, %ld.%06ld s system total]\n",
(long) user.tv_sec,
(long) user.tv_usec,
(long) sys.tv_sec,
* pquery.c
* POSTGRES process query command code
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
qd->planstate = NULL;
qd->totaltime = NULL;
- return qd;
- }
-
- /*
- * CreateUtilityQueryDesc
- */
- QueryDesc *
- CreateUtilityQueryDesc(Node *utilitystmt,
- const char *sourceText,
- Snapshot snapshot,
- DestReceiver *dest,
- ParamListInfo params)
- {
- QueryDesc *qd = (QueryDesc *) palloc(sizeof(QueryDesc));
-
- qd->operation = CMD_UTILITY; /* operation */
- qd->plannedstmt = NULL;
- qd->utilitystmt = utilitystmt; /* utility command */
- qd->sourceText = sourceText; /* query text */
- qd->snapshot = RegisterSnapshot(snapshot); /* snapshot */
- qd->crosscheck_snapshot = InvalidSnapshot; /* RI check snapshot */
- qd->dest = dest; /* output dest */
- qd->params = params; /* parameter values passed into query */
- qd->instrument_options = false; /* uninteresting for utilities */
-
- /* null these fields until set by ExecutorStart */
- qd->tupDesc = NULL;
- qd->estate = NULL;
- qd->planstate = NULL;
- qd->totaltime = NULL;
+#ifdef XCP
+ qd->squeue = NULL;
+ qd->myindex = -1;
+#endif
+
+ /* not yet executed */
+ qd->already_executed = false;
return qd;
}
{
PlannedStmt *pstmt = (PlannedStmt *) stmt;
+#ifdef XCP
+ if (list_length(pstmt->distributionRestrict) > 1)
+ return PORTAL_DISTRIBUTED;
+#endif
+
if (pstmt->canSetTag)
{
- if (pstmt->commandType == CMD_SELECT &&
- pstmt->utilityStmt == NULL)
+ if (pstmt->commandType == CMD_SELECT)
{
if (pstmt->hasModifyingCTE)
return PORTAL_ONE_MOD_WITH;
*/
switch (portal->strategy)
{
+#ifdef XCP
+ case PORTAL_DISTRIBUTED:
+ /* No special ability is needed */
+ eflags = 0;
+ /* Must set snapshot before starting executor. */
+ if (snapshot)
+ PushActiveSnapshot(GetActiveSnapshot());
+ else
+ PushActiveSnapshot(GetTransactionSnapshot());
+
+ /*
+ * Create QueryDesc in portal's context; for the moment, set
+ * the destination to DestNone.
+ */
+ queryDesc = CreateQueryDesc((PlannedStmt *) linitial(portal->stmts),
+ portal->sourceText,
+ GetActiveSnapshot(),
+ InvalidSnapshot,
+ None_Receiver,
+ params,
++ NULL,
+ 0);
+ /*
+ * If parent node have sent down parameters, and at least one
+ * of them is PARAM_EXEC we should avoid "single execution"
+ * model. All parent nodes deliver the same values for
+ * PARAM_EXTERN since these values are provided by client and
+ * they are not changed during the query execution.
+ * On the contrary, values of PARAM_EXEC are results of execution
+ * on the parent node and in general different parents send to
+ * this node different values and executions are not equivalent.
+ * Since PARAM_EXECs are always at the end of the list we just
+ * need to check last item to figure out if there are any
+ * PARAM_EXECs.
+ * NB: Check queryDesc->plannedstmt->nParamExec > 0 is incorrect
+ * here since queryDesc->plannedstmt->nParamExec may be used
+ * just to allocate space for them and no actual values passed.
+ */
+ if (queryDesc->plannedstmt->nParamRemote > 0 &&
+ queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC)
+ {
+ int *consMap;
+ int len;
+ ListCell *lc;
+ int i;
+ Locator *locator;
+ Oid keytype;
+ DestReceiver *dest;
+
+ len = list_length(queryDesc->plannedstmt->distributionNodes);
+ consMap = (int *) palloc0(len * sizeof(int));
+ queryDesc->squeue = NULL;
+ queryDesc->myindex = -1;
+ PGXC_PARENT_NODE_ID = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE,
+ &PGXC_PARENT_NODE_TYPE);
+ i = 0;
+ foreach(lc, queryDesc->plannedstmt->distributionNodes)
+ {
+ if (PGXC_PARENT_NODE_ID == lfirst_int(lc))
+ consMap[i] = SQ_CONS_SELF;
+ else
+ consMap[i] = SQ_CONS_NONE;
+ i++;
+ }
+ /*
+ * Multiple executions of the RemoteSubplan may lead to name
+ * conflict of SharedQueue, if the subplan has more
+ * RemoteSubplan nodes in the execution plan tree.
+ * We need to make them unique.
+ */
+ RemoteSubplanMakeUnique(
+ (Node *) queryDesc->plannedstmt->planTree,
+ PGXC_PARENT_NODE_ID);
+ /*
+ * Call ExecutorStart to prepare the plan for execution
+ */
+ ExecutorStart(queryDesc, eflags);
+
+ /*
+ * Set up locator if result distribution is requested
+ */
+ keytype = queryDesc->plannedstmt->distributionKey == InvalidAttrNumber ?
+ InvalidOid :
+ queryDesc->tupDesc->attrs[queryDesc->plannedstmt->distributionKey-1]->atttypid;
+ locator = createLocator(
+ queryDesc->plannedstmt->distributionType,
+ RELATION_ACCESS_INSERT,
+ keytype,
+ LOCATOR_LIST_INT,
+ len,
+ consMap,
+ NULL,
+ false);
+ dest = CreateDestReceiver(DestProducer);
+ SetProducerDestReceiverParams(dest,
+ queryDesc->plannedstmt->distributionKey,
+ locator, queryDesc->squeue);
+ queryDesc->dest = dest;
+ }
+ else
+ {
+ int *consMap;
+ int len;
+
+ /* Distributed data requested, bind shared queue for data exchange */
+ len = list_length(queryDesc->plannedstmt->distributionNodes);
+ consMap = (int *) palloc(len * sizeof(int));
+ queryDesc->squeue = SharedQueueBind(portal->name,
+ queryDesc->plannedstmt->distributionRestrict,
+ queryDesc->plannedstmt->distributionNodes,
+ &queryDesc->myindex, consMap);
+ if (queryDesc->myindex == -1)
+ {
+ /* producer */
+ Locator *locator;
+ Oid keytype;
+ DestReceiver *dest;
+
+ PG_TRY();
+ {
+ /*
+ * Call ExecutorStart to prepare the plan for execution
+ */
+ ExecutorStart(queryDesc, eflags);
+ }
+ PG_CATCH();
+ {
+ /* Ensure SharedQueue is released */
+ SharedQueueUnBind(queryDesc->squeue, true);
+ queryDesc->squeue = NULL;
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ /*
+ * This tells PortalCleanup to shut down the executor
+ */
+ portal->queryDesc = queryDesc;
+
+ /*
+ * Some basic sanity checking against invalid remote plans.
+ */
+ Assert((queryDesc->plannedstmt->distributionKey == InvalidAttrNumber) ||
+ (queryDesc->plannedstmt->distributionKey <= queryDesc->tupDesc->natts));
+
+ /*
+ * Set up locator if result distribution is requested
+ */
+ keytype = queryDesc->plannedstmt->distributionKey == InvalidAttrNumber ?
+ InvalidOid :
+ queryDesc->tupDesc->attrs[queryDesc->plannedstmt->distributionKey-1]->atttypid;
+ locator = createLocator(
+ queryDesc->plannedstmt->distributionType,
+ RELATION_ACCESS_INSERT,
+ keytype,
+ LOCATOR_LIST_INT,
+ len,
+ consMap,
+ NULL,
+ false);
+ dest = CreateDestReceiver(DestProducer);
+ SetProducerDestReceiverParams(dest,
+ queryDesc->plannedstmt->distributionKey,
+ locator, queryDesc->squeue);
+ queryDesc->dest = dest;
+
+ addProducingPortal(portal);
+ }
+ else
+ {
+ /*
+ * We do not need to initialize executor, but need
+ * a tuple descriptor
+ */
+ queryDesc->tupDesc = ExecCleanTypeFromTL(
+ queryDesc->plannedstmt->planTree->targetlist,
+ false);
+ }
+ pfree(consMap);
+ }
+
+ portal->queryDesc = queryDesc;
+
+ /*
+ * Remember tuple descriptor (computed by ExecutorStart)
+ */
+ portal->tupDesc = queryDesc->tupDesc;
+
+ /*
+ * Reset cursor position data to "start of query"
+ */
+ portal->atStart = true;
+ portal->atEnd = false; /* allow fetches */
+ portal->portalPos = 0;
+
+ PopActiveSnapshot();
+ break;
+#endif
+
case PORTAL_ONE_SELECT:
/* Must set snapshot before starting executor. */
portal->sourceText,
isTopLevel ? PROCESS_UTILITY_TOPLEVEL : PROCESS_UTILITY_QUERY,
portal->portalParams,
+ portal->queryEnv,
dest,
+#ifdef PGXC
+ false,
+#endif /* PGXC */
completionTag);
/* Some utility statements may change context on us */
ProcessQuery(pstmt,
portal->sourceText,
portal->portalParams,
+ portal->queryEnv,
dest, completionTag);
+#ifdef PGXC
+ /* it's special for INSERT */
+ if (IS_PGXC_COORDINATOR &&
+ pstmt->commandType == CMD_INSERT)
+ HandleCmdComplete(pstmt->commandType, &combine,
+ completionTag, strlen(completionTag));
+#endif
}
else
{
portal->atEnd = false;
portal->portalPos = 0;
}
- ExecutorRun(queryDesc, ForwardScanDirection, PRODUCE_TUPLES);
+
+#ifdef XCP
+/*
+ * AdvanceProducingPortal
+ *		Execute the specified portal's query and distribute tuples to
+ *		consumers.
+ *
+ * portal   - the producing portal to advance
+ * can_wait - if true, a fully completed portal may be torn down here
+ *
+ * Returns 1 if the portal should keep producing, 0 if all consumers have
+ * enough rows in their buffers to pause producing temporarily, -1 if the
+ * query is completed.
+ */
+int
+AdvanceProducingPortal(Portal portal, bool can_wait)
+{
+	Portal		saveActivePortal;
+	ResourceOwner saveResourceOwner;
+	MemoryContext savePortalContext;
+	MemoryContext oldContext;
+	QueryDesc  *queryDesc;
+	SharedQueue squeue;
+	DestReceiver *treceiver;
+	int			result;
+
+	queryDesc = PortalGetQueryDesc(portal);
+	/* Check the pointer before dereferencing it */
+	Assert(queryDesc);
+	squeue = queryDesc->squeue;
+
+	/* Make sure the portal is producing */
+	Assert(squeue && queryDesc->myindex == -1);
+	/* Make sure there is proper receiver */
+	Assert(queryDesc->dest && queryDesc->dest->mydest == DestProducer);
+
+	/*
+	 * Set up global portal context pointers.
+	 */
+	saveActivePortal = ActivePortal;
+	saveResourceOwner = CurrentResourceOwner;
+	savePortalContext = PortalContext;
+	PG_TRY();
+	{
+		ActivePortal = portal;
+		CurrentResourceOwner = portal->resowner;
+		PortalContext = PortalGetHeapMemory(portal);
+
+		oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(portal));
+
+		/*
+		 * This is the first pass through if the hold store is not
+		 * initialized yet; need to initialize stuff.
+		 */
+		if (portal->holdStore == NULL && portal->status != PORTAL_FAILED)
+		{
+			int			idx;
+			char		storename[64];
+
+			PortalCreateProducerStore(portal);
+			treceiver = CreateDestReceiver(DestTuplestore);
+			SetTuplestoreDestReceiverParams(treceiver,
+											portal->holdStore,
+											portal->holdContext,
+											false);
+			SetSelfConsumerDestReceiver(queryDesc->dest, treceiver);
+			SetProducerTempMemory(queryDesc->dest, portal->tmpContext);
+			snprintf(storename, 64, "%s producer store", portal->name);
+			tuplestore_collect_stat(portal->holdStore, storename);
+
+			/*
+			 * Tuplestore does not clear the eof flag on the active read
+			 * pointer, causing the store to stay in EOF state once reached
+			 * when there is a single read pointer.  We do not want behavior
+			 * like this and work around it by using a secondary read
+			 * pointer.  Primary read pointer (0) is active when we are
+			 * writing to the tuple store, secondary read pointer is for
+			 * reading, and its eof flag is cleared if a tuple is written to
+			 * the store.  We know the extra read pointer has index 1, so do
+			 * not store it.
+			 */
+			idx = tuplestore_alloc_read_pointer(portal->holdStore, 0);
+			Assert(idx == 1);
+		}
+
+		if (queryDesc->estate && !queryDesc->estate->es_finished &&
+			portal->status != PORTAL_FAILED)
+		{
+			/*
+			 * If the portal's hold store has tuples available for read and
+			 * all consumer queues are not empty we skip advancing the portal
+			 * (pause it) to prevent buffering too many rows at the producer.
+			 * NB just created portal store would not be in EOF state, but in
+			 * this case consumer queues will be empty and do not allow
+			 * erroneous pause. After the first call to AdvanceProducingPortal
+			 * portal will try to read the hold store and EOF flag will be set
+			 * correctly.
+			 */
+			tuplestore_select_read_pointer(portal->holdStore, 1);
+			if (!tuplestore_ateof(portal->holdStore) &&
+				SharedQueueCanPause(squeue))
+				result = 0;
+			else
+				result = 1;
+			tuplestore_select_read_pointer(portal->holdStore, 0);
+
+			if (result)
+			{
+				/* Execute query and dispatch tuples via dest receiver */
+#define PRODUCE_TUPLES	100
+				PushActiveSnapshot(queryDesc->snapshot);
++				ExecutorRun(queryDesc, ForwardScanDirection, PRODUCE_TUPLES, true);
+				PopActiveSnapshot();
+
+				if (queryDesc->estate->es_processed < PRODUCE_TUPLES)
+				{
+					/*
+					 * Finish the executor, but we may still have some tuples
+					 * in the local storages.  We should keep trying to push
+					 * them into the squeue, so do not remove the portal from
+					 * the list of producers.
+					 */
+					ExecutorFinish(queryDesc);
+				}
+			}
+		}
+
+		/* Try to dump local tuplestores */
+		if ((queryDesc->estate == NULL || queryDesc->estate->es_finished) &&
+			ProducerReceiverPushBuffers(queryDesc->dest))
+		{
+			if (can_wait && queryDesc->estate == NULL)
+			{
+				/* Fully done: tear down the producer and the portal itself */
+				(*queryDesc->dest->rDestroy) (queryDesc->dest);
+				queryDesc->dest = NULL;
+				portal->queryDesc = NULL;
+				squeue = NULL;
+
+				removeProducingPortal(portal);
+				FreeQueryDesc(queryDesc);
+
+				/*
+				 * Current context is the portal context, which is going
+				 * to be deleted
+				 */
+				MemoryContextSwitchTo(TopTransactionContext);
+
+				ActivePortal = saveActivePortal;
+				CurrentResourceOwner = saveResourceOwner;
+				PortalContext = savePortalContext;
+
+				if (portal->resowner)
+				{
+					bool		isCommit = (portal->status != PORTAL_FAILED);
+
+					ResourceOwnerRelease(portal->resowner,
+										 RESOURCE_RELEASE_BEFORE_LOCKS,
+										 isCommit, false);
+					ResourceOwnerRelease(portal->resowner,
+										 RESOURCE_RELEASE_LOCKS,
+										 isCommit, false);
+					ResourceOwnerRelease(portal->resowner,
+										 RESOURCE_RELEASE_AFTER_LOCKS,
+										 isCommit, false);
+					ResourceOwnerDelete(portal->resowner);
+				}
+				portal->resowner = NULL;
+
+				/*
+				 * Delete tuplestore if present. We should do this even under
+				 * error conditions; since the tuplestore would have been
+				 * using cross-transaction storage, its temp files need to be
+				 * explicitly deleted.
+				 */
+				if (portal->holdStore)
+				{
+					MemoryContext oldcontext;
+
+					oldcontext = MemoryContextSwitchTo(portal->holdContext);
+					tuplestore_end(portal->holdStore);
+					MemoryContextSwitchTo(oldcontext);
+					portal->holdStore = NULL;
+				}
+
+				/* delete tuplestore storage, if any */
+				if (portal->holdContext)
+					MemoryContextDelete(portal->holdContext);
+
+				/* release subsidiary storage */
+				MemoryContextDelete(PortalGetHeapMemory(portal));
+
+				/* release portal struct (it's in PortalMemory) */
+				pfree(portal);
+			}
+			/* report portal is not producing */
+			result = -1;
+		}
+		else
+		{
+			result = SharedQueueCanPause(queryDesc->squeue) ? 0 : 1;
+		}
+	}
+	PG_CATCH();
+	{
+		/* Uncaught error while executing portal: mark it dead */
+		portal->status = PORTAL_FAILED;
+
+		/*
+		 * Reset producer to allow consumers to finish, so receiving node
+		 * will handle the error.
+		 */
+		if (squeue)
+			SharedQueueReset(squeue, -1);
+
+		/* Restore global vars and propagate error */
+		ActivePortal = saveActivePortal;
+		CurrentResourceOwner = saveResourceOwner;
+		PortalContext = savePortalContext;
+
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	MemoryContextSwitchTo(oldContext);
+
+	ActivePortal = saveActivePortal;
+	CurrentResourceOwner = saveResourceOwner;
+	PortalContext = savePortalContext;
+
+	return result;
+}
+
+
+/*
+ * cleanupClosedProducers
+ *		Iterate over the producing portals, find those that are already
+ *		closed, and clean them up, waiting while consumers finish their
+ *		work.  Closed producers should be cleaned up and their resources
+ *		released before proceeding with handling of the next request.
+ */
+void
+cleanupClosedProducers(void)
+{
+	ListCell *lc = list_head(getProducingPortals());
+	while (lc)
+	{
+		Portal p = (Portal) lfirst(lc);
+		/*
+		 * NOTE(review): assumes every portal on the producing list has a
+		 * non-NULL queryDesc — confirm addProducingPortal guarantees that.
+		 */
+		QueryDesc *queryDesc = PortalGetQueryDesc(p);
+		SharedQueue squeue = queryDesc->squeue;
+
+		/*
+		 * Get next already, because next call may remove cell from
+		 * the list and invalidate next reference
+		 */
+		lc = lnext(lc);
+
+		/* When portal is closed executor state is not set */
+		if (queryDesc->estate == NULL)
+		{
+			/*
+			 * Set up global portal context pointers.
+			 */
+			Portal saveActivePortal = ActivePortal;
+			ResourceOwner saveResourceOwner = CurrentResourceOwner;
+			MemoryContext savePortalContext = PortalContext;
+
+			PG_TRY();
+			{
+				MemoryContext oldContext;
+				ActivePortal = p;
+				CurrentResourceOwner = p->resowner;
+				PortalContext = PortalGetHeapMemory(p);
+
+				oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(p));
+
+				/* Destroy the producer receiver and unlink the query desc */
+				(*queryDesc->dest->rDestroy) (queryDesc->dest);
+				queryDesc->dest = NULL;
+				p->queryDesc = NULL;
+				squeue = NULL;
+
+				removeProducingPortal(p);
+				FreeQueryDesc(queryDesc);
+
+				/*
+				 * Current context is the portal context, which is going
+				 * to be deleted
+				 */
+				MemoryContextSwitchTo(TopTransactionContext);
+
+				ActivePortal = saveActivePortal;
+				CurrentResourceOwner = saveResourceOwner;
+				PortalContext = savePortalContext;
+
+				/* Release everything the portal's resource owner holds */
+				if (p->resowner)
+				{
+					bool isCommit = (p->status != PORTAL_FAILED);
+
+					ResourceOwnerRelease(p->resowner,
+										 RESOURCE_RELEASE_BEFORE_LOCKS,
+										 isCommit, false);
+					ResourceOwnerRelease(p->resowner,
+										 RESOURCE_RELEASE_LOCKS,
+										 isCommit, false);
+					ResourceOwnerRelease(p->resowner,
+										 RESOURCE_RELEASE_AFTER_LOCKS,
+										 isCommit, false);
+					ResourceOwnerDelete(p->resowner);
+				}
+				p->resowner = NULL;
+
+				/*
+				 * Delete tuplestore if present. We should do this even under error
+				 * conditions; since the tuplestore would have been using cross-
+				 * transaction storage, its temp files need to be explicitly deleted.
+				 */
+				if (p->holdStore)
+				{
+					MemoryContext oldcontext;
+
+					oldcontext = MemoryContextSwitchTo(p->holdContext);
+					tuplestore_end(p->holdStore);
+					MemoryContextSwitchTo(oldcontext);
+					p->holdStore = NULL;
+				}
+
+				/* delete tuplestore storage, if any */
+				if (p->holdContext)
+					MemoryContextDelete(p->holdContext);
+
+				/* release subsidiary storage */
+				MemoryContextDelete(PortalGetHeapMemory(p));
+
+				/* release portal struct (it's in PortalMemory) */
+				pfree(p);
+
+				MemoryContextSwitchTo(oldContext);
+			}
+			PG_CATCH();
+			{
+				/* Uncaught error while executing portal: mark it dead */
+				p->status = PORTAL_FAILED;
+				/*
+				 * Reset producer to allow consumers to finish, so receiving node will
+				 * handle the error.
+				 */
+				if (squeue)
+					SharedQueueReset(squeue, -1);
+
+				/* Restore global vars and propagate error */
+				ActivePortal = saveActivePortal;
+				CurrentResourceOwner = saveResourceOwner;
+				PortalContext = savePortalContext;
+
+				PG_RE_THROW();
+			}
+			PG_END_TRY();
+
+			ActivePortal = saveActivePortal;
+			CurrentResourceOwner = saveResourceOwner;
+			PortalContext = savePortalContext;
+		}
+	}
+}
+#endif
* commands. At one time acted as an interface between the Lisp and C
* systems.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
const char *queryString,
ProcessUtilityContext context,
ParamListInfo params,
+ QueryEnvironment *queryEnv,
DestReceiver *dest,
+ bool sentToRemote,
char *completionTag);
+
+#ifdef PGXC
+static void ExecDropStmt(DropStmt *stmt,
+ const char *queryString,
+ bool sentToRemote,
+ bool isTopLevel);
+#else
static void ExecDropStmt(DropStmt *stmt, bool isTopLevel);
+#endif
/*
const char *queryString,
ProcessUtilityContext context,
ParamListInfo params,
+ QueryEnvironment *queryEnv,
DestReceiver *dest,
+#ifdef PGXC
+ bool sentToRemote,
+#endif
char *completionTag)
{
+ Assert(IsA(pstmt, PlannedStmt));
+ Assert(pstmt->commandType == CMD_UTILITY);
Assert(queryString != NULL); /* required as of 8.4 */
/*
* call standard_ProcessUtility().
*/
if (ProcessUtility_hook)
- (*ProcessUtility_hook) (parsetree, queryString,
- context, params,
+ (*ProcessUtility_hook) (pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif
+ completionTag);
else
- standard_ProcessUtility(parsetree, queryString,
- context, params,
+ standard_ProcessUtility(pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
- #ifdef PGXC
+ sentToRemote,
- #endif
+ completionTag);
}
/*
const char *queryString,
ProcessUtilityContext context,
ParamListInfo params,
+ QueryEnvironment *queryEnv,
DestReceiver *dest,
+#ifdef PGXC
+ bool sentToRemote,
+#endif
char *completionTag)
{
+ Node *parsetree = pstmt->utilityStmt;
bool isTopLevel = (context == PROCESS_UTILITY_TOPLEVEL);
+ ParseState *pstate;
+#ifdef PGXC
+ /*
+ * For more detail see comments in function pgxc_lock_for_backup.
+ *
+ * Consider the following scenario:
+ * Imagine a two coordinator cluster CO1, CO2
+ * Suppose a client connected to CO1 issues select pgxc_lock_for_backup()
+ * Now assume that a client connected to CO2 issues a create table
+ * select pgxc_lock_for_backup() would try to acquire the advisory lock
+ * in exclusive mode, whereas create table would try to acquire the same
+ * lock in shared mode. Both these requests will always try to acquire the
+ * lock in the same order i.e. they would both direct the request first to
+ * CO1 and then to CO2. One of the two requests would therefore pass
+ * and the other would fail.
+ *
+ * Consider another scenario:
+ * Suppose we have a two coordinator cluster CO1 and CO2
+ * Assume one client connected to each coordinator
+ * Further assume one client starts a transaction
+ * and issues a DDL. This is an unfinished transaction.
+ * Now assume the second client issues
+ * select pgxc_lock_for_backup()
+ * This request would fail because the unfinished transaction
+ * would already hold the advisory lock.
+ */
+ if (IS_PGXC_LOCAL_COORDINATOR && IsNormalProcessingMode())
+ {
+ /* Is the statement a prohibited one? */
+ if (!IsStmtAllowedInLockedMode(parsetree, queryString))
+ pgxc_lock_for_utility_stmt(parsetree);
+ }
+#endif
check_xact_readonly(parsetree);
case T_CreatedbStmt:
/* no event triggers for global objects */
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+#endif
PreventTransactionChain(isTopLevel, "CREATE DATABASE");
- createdb((CreatedbStmt *) parsetree);
+ createdb(pstate, (CreatedbStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityWithMessage(queryString, sentToRemote, false);
+#endif
break;
case T_AlterDatabaseStmt:
/* no event triggers for global objects */
- AlterDatabase((AlterDatabaseStmt *) parsetree, isTopLevel);
+ AlterDatabase(pstate, (AlterDatabaseStmt *) parsetree, isTopLevel);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ /*
+ * If this is not a SET TABLESPACE statement, just propagate the
+ * cmd as usual.
+ */
+ if (!IsSetTableSpace((AlterDatabaseStmt*) parsetree))
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+ else
+ ExecUtilityWithMessage(queryString, sentToRemote, false);
+ }
+#endif
break;
case T_AlterDatabaseSetStmt:
*/
case T_CreateRoleStmt:
/* no event triggers for global objects */
- CreateRole((CreateRoleStmt *) parsetree);
+ CreateRole(pstate, (CreateRoleStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
break;
case T_AlterRoleStmt:
GrantStmt *stmt = (GrantStmt *) parsetree;
if (EventTriggerSupportsGrantObjectType(stmt->objtype))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
- ExecuteGrantStmt((GrantStmt *) parsetree);
+ ExecuteGrantStmt(stmt);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ RemoteQueryExecType remoteExecType = EXEC_ON_ALL_NODES;
+ GrantStmt *stmt = (GrantStmt *) parsetree;
+ bool is_temp = false;
+
+ /* Launch GRANT on Coordinator if object is a sequence */
+ if ((stmt->objtype == ACL_OBJECT_RELATION &&
+ stmt->targtype == ACL_TARGET_OBJECT))
+ {
+ /*
+ * In case object is a relation, differentiate the case
+ * of a sequence, a view and a table
+ */
+ ListCell *cell;
+ /* Check the list of objects */
+ bool first = true;
+ RemoteQueryExecType type_local = remoteExecType;
+
+ foreach (cell, stmt->objects)
+ {
+ RangeVar *relvar = (RangeVar *) lfirst(cell);
+ Oid relid = RangeVarGetRelid(relvar, NoLock, true);
+
+ /* Skip if object does not exist */
+ if (!OidIsValid(relid))
+ continue;
+
+ remoteExecType = ExecUtilityFindNodesRelkind(relid, &is_temp);
+
+ /* Check if object node type corresponds to the first one */
+ if (first)
+ {
+ type_local = remoteExecType;
+ first = false;
+ }
+ else
+ {
+ if (type_local != remoteExecType)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("PGXC does not support GRANT on multiple object types"),
+ errdetail("Grant VIEW/TABLE with separate queries")));
+ }
+ }
+ }
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, remoteExecType, is_temp);
+ }
+#endif
}
break;
DropStmt *stmt = (DropStmt *) parsetree;
if (EventTriggerSupportsObjectType(stmt->removeType))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
- ExecDropStmt(stmt, isTopLevel);
+ ExecDropStmt(stmt, queryString, sentToRemote, isTopLevel);
}
break;
RenameStmt *stmt = (RenameStmt *) parsetree;
if (EventTriggerSupportsObjectType(stmt->renameType))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
ExecRenameStmt(stmt);
}
AlterObjectDependsStmt *stmt = (AlterObjectDependsStmt *) parsetree;
if (EventTriggerSupportsObjectType(stmt->objectType))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
ExecAlterObjectDependsStmt(stmt, NULL);
}
AlterObjectSchemaStmt *stmt = (AlterObjectSchemaStmt *) parsetree;
if (EventTriggerSupportsObjectType(stmt->objectType))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
ExecAlterObjectSchemaStmt(stmt, NULL);
}
AlterOwnerStmt *stmt = (AlterOwnerStmt *) parsetree;
if (EventTriggerSupportsObjectType(stmt->objectType))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
ExecAlterOwnerStmt(stmt);
+
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
}
break;
CommentStmt *stmt = (CommentStmt *) parsetree;
if (EventTriggerSupportsObjectType(stmt->objtype))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
- CommentObject((CommentStmt *) parsetree);
+ CommentObject(stmt);
+ break;
}
+#ifdef PGXC
+ {
+ /* Comment objects depending on their object and temporary types */
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ bool is_temp = false;
+ CommentStmt *stmt = (CommentStmt *) parsetree;
+ RemoteQueryExecType exec_type = GetNodesForCommentUtility(stmt, &is_temp);
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, exec_type, is_temp);
+ }
+ }
+#endif
+ break;
case T_SecLabelStmt:
{
SecLabelStmt *stmt = (SecLabelStmt *) parsetree;
if (EventTriggerSupportsObjectType(stmt->objtype))
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
else
ExecSecLabelStmt(stmt);
break;
default:
/* All other statement types have event trigger support */
- ProcessUtilitySlow(parsetree, queryString,
- context, params,
+ ProcessUtilitySlow(pstate, pstmt, queryString,
+ context, params, queryEnv,
- dest, completionTag);
+ dest,
+ sentToRemote,
+ completionTag);
break;
}
+
+ free_parsestate(pstate);
}
/*
const char *queryString,
ProcessUtilityContext context,
ParamListInfo params,
+ QueryEnvironment *queryEnv,
DestReceiver *dest,
+ bool sentToRemote,
char *completionTag)
{
+ Node *parsetree = pstmt->utilityStmt;
bool isTopLevel = (context == PROCESS_UTILITY_TOPLEVEL);
bool isCompleteQuery = (context <= PROCESS_UTILITY_QUERY);
bool needCleanup;
* relation and attribute manipulation
*/
case T_CreateSchemaStmt:
- #ifdef PGXC
- CreateSchemaCommand((CreateSchemaStmt *) parsetree,
- queryString, sentToRemote);
- #else
CreateSchemaCommand((CreateSchemaStmt *) parsetree,
- queryString);
- #endif
- queryString,
++ queryString, sentToRemote,
+ pstmt->stmt_location,
+ pstmt->stmt_len);
/*
* EventTriggerCollectSimpleCommand called by
queryString,
PROCESS_UTILITY_SUBCOMMAND,
params,
+ NULL,
None_Receiver,
+#ifdef PGXC
+ true,
+#endif
NULL);
}
queryString,
PROCESS_UTILITY_SUBCOMMAND,
params,
+ NULL,
None_Receiver,
+#ifdef PGXC
+ true,
+#endif /* PGXC */
NULL);
EventTriggerAlterTableStart(parsetree);
EventTriggerAlterTableRelid(relid);
break;
case T_CreateExtensionStmt:
- address = CreateExtension((CreateExtensionStmt *) parsetree);
+ address = CreateExtension(pstate, (CreateExtensionStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
break;
case T_AlterExtensionStmt:
- address = ExecAlterExtensionStmt((AlterExtensionStmt *) parsetree);
+ address = ExecAlterExtensionStmt(pstate, (AlterExtensionStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
break;
case T_AlterExtensionContentsStmt:
break;
case T_AlterEnumStmt: /* ALTER TYPE (enum) */
- address = AlterEnum((AlterEnumStmt *) parsetree, isTopLevel);
+ address = AlterEnum((AlterEnumStmt *) parsetree);
+#ifdef PGXC
+ /*
+ * In this case force autocommit, this transaction cannot be launched
+ * inside a transaction block.
+ */
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote,
+ true, EXEC_ON_ALL_NODES, false);
+#endif
break;
case T_ViewStmt: /* CREATE VIEW */
EventTriggerAlterTableStart(parsetree);
- address = DefineView((ViewStmt *) parsetree, queryString);
+ address = DefineView((ViewStmt *) parsetree, queryString,
+ pstmt->stmt_location, pstmt->stmt_len);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ ViewStmt *stmt = (ViewStmt *) parsetree;
+
+ if (stmt->view->relpersistence != RELPERSISTENCE_TEMP)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_COORDS, false);
+ }
+#endif
EventTriggerCollectSimpleCommand(address, secondaryObject,
parsetree);
/* stashed internally */
break;
case T_CreateFunctionStmt: /* CREATE FUNCTION */
- address = CreateFunction((CreateFunctionStmt *) parsetree, queryString);
+ address = CreateFunction(pstate, (CreateFunctionStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
break;
case T_AlterFunctionStmt: /* ALTER FUNCTION */
- address = AlterFunction((AlterFunctionStmt *) parsetree);
+ address = AlterFunction(pstate, (AlterFunctionStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
break;
case T_RuleStmt: /* CREATE RULE */
break;
case T_CreateSeqStmt:
- address = DefineSequence((CreateSeqStmt *) parsetree);
+ address = DefineSequence(pstate, (CreateSeqStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ CreateSeqStmt *stmt = (CreateSeqStmt *) parsetree;
+
+ /* In case this query is related to a SERIAL execution, just bypass */
+ if (!stmt->is_serial)
+ {
+ bool is_temp = stmt->sequence->relpersistence == RELPERSISTENCE_TEMP;
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, is_temp);
+ }
+ }
+#endif
break;
case T_AlterSeqStmt:
- address = AlterSequence((AlterSeqStmt *) parsetree);
+ address = AlterSequence(pstate, (AlterSeqStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ {
+ AlterSeqStmt *stmt = (AlterSeqStmt *) parsetree;
+
+ /* In case this query is related to a SERIAL execution, just bypass */
+ if (!stmt->is_serial)
+ {
+ bool is_temp;
+ RemoteQueryExecType exec_type;
+ Oid relid = RangeVarGetRelid(stmt->sequence, NoLock, true);
+
+ if (!OidIsValid(relid))
+ break;
+
+ exec_type = ExecUtilityFindNodes(OBJECT_SEQUENCE,
+ relid,
+ &is_temp);
+
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, exec_type, is_temp);
+ }
+ }
+#endif
break;
case T_CreateTableAsStmt:
address = ExecCreateTableAs((CreateTableAsStmt *) parsetree,
- queryString, params, completionTag);
+ queryString, params, queryEnv,
+ completionTag);
+#ifdef PGXC
+ if ((IS_PGXC_COORDINATOR) && !IsConnFromCoord())
+ {
+ CreateTableAsStmt *stmt = (CreateTableAsStmt *) parsetree;
+
+ /*
+ * CTAS for normal tables should have been rewritten as a
+ * CREATE TABLE + SELECT INTO
+ */
+ Assert(stmt->relkind == OBJECT_MATVIEW);
+ if (stmt->into->rel->relpersistence != RELPERSISTENCE_TEMP)
+ ExecUtilityStmtOnNodes(queryString, NULL,
+ sentToRemote, false, EXEC_ON_COORDS, false);
+ }
+#endif
break;
case T_RefreshMatViewStmt:
break;
case T_AlterTSConfigurationStmt:
- address = AlterTSConfiguration((AlterTSConfigurationStmt *) parsetree);
+ AlterTSConfiguration((AlterTSConfigurationStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
+
+ /*
+ * Commands are stashed in MakeConfigurationMapping and
+ * DropConfigurationMapping, which are called from
+ * AlterTSConfiguration
+ */
+ commandCollected = true;
break;
case T_AlterTableMoveAllStmt:
break;
case T_AlterDefaultPrivilegesStmt:
- ExecAlterDefaultPrivilegesStmt((AlterDefaultPrivilegesStmt *) parsetree);
+ ExecAlterDefaultPrivilegesStmt(pstate, (AlterDefaultPrivilegesStmt *) parsetree);
+
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
EventTriggerCollectAlterDefPrivs((AlterDefaultPrivilegesStmt *) parsetree);
commandCollected = true;
break;
case T_CreateAmStmt:
address = CreateAccessMethod((CreateAmStmt *) parsetree);
+#ifdef PGXC
+ if (IS_PGXC_LOCAL_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
break;
+ case T_CreatePublicationStmt:
+ address = CreatePublication((CreatePublicationStmt *) parsetree);
+ break;
+
+ case T_AlterPublicationStmt:
+ AlterPublication((AlterPublicationStmt *) parsetree);
+
+ /*
+ * AlterPublication calls EventTriggerCollectSimpleCommand
+ * directly
+ */
+ commandCollected = true;
+ break;
+
+ case T_CreateSubscriptionStmt:
+ address = CreateSubscription((CreateSubscriptionStmt *) parsetree,
+ isTopLevel);
+ break;
+
+ case T_AlterSubscriptionStmt:
+ address = AlterSubscription((AlterSubscriptionStmt *) parsetree);
+ break;
+
+ case T_DropSubscriptionStmt:
+ DropSubscription((DropSubscriptionStmt *) parsetree, isTopLevel);
+ /* no commands stashed for DROP */
+ commandCollected = true;
+ break;
+
+ case T_CreateStatsStmt:
+ address = CreateStatistics((CreateStatsStmt *) parsetree);
+ break;
+
+ case T_AlterCollationStmt:
+ address = AlterCollation((AlterCollationStmt *) parsetree);
+ break;
+
default:
elog(ERROR, "unrecognized node type: %d",
(int) nodeTag(parsetree));
return lev;
}
- if (stmt->objtype == OBJECT_DATABASE && list_length(stmt->objname) == 1)
+
+#ifdef PGXC
+
+/*
+ * ExecUtilityWithMessage:
+ * Execute the query on remote nodes in a transaction block.
+ * If this fails on one of the nodes:
+ * - add a context message containing the names of the nodes on which
+ *   the query failed, and
+ * - rethrow the error with that message attached.
+ * If all nodes succeed, just return.
+ */
+ static void
+ExecUtilityWithMessage(const char *queryString, bool sentToRemote, bool is_temp)
+{
+ PG_TRY();
+ {
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, is_temp);
+ }
+ PG_CATCH();
+ {
+
+ /*
+ * Some nodes failed. Add an error context line listing the nodes
+ * on which the query failed before rethrowing.
+ */
+ ExecNodes *coord_success_nodes = NULL;
+ ExecNodes *data_success_nodes = NULL;
+ char *msg_failed_nodes;
+
+ pgxc_all_success_nodes(&data_success_nodes, &coord_success_nodes, &msg_failed_nodes);
+ if (msg_failed_nodes)
+ errcontext("%s", msg_failed_nodes);
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+
+}
+
+/*
+ * ExecUtilityStmtOnNodes
+ *
+ * Execute a Utility statement on nodes, including Coordinators.
+ * If the DDL is received from a remote Coordinator,
+ * it is not possible to push down DDL to Datanodes
+ * as it is taken in charge by the remote Coordinator.
+ *
+ * queryString      - deparsed statement text shipped to the remote nodes
+ * nodes            - explicit target nodes, or NULL to use exec_type's default
+ * sentToRemote     - statement was already shipped; function is then a no-op
+ * force_autocommit - run the remote query in autocommit (no transaction block)
+ * exec_type        - which classes of nodes (Coordinators/Datanodes) to target
+ * is_temp          - statement involves a temporary object (not referenced in
+ *                    this function's visible body — TODO confirm intent)
+ */
+ static void
+ExecUtilityStmtOnNodes(const char *queryString, ExecNodes *nodes, bool sentToRemote,
+ bool force_autocommit, RemoteQueryExecType exec_type, bool is_temp)
+{
+ /* Return if query is launched on no nodes */
+ if (exec_type == EXEC_ON_NONE)
+ return;
+
+ /* Nothing to be done if this statement has been sent to the nodes */
+ if (sentToRemote)
+ return;
+
+ /* If no Datanodes defined, the query cannot be launched */
+ if (NumDataNodes == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("No Datanode defined in cluster"),
+ errhint("You need to define at least 1 Datanode with "
+ "CREATE NODE.")));
+
+ /* Only a locally-connected Coordinator may push DDL to other nodes */
+ if (!IsConnFromCoord())
+ {
+ RemoteQuery *step = makeNode(RemoteQuery);
+ step->combine_type = COMBINE_TYPE_SAME;
+ step->exec_nodes = nodes;
+ step->sql_statement = pstrdup(queryString);
+ step->force_autocommit = force_autocommit;
+ step->exec_type = exec_type;
+ ExecRemoteUtility(step);
+ pfree(step->sql_statement);
+ pfree(step);
+ }
+}
+
+/*
+ * ExecUtilityFindNodes
+ *
+ * Determine the set of nodes on which to launch the query.
+ * This depends on the object type and on whether the object is temporary.
+ * Also returns, via *is_temp, a flag indicating whether the relation is
+ * temporary.
+ *
+ * If the object is a RULE, the object id passed in is that of the object
+ * to which the rule is applicable.
+ */
+ static RemoteQueryExecType
+ExecUtilityFindNodes(ObjectType object_type,
+ Oid object_id,
+ bool *is_temp)
+{
+ RemoteQueryExecType exec_type;
+
+ switch (object_type)
+ {
+ case OBJECT_SEQUENCE:
+ *is_temp = IsTempTable(object_id);
+ exec_type = EXEC_ON_ALL_NODES;
+ break;
+
+ case OBJECT_TABLE:
+ /* Do the check on relation kind */
+ exec_type = ExecUtilityFindNodesRelkind(object_id, is_temp);
+ break;
+
+ /*
+ * Views and rules, both permanent or temporary are created
+ * on Coordinators only.
+ */
+ case OBJECT_RULE:
+ case OBJECT_VIEW:
+ case OBJECT_MATVIEW:
+ /* Check if object is a temporary view */
+ if ((*is_temp = IsTempTable(object_id)))
+ exec_type = EXEC_ON_NONE;
+ else
+ exec_type = EXEC_ON_COORDS;
+ break;
+
+ case OBJECT_INDEX:
+ /* Check if given index uses temporary tables */
+ {
+ Relation rel;
+ bool is_matview;
+
+ rel = relation_open(object_id, NoLock);
+
+ *is_temp = (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP);
+ is_matview = (rel->rd_rel->relkind == RELKIND_MATVIEW);
+
+ relation_close(rel, NoLock);
+
+ /*
+ * Decision matrix: temp matview index -> nowhere;
+ * temp table index -> Datanodes; permanent matview
+ * index -> Coordinators; otherwise -> all nodes.
+ */
+ exec_type = EXEC_ON_NONE;
+ if (*is_temp)
+ {
+ if (!is_matview)
+ exec_type = EXEC_ON_DATANODES;
+ }
+ else
+ {
+ if (!is_matview)
+ exec_type = EXEC_ON_ALL_NODES;
+ else
+ exec_type = EXEC_ON_COORDS;
+ }
+ }
+ break;
+
+ default:
+ /* All other object types are assumed permanent and cluster-wide */
+ *is_temp = false;
+ exec_type = EXEC_ON_ALL_NODES;
+ break;
+ }
+
+ return exec_type;
+}
+
+/*
+ * ExecUtilityFindNodesRelkind
+ *
+ * Determine the node execution type and, via *is_temp, the temporary
+ * status of the given relation, based on its relkind.
+ */
+static RemoteQueryExecType
+ExecUtilityFindNodesRelkind(Oid relid, bool *is_temp)
+{
+ char relkind_str = get_rel_relkind(relid);
+ RemoteQueryExecType exec_type;
+
+ switch (relkind_str)
+ {
+ case RELKIND_SEQUENCE:
+ case RELKIND_RELATION:
+ /* Local-only temp objects run nowhere; shared temp on Datanodes */
+ if ((*is_temp = IsTempTable(relid)))
+ {
+ if (IsLocalTempTable(relid))
+ exec_type = EXEC_ON_NONE;
+ else
+ exec_type = EXEC_ON_DATANODES;
+ }
+ else
+ exec_type = EXEC_ON_ALL_NODES;
+ break;
+
+ case RELKIND_INDEX:
+ /* An index executes wherever its parent table does */
+ {
+ HeapTuple tuple;
+ Oid table_relid = InvalidOid;
+
+ tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(relid));
+ if (HeapTupleIsValid(tuple))
+ {
+ Form_pg_index index = (Form_pg_index) GETSTRUCT(tuple);
+ table_relid = index->indrelid;
+
+ /* Release system cache BEFORE looking at the parent table */
+ ReleaseSysCache(tuple);
+ return ExecUtilityFindNodesRelkind(table_relid, is_temp);
+ }
+ else
+ {
+ /* No pg_index entry found: nothing to execute remotely */
+ exec_type = EXEC_ON_NONE;
+ *is_temp = false;
+ }
+ }
+ break;
+
+ case RELKIND_VIEW:
+ if ((*is_temp = IsTempTable(relid)))
+ exec_type = EXEC_ON_NONE;
+ else
+ exec_type = EXEC_ON_COORDS;
+ break;
+
+ case RELKIND_MATVIEW:
+ /* Check if object is a temporary view */
+ if ((*is_temp = IsTempTable(relid)))
+ exec_type = EXEC_ON_NONE;
+ else
+ exec_type = EXEC_ON_COORDS;
+ break;
+
+ default:
+ *is_temp = false;
+ exec_type = EXEC_ON_ALL_NODES;
+ break;
+ }
+
+ return exec_type;
+}
+#endif
+
+#ifdef PGXC
+/*
+ * IsStmtAllowedInLockedMode
+ *
+ * Allow/disallow a utility command while the cluster is locked for backup.
+ * A statement is disallowed if it makes changes to catalogs that are
+ * backed up by pg_dump.  CREATE NODE is an exception: it must be allowed
+ * because a new node has to be created while the cluster is still
+ * locked for backup.  (queryString is currently unused here.)
+ */
+static bool
+IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString)
+{
+#define ALLOW 1
+#define DISALLOW 0
+
+ switch (nodeTag(parsetree))
+ {
+ /* To allow creation of temp tables */
+ case T_CreateStmt: /* CREATE TABLE */
+ {
+ CreateStmt *stmt = (CreateStmt *) parsetree;
+ if (stmt->relation->relpersistence == RELPERSISTENCE_TEMP)
+ return ALLOW;
+ return DISALLOW;
+ }
+ break;
+
+ case T_ExecuteStmt: /*
+ * Prepared statements can only have
+ * SELECT, INSERT, UPDATE, DELETE,
+ * or VALUES statement, there is no
+ * point stopping EXECUTE.
+ */
+ case T_CreateNodeStmt: /*
+ * This has to be allowed so that the new node
+ * can be created, while the cluster is still
+ * locked for backup
+ */
+ case T_DropNodeStmt: /*
+ * This has to be allowed so that DROP NODE
+ * can be issued to drop a node that has crashed.
+ * Otherwise system would try to acquire a shared
+ * advisory lock on the crashed node.
+ */
+
+ case T_AlterNodeStmt: /*
+ * This has to be allowed so that ALTER NODE
+ * can be issued in case of a datanode or
+ * coordinator failover.
+ */
+ case T_TransactionStmt:
+ case T_PlannedStmt:
+ case T_ClosePortalStmt:
+ case T_FetchStmt:
+ case T_TruncateStmt:
+ case T_CopyStmt:
+ case T_PrepareStmt: /*
+ * Prepared statements can only have
+ * SELECT, INSERT, UPDATE, DELETE,
+ * or VALUES statement, there is no
+ * point stopping PREPARE.
+ */
+ case T_DeallocateStmt: /*
+ * If prepare is allowed the deallocate should
+ * be allowed also
+ */
+ case T_DoStmt:
+ case T_NotifyStmt:
+ case T_ListenStmt:
+ case T_UnlistenStmt:
+ case T_LoadStmt:
+ case T_ClusterStmt:
+ case T_VacuumStmt:
+ case T_ExplainStmt:
+ case T_VariableSetStmt:
+ case T_VariableShowStmt:
+ case T_DiscardStmt:
+ case T_LockStmt:
+ case T_ConstraintsSetStmt:
+ case T_CheckPointStmt:
+ case T_BarrierStmt:
+ case T_ReindexStmt:
+ case T_RemoteQuery:
+ case T_CleanConnStmt:
+#ifdef XCP
+ case T_PauseClusterStmt:
+#endif
+ return ALLOW;
+
+ default:
+ return DISALLOW;
+ }
+ /* Not reached for the cases above; keep compiler quiet */
+ return DISALLOW;
+}
+
+/*
+ * GetNodesForCommentUtility
+ * TODO Change to return the nodes to execute the utility on
+ *
+ * Return the node execution type for the object being commented on, and
+ * set *is_temp when that object is temporary.
+ * Note: This function uses portions of the code of CommentObject;
+ * even if this code is duplicated, this is done like this to facilitate
+ * merges with PostgreSQL head.
+ */
+static RemoteQueryExecType
+GetNodesForCommentUtility(CommentStmt *stmt, bool *is_temp)
+{
+ ObjectAddress address;
+ Relation relation;
+ RemoteQueryExecType exec_type = EXEC_ON_ALL_NODES; /* By default execute on all nodes */
+ Oid object_id;
+
- char *database = strVal(linitial(stmt->objname));
++ if (stmt->objtype == OBJECT_DATABASE)
+ {
- address = get_object_address(stmt->objtype, stmt->objname, stmt->objargs,
++ char *database = strVal((Value *) stmt->object);
+ if (!OidIsValid(get_database_oid(database, true)))
+ ereport(WARNING,
+ (errcode(ERRCODE_UNDEFINED_DATABASE),
+ errmsg("database \"%s\" does not exist", database)));
+ /* No clue, return the default one */
+ return exec_type;
+ }
+
- char *rulename = strVal(llast(stmt->objname));
++ address = get_object_address(stmt->objtype, stmt->object,
+ &relation, ShareUpdateExclusiveLock, false);
+ object_id = address.objectId;
+
+ /*
+ * If the object being commented is a rule, the nodes are decided by the
+ * object to which rule is applicable, so get that object's oid
+ */
+ if (stmt->objtype == OBJECT_RULE)
+ {
+ /*
+ * Use || here, not &&: with && a NULL relation would be
+ * dereferenced by OidIsValid(relation->rd_id) and crash.
+ */
+ if (!relation || !OidIsValid(relation->rd_id))
+ {
+ /* This should not happen, but prepare for the worst */
- objname, NIL,
++ char *rulename = strVal(llast(castNode(List, stmt->object)));
+ ereport(WARNING,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("cannot find relation for rule \"%s\"", rulename)));
+ object_id = InvalidOid;
+ }
+ else
+ object_id = RelationGetRelid(relation);
+ }
+
+ if (relation != NULL)
+ relation_close(relation, NoLock);
+
+ /* Commented object may not have a valid object ID, so move to default */
+ if (OidIsValid(object_id))
+ exec_type = ExecUtilityFindNodes(stmt->objtype,
+ object_id,
+ is_temp);
+ return exec_type;
+}
+
+/*
+ * GetNodesForRulesUtility
+ * Get the nodes on which to execute this RULE-related utility statement.
+ * A rule is expanded on the Coordinator itself and does not need to
+ * exist on a Datanode. In fact, if it did exist on a Datanode, there is
+ * a possibility that it would be expanded a second time.
+ * Returns EXEC_ON_NONE when the relation cannot be resolved.
+ */
+static RemoteQueryExecType
+GetNodesForRulesUtility(RangeVar *relation, bool *is_temp)
+{
+ Oid relid = RangeVarGetRelid(relation, NoLock, true);
+ RemoteQueryExecType exec_type;
+
+ /* Skip if this Oid does not exist */
+ if (!OidIsValid(relid))
+ return EXEC_ON_NONE;
+
+ /*
+ * PGXCTODO: See if it's a temporary object, do we really need
+ * to care about temporary objects here? What about the
+ * temporary objects defined inside the rule?
+ */
+ exec_type = ExecUtilityFindNodes(OBJECT_RULE, relid, is_temp);
+ return exec_type;
+}
+
+/*
+ * DropStmtPreTreatment
+ * Pre-treat a DROP statement on the local Coordinator: determine the
+ * node execution type and temporary status of the objects being
+ * dropped, before the statement is shipped to remote nodes.
+ * Returns results through *is_temp and *exec_type; no-op on Datanodes
+ * and on Coordinators reached from another Coordinator.
+ */
+static void
+DropStmtPreTreatment(DropStmt *stmt, const char *queryString, bool sentToRemote,
+ bool *is_temp, RemoteQueryExecType *exec_type)
+{
+ bool res_is_temp = false;
+ RemoteQueryExecType res_exec_type = EXEC_ON_ALL_NODES;
+
+ /* Nothing to do if not local Coordinator */
+ if (IS_PGXC_DATANODE || IsConnFromCoord())
+ return;
+
+ switch (stmt->removeType)
+ {
+ case OBJECT_TABLE:
+ case OBJECT_SEQUENCE:
+ case OBJECT_VIEW:
+ case OBJECT_INDEX:
+ case OBJECT_MATVIEW:
+ {
+ /*
+ * Check the list of objects going to be dropped.
+ * XC does not allow yet to mix drop of temporary and
+ * non-temporary objects because this involves to rewrite
+ * query to process for tables.
+ */
+ ListCell *cell;
+ bool is_first = true;
+
+ foreach(cell, stmt->objects)
+ {
+ RangeVar *rel = makeRangeVarFromNameList((List *) lfirst(cell));
+ Oid relid;
+
+ /*
+ * Do not print result at all, error is thrown
+ * after if necessary
+ */
+ relid = RangeVarGetRelid(rel, NoLock, true);
+
+ /*
+ * In case this relation ID is incorrect throw
+ * a correct DROP error.
+ */
+ if (!OidIsValid(relid) && !stmt->missing_ok)
+ DropTableThrowErrorExternal(rel,
+ stmt->removeType,
+ stmt->missing_ok);
+
+ /* In case of DROP ... IF EXISTS bypass */
+ if (!OidIsValid(relid) && stmt->missing_ok)
+ continue;
+
+ if (is_first)
+ {
+ res_exec_type = ExecUtilityFindNodes(stmt->removeType,
+ relid,
+ &res_is_temp);
+ is_first = false;
+ }
+ else
+ {
+ /* All later objects must match the first one's profile */
+ RemoteQueryExecType exec_type_loc;
+ bool is_temp_loc;
+ exec_type_loc = ExecUtilityFindNodes(stmt->removeType,
+ relid,
+ &is_temp_loc);
+ if (exec_type_loc != res_exec_type ||
+ is_temp_loc != res_is_temp)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("DROP not supported for TEMP and non-TEMP objects"),
+ errdetail("You should separate TEMP and non-TEMP objects")));
+ }
+ }
+ }
+ break;
+
+ case OBJECT_RULE:
+ {
+ /*
+ * In the case of a rule we need to find the object on
+ * which the rule is dependent and define if this rule
+ * has a dependency with a temporary object or not.
+ */
+ List *objname = linitial(stmt->objects);
+ Relation relation = NULL;
+
+ get_object_address(OBJECT_RULE,
++ objname, /* XXX PG10MERGE: check if this is ok */
+ &relation,
+ AccessExclusiveLock,
+ stmt->missing_ok);
+
+ /* Do nothing if no relation */
+ if (relation && OidIsValid(relation->rd_id))
+ res_exec_type = ExecUtilityFindNodes(OBJECT_RULE,
+ relation->rd_id,
+ &res_is_temp);
+ else
+ res_exec_type = EXEC_ON_NONE;
+
+ /* Close relation if necessary */
+ if (relation)
+ relation_close(relation, NoLock);
+ }
+ break;
+
+ default:
+ res_is_temp = false;
+ res_exec_type = EXEC_ON_ALL_NODES;
+ break;
+ }
+
+ /* Save results */
+ *is_temp = res_is_temp;
+ *exec_type = res_exec_type;
+}
+#endif
* arrayfuncs.c
* Support functions for arrays.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* date.c
* implements DATE and TIME data types specified in SQL standard
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994-5, Regents of the University of California
*
*
* dbsize.c
* Database object size functions, and related inquiries
*
- * Copyright (c) 2002-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Copyright (c) 2002-2017, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/utils/adt/dbsize.c
#include "access/htup_details.h"
#include "catalog/catalog.h"
#include "catalog/namespace.h"
+ #include "catalog/pg_authid.h"
+#include "catalog/pg_namespace.h"
#include "catalog/pg_tablespace.h"
#include "commands/dbcommands.h"
#include "commands/tablespace.h"
strcmp(direntry->d_name, "..") == 0)
continue;
- snprintf(pathname, MAXPGPATH, "pg_tblspc/%s/%s_%s/%u",
+#ifdef PGXC
+ /* Postgres-XC tablespaces include node name in path */
- snprintf(pathname, MAXPGPATH, "pg_tblspc/%s/%s/%u",
++ snprintf(pathname, sizeof(pathname), "pg_tblspc/%s/%s_%s/%u",
+ direntry->d_name, TABLESPACE_VERSION_DIRECTORY, PGXCNodeName, dbOid);
+#else
+ snprintf(pathname, sizeof(pathname), "pg_tblspc/%s/%s/%u",
direntry->d_name, TABLESPACE_VERSION_DIRECTORY, dbOid);
+#endif
totalsize += db_dir_size(pathname);
}
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/timestamp.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#endif
- #define atooid(x) ((Oid) strtoul((x), NULL, 10))
-
/*
* Common subroutine for num_nulls() and num_nonnulls().
* we do better?)
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
Datum
anyarray_in(PG_FUNCTION_ARGS)
{
+#ifdef XCP
+ /*
+ * XCP version of array_in() understands prefix describing element type
+ * so this function can simply delegate to it.
+ */
+ return array_in(fcinfo);
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("cannot accept a value of type anyarray")));
+ errmsg("cannot accept a value of type %s", "anyarray")));
PG_RETURN_VOID(); /* keep compiler quiet */
+#endif
}
/*
* plan --- consider improving this someday.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
*
* src/backend/utils/adt/ri_triggers.c
*
* Functions to convert stored expressions/querytrees back to
* source text
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
simple_quote_literal(buf, stmt->payload);
}
}
- Type type;
+#ifdef PGXC
+ else if (query->utilityStmt && IsA(query->utilityStmt, CreateStmt))
+ {
+ CreateStmt *stmt = (CreateStmt *) query->utilityStmt;
+ ListCell *column;
+ const char *delimiter = "";
+ RangeVar *relation = stmt->relation;
+ bool istemp = (relation->relpersistence == RELPERSISTENCE_TEMP);
+ bool isunlogged = (relation->relpersistence == RELPERSISTENCE_UNLOGGED);
+
+ appendStringInfo(buf, "CREATE %s %s %s TABLE %s ",
+ stmt->islocal ? "LOCAL" : "",
+ istemp ? "TEMP" : "",
+ isunlogged ? "UNLOGGED" : "",
+ stmt->if_not_exists ? "IF NOT EXISTS " : "");
+
+ if (!istemp && relation->schemaname && relation->schemaname[0])
+ appendStringInfo(buf, "%s.", quote_identifier(relation->schemaname));
+ appendStringInfo(buf, "%s", quote_identifier(relation->relname));
+
+ appendStringInfo(buf, "(");
+ foreach(column, stmt->tableElts)
+ {
+ Node *node = (Node *) lfirst(column);
+
+ appendStringInfo(buf, "%s", delimiter);
+ delimiter = ", ";
+
+ if (IsA(node, ColumnDef))
+ {
+ ColumnDef *coldef = (ColumnDef *) node;
+ TypeName *typename = coldef->typeName;
+#ifdef XCP
+ appendStringInfo(buf, "%s %s",
+ quote_identifier(coldef->colname),
+ format_type_with_typemod(typename->typeOid,
+ typename->typemod));
+#else
+
+ /* error out if we have no recourse at all */
+ if (!OidIsValid(typename->typeOid))
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("improper type oid: \"%u\"", typename->typeOid)));
+
+ /* get typename from the oid */
+ type = typeidType(typename->typeOid);
+
+ if (!HeapTupleIsValid(type))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("type \"%u\" does not exist",
+ typename->typeOid)));
+ appendStringInfo(buf, "%s %s", quote_identifier(coldef->colname),
+ typeTypeName(type));
+ ReleaseSysCache(type);
+#endif
+ }
+ else
+ elog(ERROR, "Invalid table column definition.");
+ }
+ appendStringInfo(buf, ")");
+
+ /* Append storage parameters, like for instance WITH (OIDS) */
+ if (list_length(stmt->options) > 0)
+ {
+ Datum reloptions;
+ static char *validnsps[] = HEAP_RELOPT_NAMESPACES;
+
+ reloptions = transformRelOptions((Datum) 0, stmt->options, NULL, validnsps,
+ false, false);
+
+ if (reloptions)
+ {
+ Datum sep, txt;
+ /* Below is inspired from flatten_reloptions() */
+ sep = CStringGetTextDatum(", ");
+ txt = OidFunctionCall2(F_ARRAY_TO_TEXT, reloptions, sep);
+ appendStringInfo(buf, " WITH (%s)", TextDatumGetCString(txt));
+ }
+ }
+
+ /* add the on commit clauses for temporary tables */
+ switch (stmt->oncommit)
+ {
+ case ONCOMMIT_NOOP:
+ /* do nothing */
+ break;
+
+ case ONCOMMIT_PRESERVE_ROWS:
+ appendStringInfo(buf, " ON COMMIT PRESERVE ROWS");
+ break;
+
+ case ONCOMMIT_DELETE_ROWS:
+ appendStringInfo(buf, " ON COMMIT DELETE ROWS");
+ break;
+
+ case ONCOMMIT_DROP:
+ appendStringInfo(buf, " ON COMMIT DROP");
+ break;
+ }
+
+ if (stmt->distributeby)
+ {
+ /* add the distribution clause, if any */
+ switch (stmt->distributeby->disttype)
+ {
+ case DISTTYPE_REPLICATION:
+ appendStringInfo(buf, " DISTRIBUTE BY REPLICATION");
+ break;
+
+ case DISTTYPE_HASH:
+ appendStringInfo(buf, " DISTRIBUTE BY HASH(%s)", stmt->distributeby->colname);
+ break;
+
+ case DISTTYPE_ROUNDROBIN:
+ appendStringInfo(buf, " DISTRIBUTE BY ROUNDROBIN");
+ break;
+
+ case DISTTYPE_MODULO:
+ appendStringInfo(buf, " DISTRIBUTE BY MODULO(%s)",
+ quote_identifier(stmt->distributeby->colname));
+ break;
+
+ default:
+ ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Invalid distribution type")));
+
+ }
+ }
+
+ if (stmt->subcluster)
+ {
+ ListCell *cell;
+
+ switch (stmt->subcluster->clustertype)
+ {
+ case SUBCLUSTER_NODE:
+ appendStringInfo(buf, " TO NODE (");
+
+ /* Add node members */
+ Assert(stmt->subcluster->members);
+ foreach(cell, stmt->subcluster->members)
+ {
+ appendStringInfo(buf, " %s",
+ quote_identifier(strVal(lfirst(cell))));
+ if (cell->next)
+ appendStringInfo(buf, ",");
+ }
+ appendStringInfo(buf, ")");
+ break;
+
+ case SUBCLUSTER_GROUP:
+ appendStringInfo(buf, " TO GROUP");
+
+ /* Add group members */
+ Assert(stmt->subcluster->members);
+ foreach(cell, stmt->subcluster->members)
+ {
+ appendStringInfo(buf, " %s",
+ quote_identifier(strVal(lfirst(cell))));
+ if (cell->next)
+ appendStringInfo(buf, ",");
+ }
+ break;
+
+ case SUBCLUSTER_NONE:
+ default:
+ /* Nothing to do */
+ break;
+ }
+ }
+ }
+#endif
else
{
/* Currently only NOTIFY utility commands can appear in rules */
* version.c
* Returns the PostgreSQL version string
*
- * Copyright (c) 1998-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Copyright (c) 1998-2017, PostgreSQL Global Development Group
*
* IDENTIFICATION
*
* problems can be overcome cheaply.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* lsyscache.c
* Convenience routines for common queries in the system catalog cache.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
return typid;
}
+#ifdef PGXC
+/*
+ * get_typename
+ * Get type name for given type ID
+ *
+ * Returns a palloc'd copy of pg_type.typname; raises an error if the
+ * type OID has no syscache entry.
+ */
+char *
+get_typename(Oid typid)
+{
+ HeapTuple tuple;
+ Form_pg_type typeForm;
+ char *result;
+
+ tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for type %u", typid);
+
+ typeForm = (Form_pg_type) GETSTRUCT(tuple);
+ result = pstrdup(NameStr(typeForm->typname));
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+/*
+ * get_pgxc_nodeoid
+ * Obtain PGXC Node Oid for given node name
+ * Return Invalid Oid if object does not exist
+ *
+ * Simple probe of the PGXCNODENAME syscache; no error on a miss.
+ */
+Oid
+get_pgxc_nodeoid(const char *nodename)
+{
+ return GetSysCacheOid1(PGXCNODENAME,
+ PointerGetDatum(nodename));
+}
+
+/*
+ * get_pgxc_nodename
+ * Get node name for given Oid
+ *
+ * Returns a palloc'd copy of pgxc_node.node_name; errors out if no
+ * such node exists.
+ */
+char *
+get_pgxc_nodename(Oid nodeid)
+{
+ HeapTuple tuple;
+ Form_pgxc_node nodeForm;
+ char *result;
+
+ tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for node %u", nodeid);
+
+ nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+ result = pstrdup(NameStr(nodeForm->node_name));
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+ /*
+ * get_pgxc_node_id
+ * Get node identifier for a given Oid
+ *
+ * Returns 0 when nodeid is InvalidOid; errors out for an unknown node.
+ */
+uint32
+get_pgxc_node_id(Oid nodeid)
+{
+ HeapTuple tuple;
+ Form_pgxc_node nodeForm;
+ uint32 result;
+
+ if (nodeid == InvalidOid)
+ return 0;
+
+ tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for node %u", nodeid);
+
+ nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+ result = nodeForm->node_id;
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+/*
+ * get_pgxc_nodetype
+ * Get node type for given Oid
+ *
+ * Returns the pgxc_node.node_type character; errors out if the node
+ * does not exist.
+ */
+char
+get_pgxc_nodetype(Oid nodeid)
+{
+ HeapTuple tuple;
+ Form_pgxc_node nodeForm;
+ char result;
+
+ tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for node %u", nodeid);
+
+ nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+ result = nodeForm->node_type;
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+/*
+ * get_pgxc_nodeport
+ * Get node port for given Oid
+ *
+ * Errors out if the node does not exist.
+ */
+int
+get_pgxc_nodeport(Oid nodeid)
+{
+ HeapTuple tuple;
+ Form_pgxc_node nodeForm;
+ int result;
+
+ tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for node %u", nodeid);
+
+ nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+ result = nodeForm->node_port;
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+/*
+ * get_pgxc_nodehost
+ * Get node host for given Oid
+ *
+ * Returns a palloc'd copy of pgxc_node.node_host; errors out if the
+ * node does not exist.
+ */
+char *
+get_pgxc_nodehost(Oid nodeid)
+{
+ HeapTuple tuple;
+ Form_pgxc_node nodeForm;
+ char *result;
+
+ tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for node %u", nodeid);
+
+ nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+ result = pstrdup(NameStr(nodeForm->node_host));
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+/*
+ * is_pgxc_nodepreferred
+ * Determine if node is a preferred one
+ *
+ * Returns the pgxc_node.nodeis_preferred flag; errors out if the node
+ * does not exist.
+ */
+bool
+is_pgxc_nodepreferred(Oid nodeid)
+{
+ HeapTuple tuple;
+ Form_pgxc_node nodeForm;
+ bool result;
+
+ tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for node %u", nodeid);
+
+ nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+ result = nodeForm->nodeis_preferred;
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+/*
+ * is_pgxc_nodeprimary
+ * Determine if node is a primary one
+ *
+ * Returns the pgxc_node.nodeis_primary flag; errors out if the node
+ * does not exist.
+ */
+bool
+is_pgxc_nodeprimary(Oid nodeid)
+{
+ HeapTuple tuple;
+ Form_pgxc_node nodeForm;
+ bool result;
+
+ tuple = SearchSysCache1(PGXCNODEOID, ObjectIdGetDatum(nodeid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for node %u", nodeid);
+
+ nodeForm = (Form_pgxc_node) GETSTRUCT(tuple);
+ result = nodeForm->nodeis_primary;
+ ReleaseSysCache(tuple);
+
+ return result;
+}
+
+/*
+ * get_pgxc_groupoid
+ * Obtain PGXC Group Oid for given group name
+ * Return Invalid Oid if group does not exist
+ *
+ * Simple probe of the PGXCGROUPNAME syscache; no error on a miss.
+ */
+Oid
+get_pgxc_groupoid(const char *groupname)
+{
+ return GetSysCacheOid1(PGXCGROUPNAME,
+ PointerGetDatum(groupname));
+}
+
+/*
+ * get_pgxc_groupmembers
+ * Obtain PGXC Group members for given group Oid
+ * Return number of members and their list
+ *
+ * Member list is returned as a palloc'd array
+ * (caller is responsible for freeing it).  Errors out if the group
+ * does not exist.
+ */
+int
+get_pgxc_groupmembers(Oid groupid, Oid **members)
+{
+ HeapTuple tuple;
+ Form_pgxc_group groupForm;
+ int nmembers;
+
+ tuple = SearchSysCache1(PGXCGROUPOID, ObjectIdGetDatum(groupid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for group %u", groupid);
+
+ groupForm = (Form_pgxc_group) GETSTRUCT(tuple);
+ nmembers = (int) groupForm->group_members.dim1;
+ *members = (Oid *) palloc(nmembers * sizeof(Oid));
+ memcpy(*members, groupForm->group_members.values, nmembers * sizeof(Oid));
+
+ ReleaseSysCache(tuple);
+ return nmembers;
+}
+
++/*
++ * get_pgxc_groupname
++ * Get group name for given group Oid
++ *
++ * Returns a palloc'd copy of pgxc_group.group_name; errors out if the
++ * group does not exist.
++ */
++char *
++get_pgxc_groupname(Oid groupid)
++{
++ HeapTuple tuple;
++ Form_pgxc_group groupForm;
++ char *result;
++
++ tuple = SearchSysCache1(PGXCGROUPOID,
++ ObjectIdGetDatum(groupid));
++
++ if (!HeapTupleIsValid(tuple))
++ elog(ERROR, "cache lookup failed for group %u", groupid);
++
++ groupForm = (Form_pgxc_group) GETSTRUCT(tuple);
++ result = pstrdup(NameStr(groupForm->group_name));
++ ReleaseSysCache(tuple);
++ return result;
++}
+/*
+ * get_pgxc_classnodes
+ * Obtain PGXC class Datanode list for given relation Oid
+ * Return number of Datanodes and their list
+ *
+ * Node list is returned as a palloc'd array
+ * (caller is responsible for freeing it).  Errors out if the relation
+ * has no pgxc_class entry.
+ */
+int
+get_pgxc_classnodes(Oid tableid, Oid **nodes)
+{
+ HeapTuple tuple;
+ Form_pgxc_class classForm;
+ int numnodes;
+
+ tuple = SearchSysCache1(PGXCCLASSRELID, ObjectIdGetDatum(tableid));
+
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for relation %u", tableid);
+
+ classForm = (Form_pgxc_class) GETSTRUCT(tuple);
+ numnodes = (int) classForm->nodeoids.dim1;
+ *nodes = (Oid *) palloc(numnodes * sizeof(Oid));
+ memcpy(*nodes, classForm->nodeoids.values, numnodes * sizeof(Oid));
+
+ ReleaseSysCache(tuple);
+ return numnodes;
+}
+#endif
+
/*
* get_typavgwidth
*
* be infrequent enough that more-detailed tracking is not worth the effort.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* commandTag: compile-time-constant tag for query, or NULL if empty query
*/
CachedPlanSource *
- CreateCachedPlan(Node *raw_parse_tree,
+ CreateCachedPlan(RawStmt *raw_parse_tree,
const char *query_string,
+#ifdef PGXC
+ const char *stmt_name,
+#endif
const char *commandTag)
{
CachedPlanSource *plansource;
switch (ChoosePortalStrategy(stmt_list))
{
+#ifdef XCP
+ case PORTAL_DISTRIBUTED:
+#endif
case PORTAL_ONE_SELECT:
case PORTAL_ONE_MOD_WITH:
- query = (Query *) linitial(stmt_list);
- Assert(IsA(query, Query));
+ query = linitial_node(Query, stmt_list);
return ExecCleanTypeFromTL(query->targetList, false);
case PORTAL_ONE_RETURNING:
* relcache.c
* POSTGRES relation descriptor cache code
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
* syscache.c
* System cache management routines
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
},
8
},
+#ifdef PGXC
+ {PgxcClassRelationId, /* PGXCCLASSRELID */
+ PgxcClassPgxcRelIdIndexId,
+ 1,
+ {
+ Anum_pgxc_class_pcrelid,
+ 0,
+ 0,
+ 0
+ },
+ 1024
+ },
+ {PgxcGroupRelationId, /* PGXCGROUPNAME */
+ PgxcGroupGroupNameIndexId,
+ 1,
+ {
+ Anum_pgxc_group_name,
+ 0,
+ 0,
+ 0
+ },
+ 256
+ },
+ {PgxcGroupRelationId, /* PGXCGROUPOID */
+ PgxcGroupOidIndexId,
+ 1,
+ {
+ ObjectIdAttributeNumber,
+ 0,
+ 0,
+ 0
+ },
+ 256
+ },
+ {PgxcNodeRelationId, /* PGXCNODENAME */
+ PgxcNodeNodeNameIndexId,
+ 1,
+ {
+ Anum_pgxc_node_name,
+ 0,
+ 0,
+ 0
+ },
+ 256
+ },
+ {PgxcNodeRelationId, /* PGXCNODEOID */
+ PgxcNodeOidIndexId,
+ 1,
+ {
+ ObjectIdAttributeNumber,
+ 0,
+ 0,
+ 0
+ },
+ 256
+ },
+ {PgxcNodeRelationId, /* PGXCNODEIDENTIFIER */
+ PgxcNodeNodeIdIndexId,
+ 1,
+ {
+ Anum_pgxc_node_id,
+ 0,
+ 0,
+ 0
+ },
+ 256
+ },
+#endif
+ {PartitionedRelationId, /* PARTRELID */
+ PartitionedRelidIndexId,
+ 1,
+ {
+ Anum_pg_partitioned_table_partrelid,
+ 0,
+ 0,
+ 0
+ },
+ 32
+ },
{ProcedureRelationId, /* PROCNAMEARGSNSP */
ProcedureNameArgsNspIndexId,
3,
#include "utils/guc.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
+#ifdef PGXC
+#include "pgxc/pgxc.h"
+#include "pgxc/execRemote.h"
+#endif
+ /* In this module, access gettext() via err_gettext() */
#undef _
#define _(x) err_gettext(x)
- static const char *err_gettext(const char *str) pg_attribute_format_arg(1);
- static void set_errdata_field(MemoryContextData *cxt, char **ptr, const char *str);
+#ifdef USE_MODULE_MSGIDS
+static void AtProcExit_MsgModule(int code, Datum arg);
+static bool pg_msgmodule_enable_disable(int32 pid, bool enable);
+#endif
/* Global variables */
ErrorContextCallback *error_context_stack = NULL;
static const char *get_errno_symbol(int errnum);
static const char *error_severity(int elevel);
static void append_with_tabs(StringInfo buf, const char *str);
-static bool is_log_level_output(int elevel, int log_min_level);
+static bool is_log_level_output(int elevel,
+#ifdef USE_MODULE_MSGIDS
+ int moduleid,
+ int fileid,
+ int msgid,
+#endif
+ int log_min_level);
- static void write_pipe_chunks(char *data, int len, int dest);
- static void write_csvlog(ErrorData *edata);
- static void setup_formatted_log_time(void);
- static void setup_formatted_start_time(void);
+#ifdef USE_MODULE_MSGIDS
+typedef struct MsgModuleCtlStruct
+{
+ bool mm_enabled;
+ bool mm_persistent;
+ char mm_flags[FLEXIBLE_ARRAY_MEMBER];
+} MsgModuleCtlStruct;
+
+#define StartOfBackendFlags \
+ ( \
+ PGXL_MSG_MAX_MODULES * \
+ PGXL_MSG_MAX_FILEIDS_PER_MODULE * \
+ PGXL_MSG_MAX_MSGIDS_PER_FILE \
+ )
+
+#define SizeOfMsgModuleCtlStruct \
+ ( \
+ offsetof(MsgModuleCtlStruct, mm_flags) + \
+ StartOfBackendFlags + \
+ MaxBackends \
+ )
+static MsgModuleCtlStruct *MsgModuleCtl;
+#endif
/*
* in_error_recursion_trouble --- are we at risk of infinite error recursion?
MemoryContext oldcontext;
if (!errstart(edata->elevel, edata->filename, edata->lineno,
+#ifdef USE_MODULE_MSGIDS
+ edata->moduleid,
+ edata->fileid, edata->msgid,
+#endif
edata->funcname, NULL))
- return;
+ return; /* error is not to be reported at all */
newedata = &errordata[errordata_stack_depth];
- oldcontext = MemoryContextSwitchTo(edata->assoc_context);
+ recursion_depth++;
+ oldcontext = MemoryContextSwitchTo(newedata->assoc_context);
- /* Copy the supplied fields to the error stack. */
- if (edata->sqlerrcode > 0)
+ /* Copy the supplied fields to the error stack entry. */
+ if (edata->sqlerrcode != 0)
newedata->sqlerrcode = edata->sqlerrcode;
if (edata->message)
newedata->message = pstrdup(edata->message);
* globals.c
* global variable declarations
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* miscinit.c
* miscellaneous initialization support stuff
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "libpq/libpq.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
+#ifdef XCP
+#include "pgxc/execRemote.h"
+#endif
+ #include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "postmaster/postmaster.h"
#include "storage/fd.h"
#include "utils/builtins.h"
#include "utils/guc.h"
#include "utils/memutils.h"
+#ifdef XCP
+#include "utils/snapmgr.h"
+#endif
#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+ #include "utils/varlena.h"
#define DIRECTORY_LOCK_FILE "postmaster.pid"
* postinit.c
* postgres initialization utilities
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* just document that the connection limit is approximate.
*/
if (dbform->datconnlimit >= 0 &&
+#ifdef XCP
+ IS_PGXC_COORDINATOR &&
+#endif
!am_superuser &&
- CountDBBackends(MyDatabaseId) > dbform->datconnlimit)
+ CountDBConnections(MyDatabaseId) > dbform->datconnlimit)
ereport(FATAL,
(errcode(ERRCODE_TOO_MANY_CONNECTIONS),
errmsg("too many connections for database \"%s\"",
before_shmem_exit(ShutdownPostgres, 0);
/* The autovacuum launcher is done here */
- if (IsAutoVacuumLauncherProcess())
+ if (IsAutoVacuumLauncherProcess() || IsClusterMonitorProcess())
+ {
+ /* report this backend in the PgBackendStatus array */
+ pgstat_bestart();
+
return;
+ }
/*
* Start a new transaction here before first access to db, and get a
* See src/backend/utils/misc/README for more information.
*
*
- * Copyright (c) 2000-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Copyright (c) 2000-2017, PostgreSQL Global Development Group
*
* IDENTIFICATION
#include "access/commit_ts.h"
#include "access/gin.h"
+#ifdef PGXC
+#include "access/gtm.h"
+#include "pgxc/pgxc.h"
+#endif
+ #include "access/rmgr.h"
#include "access/transam.h"
#include "access/twophase.h"
#include "access/xact.h"
#include "parser/parser.h"
#include "parser/scansup.h"
#include "pgstat.h"
+#ifdef PGXC
+#include "commands/tablecmds.h"
+#include "commands/trigger.h"
+#include "nodes/nodes.h"
+#include "pgxc/execRemote.h"
+#include "pgxc/locator.h"
+#include "pgxc/planner.h"
+#include "pgxc/poolmgr.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/xc_maintenance_mode.h"
+#include "storage/procarray.h"
+#endif
+#ifdef XCP
+#include "commands/sequence.h"
+#include "parser/parse_utilcmd.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/squeue.h"
+#include "utils/snapmgr.h"
+#endif
#include "postmaster/autovacuum.h"
- #include "postmaster/bgworker.h"
+ #include "postmaster/bgworker_internals.h"
#include "postmaster/bgwriter.h"
#include "postmaster/postmaster.h"
#include "postmaster/syslogger.h"
true,
NULL, NULL, NULL
},
+#ifdef PGXC
+ {
+ {"enable_fast_query_shipping", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("Enables the planner's use of fast query shipping to ship query directly to datanode."),
+ NULL
+ },
+ &enable_fast_query_shipping,
+ true,
+ NULL, NULL, NULL
+ },
+ {
+ {"loose_constraints", PGC_USERSET, COORDINATORS,
+ gettext_noop("Relax enforcing of constraints"),
+ gettext_noop("If enabled then constraints like foreign keys "
+ "are not enforced. It's the users responsibility "
+ "to maintain referential integrity at the application "
+ "level")
+ },
+ &loose_constraints,
+ false,
+ NULL, NULL, NULL
+ },
+ {
+ {"gtm_backup_barrier", PGC_SUSET, QUERY_TUNING_METHOD,
+ gettext_noop("Enables coordinator to report barrier id to GTM for backup."),
+ NULL
+ },
+ >m_backup_barrier,
+ false,
+ NULL, NULL, NULL
+ },
+ {
+ {"enable_datanode_row_triggers", PGC_POSTMASTER, DEVELOPER_OPTIONS,
+ gettext_noop("Enables datanode-only ROW triggers"),
+ NULL
+ },
+ &enable_datanode_row_triggers,
+ false,
+ NULL, NULL, NULL
+ },
+#endif
+ {
+ {"enable_gathermerge", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("Enables the planner's use of gather merge plans."),
+ NULL
+ },
+ &enable_gathermerge,
+ true,
+ NULL, NULL, NULL
+ },
+
{
{"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
gettext_noop("Enables genetic query optimization."),
check_TSCurrentConfig, assign_TSCurrentConfig, NULL
},
+#ifdef PGXC
+ {
+ {"gtm_host", PGC_POSTMASTER, GTM,
+ gettext_noop("Host name or address of GTM"),
+ NULL
+ },
+ &GtmHost,
+ "localhost",
+ NULL, NULL, NULL
+ },
+
+ {
+ {"pgxc_node_name", PGC_POSTMASTER, GTM,
+ gettext_noop("The Coordinator or Datanode name."),
+ NULL,
+ GUC_NO_RESET_ALL | GUC_IS_NAME
+ },
+ &PGXCNodeName,
+ "",
+ NULL, NULL, NULL
+ },
+#endif
+#ifdef XCP
+ {
+ {"parentnode", PGC_BACKEND, CONN_AUTH,
+ gettext_noop("Sets the name of the parent data node"),
+ NULL
+ },
+ &parentPGXCNode,
+ NULL,
+ NULL, NULL, NULL
+ },
+#endif /* XCP */
{
- {"ssl_ciphers", PGC_POSTMASTER, CONN_AUTH_SECURITY,
+ {"ssl_ciphers", PGC_SIGHUP, CONN_AUTH_SECURITY,
gettext_noop("Sets the list of allowed SSL ciphers."),
NULL,
GUC_SUPERUSER_ONLY
#effective_io_concurrency = 1 # 1-1000; 0 disables prefetching
#max_worker_processes = 8 # (change requires restart)
- #max_parallel_workers_per_gather = 2 # taken from max_worker_processes
+ #max_parallel_workers_per_gather = 2 # taken from max_parallel_workers
+ #max_parallel_workers = 8 # maximum number of max_worker_processes that
+ # can be used in parallel queries
#old_snapshot_threshold = -1 # 1min-60d; -1 disables; 0 is immediate
- # (change requires restart)
- #backend_flush_after = 0 # 0 disables, default is 0
+ # (change requires restart)
+ #backend_flush_after = 0 # measured in pages, 0 disables
+# - Shared queues -
+
+#shared_queues = 64 # min 16
+#shared_queue_size = 64KB # min 16KB
#------------------------------------------------------------------------------
# WRITE AHEAD LOG
#cpu_tuple_cost = 0.01 # same scale as above
#cpu_index_tuple_cost = 0.005 # same scale as above
#cpu_operator_cost = 0.0025 # same scale as above
+#network_byte_cost = 0.001 # same scale as above
+#remote_query_cost = 100.0 # same scale as above
#parallel_tuple_cost = 0.1 # same scale as above
#parallel_setup_cost = 1000.0 # same scale as above
- #min_parallel_relation_size = 8MB
+ #min_parallel_table_scan_size = 8MB
+ #min_parallel_index_scan_size = 512kB
#effective_cache_size = 4GB
# - Genetic Query Optimizer -
return out;
}
+#ifdef PGXC
+#include "gen_alloc.h"
+
+void *current_memcontext(void);
+
+/* Return the backend's CurrentMemoryContext as an opaque pointer. */
+void *current_memcontext()
+{
+ return((void *)CurrentMemoryContext);
+}
+
+/* Allocate s bytes in the long-lived TopMemoryContext. */
+void *allocTopCxt(size_t s)
+{
+ return MemoryContextAlloc(TopMemoryContext, (Size)s);
+}
+
+/* Allocator callback table for users of gen_alloc.h. */
+Gen_Alloc genAlloc_class = {(void *)MemoryContextAlloc,
+ (void *)MemoryContextAllocZero,
+ (void *)repalloc,
+ (void *)pfree,
+ (void *)current_memcontext,
+ (void *)allocTopCxt};
+
+#endif
++
+ /*
+ * Make copy of string with all trailing newline characters removed.
+ *
+ * Returns a pnstrdup'd (palloc'd) copy; the input string is not modified.
+ */
+ char *
+ pchomp(const char *in)
+ {
+ size_t n;
+
+ n = strlen(in);
+ while (n > 0 && in[n - 1] == '\n')
+ n--;
+ return pnstrdup(in, n);
+ }
* doesn't actually run the executor for them.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
PrintFileLeakWarning(res);
FileClose(res);
}
- /* Clean up index scans too */
- ReleaseResources_hash();
+
+ /* Ditto for prepared statements */
+ while (ResourceArrayGetAny(&(owner->prepstmts), &foundres))
+ {
+ char *stmt = (char *) DatumGetPointer(foundres);
+
+ if (isCommit)
+ PrintPreparedStmtLeakWarning(stmt);
+ DropPreparedStatement(stmt, false);
+ }
+
}
/* Let add-on modules get a chance too */
* code we determine the number of tapes M on the basis of workMem: we want
* workMem/M to be large enough that we read a fair amount of data each time
* we preread from a tape, so as to maintain the locality of access described
- * above. Nonetheless, with large workMem we can have many tapes.
+ * above. Nonetheless, with large workMem we can have many tapes (but not
+ * too many -- see the comments in tuplesort_merge_order).
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
void (*readtup) (Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
- /*
- * Function to move a caller tuple. This is usually implemented as a
- * memmove() shim, but function may also perform additional fix-up of
- * caller tuple where needed. Batch memory support requires the movement
- * of caller tuples from one location in memory to another.
- */
- void (*movetup) (void *dest, void *src, unsigned int len);
-
+#ifdef PGXC
+ /*
+ * Function to read length of next stored tuple.
+ * Used as 'len' parameter for readtup function.
+ */
+ unsigned int (*getlen) (Tuplesortstate *state, int tapenum, bool eofOK);
+#endif
+
/*
* This array holds the tuples now in sort memory. If we are in state
* INITIAL, the tuples are in no particular order; if we are in state
#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup))
#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
- #define MOVETUP(dest,src,len) ((*(state)->movetup) (dest, src, len))
- #define LACKMEM(state) ((state)->availMem < 0 && !(state)->batchUsed)
+#ifdef PGXC
+#define GETLEN(state,tape,eofOK) ((*(state)->getlen) (state, tape, eofOK))
+#endif
+ #define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed)
#define USEMEM(state,amt) ((state)->availMem -= (amt))
#define FREEMEM(state,amt) ((state)->availMem += (amt))
SortTuple *stup);
static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
- static void movetup_heap(void *dest, void *src, unsigned int len);
+#ifdef PGXC
+static unsigned int getlen_datanode(Tuplesortstate *state, int tapenum,
+ bool eofOK);
+static void readtup_datanode(Tuplesortstate *state, SortTuple *stup,
+ int tapenum, unsigned int len);
+#endif
static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
state->copytup = copytup_heap;
state->writetup = writetup_heap;
state->readtup = readtup_heap;
- state->movetup = movetup_heap;
+#ifdef PGXC
+ state->getlen = getlen;
+#endif
state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
state->abbrevNext = 10;
state->copytup = copytup_cluster;
state->writetup = writetup_cluster;
state->readtup = readtup_cluster;
- state->movetup = movetup_cluster;
+#ifdef PGXC
+ state->getlen = getlen;
+#endif
state->abbrevNext = 10;
state->indexInfo = BuildIndexInfo(indexRel);
state->copytup = copytup_index;
state->writetup = writetup_index;
state->readtup = readtup_index;
- state->movetup = movetup_index;
+#ifdef PGXC
+ state->getlen = getlen;
+#endif
state->abbrevNext = 10;
state->heapRel = heapRel;
state->copytup = copytup_index;
state->writetup = writetup_index;
state->readtup = readtup_index;
- state->movetup = movetup_index;
+#ifdef PGXC
+ state->getlen = getlen;
+#endif
state->heapRel = heapRel;
state->indexRel = indexRel;
state->copytup = copytup_datum;
state->writetup = writetup_datum;
state->readtup = readtup_datum;
- state->movetup = movetup_datum;
+#ifdef PGXC
+ state->getlen = getlen;
+#endif
state->abbrevNext = 10;
state->datumType = datumType;
return state;
}
- state->batchUsed = false;
+#ifdef PGXC
+/*
+ * tuplesort_begin_merge
+ *
+ * Tuples come from sources where they are already sorted, so this is much
+ * like sorting heap tuples except there is no need to load the sorter:
+ * the sorter starts directly in the final-merge state, and the caller-
+ * appropriate readtup and getlen callbacks are installed (here the
+ * Datanode-stream implementations).
+ *
+ * Usage pattern of the merge sorter:
+ * tuplesort_begin_merge
+ * while (tuple = tuplesort_gettuple())
+ * {
+ * // process
+ * }
+ * tuplesort_end_merge
+ */
+Tuplesortstate *
+tuplesort_begin_merge(TupleDesc tupDesc,
+ int nkeys, AttrNumber *attNums,
+ Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags,
+ ResponseCombiner *combiner,
+ int workMem)
+{
+ Tuplesortstate *state = tuplesort_begin_common(workMem, false);
+ MemoryContext oldcontext;
+ int i;
+
+ oldcontext = MemoryContextSwitchTo(state->sortcontext);
+
+ AssertArg(nkeys > 0);
+ AssertArg(combiner);
+
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG,
+ "begin merge sort: nkeys = %d, workMem = %d", nkeys, workMem);
+#endif
+
+ state->nKeys = nkeys;
+
+ TRACE_POSTGRESQL_SORT_START(MERGE_SORT,
+ false, /* no unique check */
+ nkeys,
+ workMem,
+ false);
+
+ /* Datanode-specific callbacks; copytup/writetup are unused (NULL) */
+ state->combiner = combiner;
+ state->comparetup = comparetup_heap;
+ state->copytup = NULL;
+ state->writetup = NULL;
+ state->readtup = readtup_datanode;
+ state->getlen = getlen_datanode;
+
+ state->tuples = false;
- /*
- * logical tape in this case is a sorted stream
- */
- state->maxTapes = combiner->conn_count;
- state->tapeRange = combiner->conn_count;
-
- state->mergeactive = (bool *) palloc0(combiner->conn_count * sizeof(bool));
- state->mergenext = (int *) palloc0(combiner->conn_count * sizeof(int));
- state->mergelast = (int *) palloc0(combiner->conn_count * sizeof(int));
- state->mergeavailslots = (int *) palloc0(combiner->conn_count * sizeof(int));
- state->mergeavailmem = (int64 *) palloc0(combiner->conn_count * sizeof(int64));
-
- state->mergetuples = (char **) palloc0(combiner->conn_count * sizeof(char *));
- state->mergecurrent = (char **) palloc0(combiner->conn_count * sizeof(char *));
- state->mergetail = (char **) palloc0(combiner->conn_count * sizeof(char *));
- state->mergeoverflow = (char **) palloc0(combiner->conn_count * sizeof(char *));
-
+
+ state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
+ state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
+
+ for (i = 0; i < nkeys; i++)
+ {
+ SortSupport sortKey = state->sortKeys + i;
+
+ AssertArg(attNums[i] != 0);
+ AssertArg(sortOperators[i] != 0);
+
+ sortKey->ssup_cxt = CurrentMemoryContext;
+ sortKey->ssup_collation = sortCollations[i];
+ sortKey->ssup_nulls_first = nullsFirstFlags[i];
+ sortKey->ssup_attno = attNums[i];
+
+ PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
+ }
+
- beginmerge(state, state->tuples);
+ state->tp_runs = (int *) palloc0(combiner->conn_count * sizeof(int));
+ state->tp_dummy = (int *) palloc0(combiner->conn_count * sizeof(int));
+ state->tp_tapenum = (int *) palloc0(combiner->conn_count * sizeof(int));
+ /* mark each stream (tape) has one run */
+ for (i = 0; i < combiner->conn_count; i++)
+ {
+ state->tp_runs[i] = 1;
+ state->tp_tapenum[i] = i;
+ }
++ beginmerge(state);
+ /* go straight to the final merge: each input stream is one sorted run */
+ state->status = TSS_FINALMERGE;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return state;
+}
+#endif
+
/*
* tuplesort_set_bound
*
}
/*
- * mergeprereadone - load tuples from one merge input tape
+ * mergereadnext - read next tuple from one merge input tape
*
- * Read tuples from the specified tape until it has used up its free memory
- * or array slots; but ensure that we have at least one tuple, if any are
- * to be had.
+ * Returns false at EOF, or if the tape's run is already exhausted; on
+ * success the tuple is stored into *stup and true is returned.
*/
- static void
- mergeprereadone(Tuplesortstate *state, int srcTape)
+ static bool
+ mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup)
{
unsigned int tuplen;
- SortTuple stup;
- int tupIndex;
- int64 priorAvail,
- spaceUsed;
if (!state->mergeactive[srcTape])
- return; /* tape's run is already exhausted */
-
- /*
- * Manage per-tape availMem. Only actually matters when batch memory not
- * in use.
- */
- priorAvail = state->availMem;
- state->availMem = state->mergeavailmem[srcTape];
+ return false; /* tape's run is already exhausted */
- /*
- * When batch memory is used if final on-the-fly merge, only mergeoverflow
- * test is relevant; otherwise, only LACKMEM() test is relevant.
- */
- while ((state->mergeavailslots[srcTape] > 0 &&
- state->mergeoverflow[srcTape] == NULL && !LACKMEM(state)) ||
- state->mergenext[srcTape] == 0)
- {
- /* read next tuple, if any */
+ /* read next tuple, if any */
+#ifdef PGXC
+ if ((tuplen = GETLEN(state, srcTape, true)) == 0)
+#else
- if ((tuplen = getlen(state, srcTape, true)) == 0)
+ if ((tuplen = getlen(state, srcTape, true)) == 0)
+#endif
- {
- state->mergeactive[srcTape] = false;
- break;
- }
- READTUP(state, &stup, srcTape, tuplen);
- /* find a free slot in memtuples[] for it */
- tupIndex = state->mergefreelist;
- if (tupIndex)
- state->mergefreelist = state->memtuples[tupIndex].tupindex;
- else
- {
- tupIndex = state->mergefirstfree++;
- Assert(tupIndex < state->memtupsize);
- }
- state->mergeavailslots[srcTape]--;
- /* store tuple, append to list for its tape */
- stup.tupindex = 0;
- state->memtuples[tupIndex] = stup;
- if (state->mergelast[srcTape])
- state->memtuples[state->mergelast[srcTape]].tupindex = tupIndex;
- else
- state->mergenext[srcTape] = tupIndex;
- state->mergelast[srcTape] = tupIndex;
+ {
+ state->mergeactive[srcTape] = false;
+ return false;
}
- /* update per-tape and global availmem counts */
- spaceUsed = state->mergeavailmem[srcTape] - state->availMem;
- state->mergeavailmem[srcTape] = state->availMem;
- state->availMem = priorAvail - spaceUsed;
+ READTUP(state, stup, srcTape, tuplen);
+
+ return true;
}
/*
&stup->isnull1);
}
- static void
- movetup_heap(void *dest, void *src, unsigned int len)
- {
- memmove(dest, src, len);
- }
-
+#ifdef PGXC
+/*
+ * getlen_datanode - "getlen" callback for merging pre-sorted Datanode streams.
+ *
+ * Fetches the next tuple from the connection identified by tapenum into the
+ * combiner's result slot.  Returns 0 at end of stream (only if eofOK),
+ * otherwise a nonzero dummy length: the fetched tuple lives in the slot,
+ * not on a tape, so readtup_datanode does not use the length.
+ */
+static unsigned int
+getlen_datanode(Tuplesortstate *state, int tapenum, bool eofOK)
+{
+ ResponseCombiner *combiner = state->combiner;
+ TupleTableSlot *dstslot = combiner->ss.ps.ps_ResultTupleSlot;
+ TupleTableSlot *slot;
+
+ /* direct the combiner at the stream ("tape") we are reading */
+ combiner->current_conn = tapenum;
+ slot = FetchTuple(combiner);
+ if (TupIsNull(slot))
+ {
+ if (eofOK)
+ return 0;
+ else
+ elog(ERROR, "unexpected end of data");
+ }
+
+ /* make sure the tuple ends up in the well-known result slot */
+ if (slot != dstslot)
+ ExecCopySlot(dstslot, slot);
+
+ return 1;
+}
+
+/*
+ * readtup_datanode - "readtup" callback paired with getlen_datanode.
+ *
+ * The tuple was already fetched into the combiner's result slot by
+ * getlen_datanode, so tapenum and len are unused here; we just copy the
+ * slot contents into sort-owned memory and set up the first sort key.
+ */
+static void
+readtup_datanode(Tuplesortstate *state, SortTuple *stup,
+ int tapenum, unsigned int len)
+{
+ TupleTableSlot *slot = state->combiner->ss.ps.ps_ResultTupleSlot;
+ MinimalTuple tuple;
+ HeapTupleData htup;
+
+ Assert(!TupIsNull(slot));
+
+ /* copy the tuple into sort storage */
+ tuple = ExecCopySlotMinimalTuple(slot);
+ stup->tuple = (void *) tuple;
+ USEMEM(state, GetMemoryChunkSpace(tuple));
+ /* set up first-column key value */
+ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
+ htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
+ stup->datum1 = heap_getattr(&htup,
+ state->sortKeys[0].ssup_attno,
+ state->tupDesc,
+ &stup->isnull1);
+}
+#endif /* PGXC */
+
/*
* Routines specialized for the CLUSTER case (HeapTuple data, with
* comparisons per a btree index definition)
* before switching to the other state or activating a different read pointer.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
bool truncated; /* tuplestore_trim has removed tuples? */
int64 availMem; /* remaining memory available, in bytes */
int64 allowedMem; /* total memory allowed, in bytes */
+ int64 tuples; /* number of tuples added */
BufFile *myfile; /* underlying file, or NULL if none */
MemoryContext context; /* memory context for holding tuples */
+#ifdef XCP
+ MemoryContext tmpcxt; /* memory context for holding temporary data */
+#endif
ResourceOwner resowner; /* resowner for holding temp files */
/*
int i;
ResourceOwner oldowner;
+ if (state->stat_name)
+ state->stat_write_count++;
+ state->tuples++;
switch (state->status)
{
* destroyed at the end of each transaction.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* transaction).
*
* These arrangements let us reset MyPgXact->xmin when there are no snapshots
- * referenced by this transaction. (One possible improvement would be to be
- * able to advance Xmin when the snapshot with the earliest Xmin is no longer
- * referenced. That's a bit harder though, it requires more locking, and
- * anyway it should be rather uncommon to keep temporary snapshots referenced
- * for too long.)
+ * referenced by this transaction, and advance it when the one with oldest
+ * Xmin is no longer referenced. For simplicity however, only registered
+ * snapshots not active snapshots participate in tracking which one is oldest;
+ * we don't try to change MyPgXact->xmin except when the active-snapshot
+ * stack is empty.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node);
}
else
- CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
+ CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData, false);
- /* Don't allow catalog snapshot to be older than xact snapshot. */
- CatalogSnapshotStale = true;
-
FirstSnapshotSet = true;
return CurrentSnapshot;
}
if (IsolationUsesXactSnapshot())
+ {
+#ifdef PGXC
+ /*
+ * Consider this test case taken from portals.sql
+ *
+ * CREATE TABLE cursor (a int, b int) distribute by replication;
+ * INSERT INTO cursor VALUES (10);
+ * BEGIN;
+ * SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;
+ * DECLARE c1 NO SCROLL CURSOR FOR SELECT * FROM cursor FOR UPDATE;
+ * INSERT INTO cursor VALUES (2);
+ * FETCH ALL FROM c1;
+ * would result in
+ * ERROR: attempted to lock invisible tuple
+ * because FETCH would be sent as a select to the remote nodes
+ * with command id 0, whereas the command id would be 2
+ * in the current snapshot.
+ * (1 sent by the Coordinator due to DECLARE CURSOR, and
+ * 2 because of the INSERT inside the transaction.)
+ * The command id should therefore be updated in the
+ * current snapshot.
+ */
+ if (IsConnFromCoord() || IsConnFromDatanode())
+ SnapshotSetCommandId(GetCurrentCommandId(false));
+#endif
return CurrentSnapshot;
+ }
/* Don't allow catalog snapshot to be older than xact snapshot. */
- CatalogSnapshotStale = true;
+ InvalidateCatalogSnapshot();
- CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
+ CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData, false);
return CurrentSnapshot;
}
* scan a relation for which neither catcache nor snapshot invalidations
* are sent, we must refresh the snapshot every time.
*/
- if (!CatalogSnapshotStale && !RelationInvalidatesSnapshotsOnly(relid) &&
+ if (CatalogSnapshot &&
+ !RelationInvalidatesSnapshotsOnly(relid) &&
!RelationHasSysCache(relid))
- CatalogSnapshotStale = true;
+ InvalidateCatalogSnapshot();
- if (CatalogSnapshotStale)
+ if (CatalogSnapshot == NULL)
{
/* Get new snapshot. */
- CatalogSnapshot = GetSnapshotData(&CatalogSnapshotData);
+ CatalogSnapshot = GetSnapshotData(&CatalogSnapshotData, true);
/*
- * Mark new snapshost as valid. We must do this last, in case an
- * ERROR occurs inside GetSnapshotData().
+ * Make sure the catalog snapshot will be accounted for in decisions
+ * about advancing PGXACT->xmin. We could apply RegisterSnapshot, but
+ * that would result in making a physical copy, which is overkill; and
+ * it would also create a dependency on some resource owner, which we
+ * do not want for reasons explained at the head of this file. Instead
+ * just shove the CatalogSnapshot into the pairing heap manually. This
+ * has to be reversed in InvalidateCatalogSnapshot, of course.
+ *
+ * NB: it had better be impossible for this to throw error, since the
+ * CatalogSnapshot pointer is already valid.
*/
- CatalogSnapshotStale = false;
+ pairingheap_add(&RegisteredSnapshots, &CatalogSnapshot->ph_node);
}
return CatalogSnapshot;
*
* This code is released under the terms of the PostgreSQL License.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/bin/initdb/initdb.c
static void setup_config(void);
static void bootstrap_template1(void);
static void setup_auth(FILE *cmdfd);
- static void get_set_pwd(FILE *cmdfd);
+ static void get_su_pwd(void);
static void setup_depend(FILE *cmdfd);
static void setup_sysviews(FILE *cmdfd);
+#ifdef PGXC
+static void setup_nodeself(FILE *cmdfd);
+#endif
static void setup_description(FILE *cmdfd);
static void setup_collation(FILE *cmdfd);
static void setup_conversion(FILE *cmdfd);
" SET relacl = (SELECT array_agg(a.acl) FROM "
" (SELECT E'=r/\"$POSTGRES_SUPERUSERNAME\"' as acl "
" UNION SELECT unnest(pg_catalog.acldefault("
- " CASE WHEN relkind = 'S' THEN 's' ELSE 'r' END::\"char\",10::oid))"
+ " CASE WHEN relkind = " CppAsString2(RELKIND_SEQUENCE) " THEN 's' "
+ " ELSE 'r' END::\"char\"," CppAsString2(BOOTSTRAP_SUPERUSERID) "::oid))"
" ) as a) "
- " WHERE relkind IN ('r', 'v', 'm', 'S') AND relacl IS NULL;\n\n",
+ " WHERE relkind IN (" CppAsString2(RELKIND_RELATION) ", "
+ CppAsString2(RELKIND_VIEW) ", " CppAsString2(RELKIND_MATVIEW) ", "
+ CppAsString2(RELKIND_SEQUENCE) ")"
+ " AND relacl IS NULL;\n\n",
"GRANT USAGE ON SCHEMA pg_catalog TO PUBLIC;\n\n",
"GRANT CREATE, USAGE ON SCHEMA public TO PUBLIC;\n\n",
+#ifdef XCP
+ "GRANT USAGE ON SCHEMA storm_catalog TO PUBLIC;\n",
+#endif
"REVOKE ALL ON pg_largeobject FROM PUBLIC;\n\n",
"INSERT INTO pg_init_privs "
" (objoid, classoid, objsubid, initprivs, privtype)"
{"version", no_argument, NULL, 'V'},
{"debug", no_argument, NULL, 'd'},
{"show", no_argument, NULL, 's'},
- {"noclean", no_argument, NULL, 'n'},
- {"nosync", no_argument, NULL, 'N'},
+ {"noclean", no_argument, NULL, 'n'}, /* for backwards compatibility */
+ {"no-clean", no_argument, NULL, 'n'},
+ {"nosync", no_argument, NULL, 'N'}, /* for backwards compatibility */
+ {"no-sync", no_argument, NULL, 'N'},
{"sync-only", no_argument, NULL, 'S'},
- {"xlogdir", required_argument, NULL, 'X'},
+ {"waldir", required_argument, NULL, 'X'},
{"data-checksums", no_argument, NULL, 'k'},
+#ifdef PGXC
+ {"nodename", required_argument, NULL, 12},
+#endif
{NULL, 0, NULL, 0}
};
if (authwarning != NULL)
fprintf(stderr, "%s", authwarning);
- /* Get directory specification used to start this executable */
- strlcpy(bin_dir, argv[0], sizeof(bin_dir));
- get_parent_directory(bin_dir);
+ /*
+ * Build up a shell command to tell the user how to start the server
+ */
+ start_db_cmd = createPQExpBuffer();
+
+ /* Get directory specification used to start initdb ... */
+ strlcpy(pg_ctl_path, argv[0], sizeof(pg_ctl_path));
+ canonicalize_path(pg_ctl_path);
+ get_parent_directory(pg_ctl_path);
+ /* ... and tag on pg_ctl instead */
+ join_path_components(pg_ctl_path, pg_ctl_path, "pg_ctl");
+
+ /* path to pg_ctl, properly quoted */
+ appendShellString(start_db_cmd, pg_ctl_path);
+
+ /* add -D switch, with properly quoted data directory */
+ appendPQExpBufferStr(start_db_cmd, " -D ");
+ appendShellString(start_db_cmd, pgdata_native);
+
+ /* add suggested -l switch and "start" command */
+ /* translator: This is a placeholder in a shell command. */
+ appendPQExpBuffer(start_db_cmd, " -l %s start", _("logfile"));
- printf(_("You can now start the database server of the Postgres-XL coordinator using:\n\n"
- " %s%s%spostgres%s --coordinator -D %s%s%s\n"
+
+#ifdef PGXC
+ printf(_("\nSuccess.\n"));
+ {
+ char *pgxc_ctl_silent = getenv("PGXC_CTL_SILENT");
+ if (!pgxc_ctl_silent || !strlen(pgxc_ctl_silent))
+ {
- " %s%s%spg_ctl%s start -D %s%s%s -Z coordinator -l logfile\n\n"
++ printf(_("\nSuccess. You can now start the database server of the Postgres-XL coordinator using:\n\n"
++ " %s -Z coordinator\n\n"
+ "or\n"
- " %s%s%spostgres%s --datanode -D %s%s%s\n"
- "or \n"
- " %s%s%spg_ctl%s start -D %s%s%s -Z datanode -l logfile\n\n"),
- QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
- QUOTE_PATH, pgdata_native, QUOTE_PATH,
- QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
- QUOTE_PATH, pgdata_native, QUOTE_PATH,
- QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
- QUOTE_PATH, pgdata_native, QUOTE_PATH,
- QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
- QUOTE_PATH, pgdata_native, QUOTE_PATH);
+ " You can now start the database server of the Postgres-XL datanode using:\n\n"
++ " %s -Z datanode\n\n"),
++ start_db_cmd->data,
++ start_db_cmd->data);
+ }
+ }
+#else
printf(_("\nSuccess. You can now start the database server using:\n\n"
- " %s%s%spg_ctl%s -D %s%s%s -l logfile start\n\n"),
- QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH,
- QUOTE_PATH, pgdata_native, QUOTE_PATH);
+ " %s\n\n"),
+ start_db_cmd->data);
+#endif
+ destroyPQExpBuffer(start_db_cmd);
+
return 0;
}
{
printf(_("%s is a utility to initialize, start, stop, or control a PostgreSQL server.\n\n"), progname);
printf(_("Usage:\n"));
- printf(_(" %s init[db] [-D DATADIR] [-s] [-o \"OPTIONS\"]\n"), progname);
- #ifdef PGXC
- printf(_(" %s start [-w] [-t SECS] [-Z NODE-TYPE] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname);
- printf(_(" %s restart [-w] [-t SECS] [-Z NODE-TYPE] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"
- " [-o \"OPTIONS\"]\n"), progname);
- #else
- printf(_(" %s start [-w] [-t SECS] [-D DATADIR] [-s] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname);
- printf(_(" %s restart [-w] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"
- " [-o \"OPTIONS\"]\n"), progname);
- #endif
- printf(_(" %s stop [-W] [-t SECS] [-D DATADIR] [-s] [-m SHUTDOWN-MODE]\n"), progname);
- printf(_(" %s reload [-D DATADIR] [-s]\n"), progname);
- printf(_(" %s status [-D DATADIR]\n"), progname);
- printf(_(" %s promote [-D DATADIR] [-s]\n"), progname);
- printf(_(" %s kill SIGNALNAME PID\n"), progname);
+ printf(_(" %s init[db] [-D DATADIR] [-s] [-o OPTIONS]\n"), progname);
- printf(_(" %s start [-D DATADIR] [-l FILENAME] [-W] [-t SECS] [-s]\n"
++ printf(_(" %s start [-D DATADIR] [-Z NODE-TYPE] [-l FILENAME] [-W] [-t SECS] [-s]\n"
+ " [-o OPTIONS] [-p PATH] [-c]\n"), progname);
+ printf(_(" %s stop [-D DATADIR] [-m SHUTDOWN-MODE] [-W] [-t SECS] [-s]\n"), progname);
- printf(_(" %s restart [-D DATADIR] [-m SHUTDOWN-MODE] [-W] [-t SECS] [-s]\n"
++ printf(_(" %s restart [-D DATADIR] [-Z NODE-TYPE] [-m SHUTDOWN-MODE] [-W] [-t SECS] [-s]\n"
+ " [-o OPTIONS] [-c]\n"), progname);
+ printf(_(" %s reload [-D DATADIR] [-s]\n"), progname);
+ printf(_(" %s status [-D DATADIR]\n"), progname);
+ printf(_(" %s promote [-D DATADIR] [-W] [-t SECS] [-s]\n"), progname);
+ printf(_(" %s kill SIGNALNAME PID\n"), progname);
#ifdef WIN32
- printf(_(" %s register [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n"
- " [-S START-TYPE] [-w] [-t SECS] [-o \"OPTIONS\"]\n"), progname);
+ printf(_(" %s register [-D DATADIR] [-N SERVICENAME] [-U USERNAME] [-P PASSWORD]\n"
+ " [-S START-TYPE] [-e SOURCE] [-W] [-t SECS] [-s] [-o OPTIONS]\n"), progname);
printf(_(" %s unregister [-N SERVICENAME]\n"), progname);
#endif
printf(_(" -s, --silent only print errors, no informational messages\n"));
printf(_(" -t, --timeout=SECS seconds to wait when using -w option\n"));
printf(_(" -V, --version output version information, then exit\n"));
- printf(_(" -w wait until operation completes\n"));
- printf(_(" -W do not wait until operation completes\n"));
- #ifdef PGXC
+ printf(_(" -Z NODE-TYPE can be \"coordinator\" or \"datanode\" (Postgres-XL)\n"));
- #endif
+ printf(_(" -w, --wait wait until operation completes (default)\n"));
+ printf(_(" -W, --no-wait do not wait until operation completes\n"));
printf(_(" -?, --help show this help, then exit\n"));
- printf(_("(The default is to wait for shutdown, but not for start or restart.)\n\n"));
printf(_("If the -D option is omitted, the environment variable PGDATA is used.\n"));
printf(_("\nOptions for start or restart:\n"));
/* process command-line options */
while (optind < argc)
{
- #ifdef PGXC
- while ((c = getopt_long(argc, argv, "cD:e:l:m:N:o:p:P:sS:t:U:wWZ:", long_options, &option_index)) != -1)
- #else
- while ((c = getopt_long(argc, argv, "cD:e:l:m:N:o:p:P:sS:t:U:wW", long_options, &option_index)) != -1)
- #endif
- while ((c = getopt_long(argc, argv, "cD:e:l:m:N:o:p:P:sS:t:U:wW",
++ while ((c = getopt_long(argc, argv, "cD:e:l:m:N:o:p:P:sS:t:U:wWZ:",
+ long_options, &option_index)) != -1)
{
switch (c)
{
* pg_dump is a utility for dumping out a postgres database
* into a script file.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* pg_dump will read the system catalogs in a database and dump out a
{"no-security-labels", no_argument, &dopt.no_security_labels, 1},
{"no-synchronized-snapshots", no_argument, &dopt.no_synchronized_snapshots, 1},
{"no-unlogged-table-data", no_argument, &dopt.no_unlogged_table_data, 1},
- #ifdef PGXC
+ {"no-subscriptions", no_argument, &dopt.no_subscriptions, 1},
+ {"no-sync", no_argument, NULL, 7},
-
+ {"include-nodes", no_argument, &include_nodes, 1},
- #endif
-
{NULL, 0, NULL, 0}
};
else
ExecuteSqlStatement(AH,
"SET TRANSACTION ISOLATION LEVEL "
- "REPEATABLE READ, READ ONLY");
+ "REPEATABLE READ"
+#ifndef XCP
+ ", READ ONLY"
+#endif
+ );
}
- else if (AH->remoteVersion >= 70400)
+ else
{
- /* note: comma was not accepted in SET TRANSACTION before 8.0 */
ExecuteSqlStatement(AH,
"SET TRANSACTION ISOLATION LEVEL "
- "SERIALIZABLE READ ONLY");
+ "SERIALIZABLE, READ ONLY");
}
- else
- ExecuteSqlStatement(AH,
- "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE");
/*
* If user specified a snapshot to use, select that. In a parallel dump
* initdb time, see pg_init_privs).
*/
nsinfo->dobj.dump_contains = nsinfo->dobj.dump = DUMP_COMPONENT_ACL;
+ }
else if (strncmp(nsinfo->dobj.name, "pg_", 3) == 0 ||
+#ifdef XCP
+ strncmp(nsinfo->dobj.name, "storm_", 6) == 0 ||
+#endif
strcmp(nsinfo->dobj.name, "information_schema") == 0)
+ {
+ /* Other system schemas don't get dumped */
nsinfo->dobj.dump_contains = nsinfo->dobj.dump = DUMP_COMPONENT_NONE;
+ }
else
nsinfo->dobj.dump_contains = nsinfo->dobj.dump = DUMP_COMPONENT_ALL;
initacl_subquery->data,
initracl_subquery->data,
username_subquery,
+ fout->isPostgresXL
+ ? "(SELECT pclocatortype from pgxc_class v where v.pcrelid = c.oid) AS pgxclocatortype,"
+ "(SELECT pcattnum from pgxc_class v where v.pcrelid = c.oid) AS pgxcattnum,"
+ "(SELECT string_agg(node_name,',') AS pgxc_node_names from pgxc_node n where n.oid in (select unnest(nodeoids) from pgxc_class v where v.pcrelid=c.oid) ) , "
+ : "",
+ RELKIND_SEQUENCE,
attacl_subquery->data,
attracl_subquery->data,
attinitacl_subquery->data,
"d.refobjid AS owning_tab, "
"d.refobjsubid AS owning_col, "
"(SELECT spcname FROM pg_tablespace t WHERE t.oid = c.reltablespace) AS reltablespace, "
+#ifdef PGXC
+ "%s"
+#endif
"c.reloptions AS reloptions, "
"tc.reloptions AS toast_reloptions, "
- "NULL AS changed_acl "
+ "NULL AS changed_acl, "
+ "NULL AS partkeydef, "
+ "false AS ispartition, "
+ "NULL AS partbound "
"FROM pg_class c "
"LEFT JOIN pg_depend d ON "
"(c.relkind = '%c' AND "
int relpages; /* table's size in pages (from pg_class) */
bool interesting; /* true if need to collect more data */
+ bool dummy_view; /* view's real definition must be postponed */
bool postponed_def; /* matview must be postponed into post-data */
+ bool ispartition; /* is table a partition? */
+#ifdef PGXC
+ /* PGXC table locator Data */
+ char pgxclocatortype; /* Type of PGXC table locator */
+ int pgxcattnum; /* Number of the attribute the table is partitioned with */
+ char *pgxc_node_names; /* List of node names where this table is distributed */
+#endif
/*
* These fields are computed only if we decide the table is interesting
* (it's either a table to dump, or a direct parent of a dumpable table).
{"quote-all-identifiers", no_argument, "e_all_identifiers, 1},
{"role", required_argument, NULL, 3},
{"use-set-session-authorization", no_argument, &use_setsessauth, 1},
+ {"no-publications", no_argument, &no_publications, 1},
{"no-security-labels", no_argument, &no_security_labels, 1},
+ {"no-subscriptions", no_argument, &no_subscriptions, 1},
+ {"no-sync", no_argument, NULL, 4},
{"no-unlogged-table-data", no_argument, &no_unlogged_table_data, 1},
-
+ {"no-role-passwords", no_argument, &no_role_passwords, 1},
+#ifdef PGXC
+ {"dump-nodes", no_argument, &dump_nodes, 1},
+ {"include-nodes", no_argument, &include_nodes, 1},
+#endif
{NULL, 0, NULL, 0}
};
#include "storage/standbydefs.h"
#include "utils/relmapper.h"
- #define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \
+#ifdef XCP
+#include "pgxc/barrier.h"
+#endif
+
+ #define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
{ name, desc, identify},
const RmgrDescData RmgrDescTable[RM_MAX_ID + 1] = {
}
/* For version match, only print psql banner on startup. */
else if (in_startup)
+#ifdef PGXC
+ printf("%s (PGXL %s, based on PG %s)\n", pset.progname, PGXC_VERSION, PG_VERSION);
+#else
printf("%s (%s)\n", pset.progname, PG_VERSION);
+#endif
if (pset.sversion / 100 > client_ver / 100)
- printf(_("WARNING: %s major version %d.%d, server major version %d.%d.\n"
+ printf(_("WARNING: %s major version %s, server major version %s.\n"
" Some psql features might not work.\n"),
- pset.progname, client_ver / 10000, (client_ver / 100) % 100,
- pset.sversion / 10000, (pset.sversion / 100) % 100);
+ pset.progname,
+ formatPGVersionNumber(client_ver, false,
+ cverbuf, sizeof(cverbuf)),
+ formatPGVersionNumber(pset.sversion, false,
+ sverbuf, sizeof(sverbuf)));
#ifdef WIN32
checkWin32Codepage();
#define THING_NO_CREATE (1 << 0) /* should not show up after CREATE */
#define THING_NO_DROP (1 << 1) /* should not show up after DROP */
- #define THING_NO_SHOW (THING_NO_CREATE | THING_NO_DROP)
+ #define THING_NO_ALTER (1 << 2) /* should not show up after ALTER */
+ #define THING_NO_SHOW (THING_NO_CREATE | THING_NO_DROP | THING_NO_ALTER)
static const pgsql_thing_t words_after_create[] = {
- {"ACCESS METHOD", NULL, NULL},
+ {"ACCESS METHOD", NULL, NULL, THING_NO_ALTER},
{"AGGREGATE", NULL, &Query_for_list_of_aggregates},
+	{"BARRIER", NULL, NULL},	/* Barrier name comes next, so skip it */
{"CAST", NULL, NULL}, /* Casts have complex structures for names, so
* skip it */
{"COLLATION", "SELECT pg_catalog.quote_ident(collname) FROM pg_catalog.pg_collation WHERE collencoding IN (-1, pg_catalog.pg_char_to_encoding(pg_catalog.getdatabaseencoding())) AND substring(pg_catalog.quote_ident(collname),1,%d)='%s'"},
{"DOMAIN", NULL, &Query_for_list_of_domains},
{"EVENT TRIGGER", NULL, NULL},
{"EXTENSION", Query_for_list_of_extensions},
- {"FOREIGN DATA WRAPPER", NULL, NULL},
- {"FOREIGN TABLE", NULL, NULL},
{"FUNCTION", NULL, &Query_for_list_of_functions},
{"GROUP", Query_for_list_of_roles},
- {"LANGUAGE", Query_for_list_of_languages},
{"INDEX", NULL, &Query_for_list_of_indexes},
+ {"LANGUAGE", Query_for_list_of_languages},
+ {"LARGE OBJECT", NULL, NULL, THING_NO_CREATE | THING_NO_DROP},
+ {"NODE", Query_for_list_of_available_nodenames},
+ {"NODE GROUP", Query_for_list_of_available_nodegroup_names},
{"MATERIALIZED VIEW", NULL, &Query_for_list_of_matviews},
{"OPERATOR", NULL, NULL}, /* Querying for this is probably not such a
* good idea. */
{"RULE", "SELECT pg_catalog.quote_ident(rulename) FROM pg_catalog.pg_rules WHERE substring(pg_catalog.quote_ident(rulename),1,%d)='%s'"},
{"SCHEMA", Query_for_list_of_schemas},
{"SEQUENCE", NULL, &Query_for_list_of_sequences},
+ {"SERVER", Query_for_list_of_servers},
+ {"STATISTICS", NULL, &Query_for_list_of_statistics},
+ {"SUBSCRIPTION", Query_for_list_of_subscriptions},
+ {"SYSTEM", NULL, NULL, THING_NO_CREATE | THING_NO_DROP},
{"TABLE", NULL, &Query_for_list_of_tables},
{"TABLESPACE", Query_for_list_of_tablespaces},
- {"TEMP", NULL, NULL, THING_NO_DROP}, /* for CREATE TEMP TABLE ... */
+ {"TEMP", NULL, NULL, THING_NO_DROP | THING_NO_ALTER}, /* for CREATE TEMP TABLE
+ * ... */
{"TEMPLATE", Query_for_list_of_ts_templates, NULL, THING_NO_SHOW},
+ {"TEMPORARY", NULL, NULL, THING_NO_DROP | THING_NO_ALTER}, /* for CREATE TEMPORARY
+ * TABLE ... */
{"TEXT SEARCH", NULL, NULL},
+ {"TRANSFORM", NULL, NULL},
+ {"TRIGGER", "SELECT pg_catalog.quote_ident(tgname) FROM pg_catalog.pg_trigger WHERE substring(pg_catalog.quote_ident(tgname),1,%d)='%s' AND NOT tgisinternal"},
{"TYPE", NULL, &Query_for_list_of_datatypes},
- {"UNIQUE", NULL, NULL, THING_NO_DROP}, /* for CREATE UNIQUE INDEX ... */
- {"UNLOGGED", NULL, NULL, THING_NO_DROP}, /* for CREATE UNLOGGED TABLE
- * ... */
+ {"UNIQUE", NULL, NULL, THING_NO_DROP | THING_NO_ALTER}, /* for CREATE UNIQUE
+ * INDEX ... */
+ {"UNLOGGED", NULL, NULL, THING_NO_DROP | THING_NO_ALTER}, /* for CREATE UNLOGGED
+ * TABLE ... */
{"USER", Query_for_list_of_roles},
- {"USER MAPPING FOR", NULL, NULL},
{"VIEW", NULL, &Query_for_list_of_views},
{NULL} /* end of list */
};
else
COMPLETE_WITH_FUNCTION_ARG(prev2_wd);
}
+
+	/* ALTER NODE <nodename> */
+	else if (Matches2("ALTER", "NODE"))
+		COMPLETE_WITH_QUERY(Query_for_list_of_available_nodenames);
+	/* ALTER NODE <nodename> - the only thing that can follow is WITH */
+	else if (Matches3("ALTER", "NODE", MatchAny))
+		COMPLETE_WITH_CONST("WITH");
+	/* ALTER NODE <nodename> WITH - open the option list */
+	else if (Matches4("ALTER", "NODE", MatchAny, "WITH"))
+		COMPLETE_WITH_CONST("(");
+	/* ALTER NODE <nodename> WITH ( - offer the node options */
+	else if (Matches5("ALTER", "NODE", MatchAny, "WITH", "("))
+		COMPLETE_WITH_LIST5("TYPE", "HOST", "PORT", "PRIMARY", "PREFERRED");
+
+ /* ALTER PUBLICATION <name> */
+ else if (Matches3("ALTER", "PUBLICATION", MatchAny))
+ {
+ COMPLETE_WITH_LIST5("ADD TABLE", "DROP TABLE", "OWNER TO", "RENAME TO", "SET");
+ }
+ /* ALTER PUBLICATION <name> SET */
+ else if (Matches4("ALTER", "PUBLICATION", MatchAny, "SET"))
+ {
+ COMPLETE_WITH_LIST2("(", "TABLE");
+ }
+ /* ALTER PUBLICATION <name> SET ( */
+ else if (HeadMatches3("ALTER", "PUBLICATION", MatchAny) && TailMatches2("SET", "("))
+ {
+ COMPLETE_WITH_CONST("publish");
+ }
+ /* ALTER SUBSCRIPTION <name> */
+ else if (Matches3("ALTER", "SUBSCRIPTION", MatchAny))
+ {
+ COMPLETE_WITH_LIST7("CONNECTION", "ENABLE", "DISABLE", "OWNER TO",
+ "RENAME TO", "REFRESH PUBLICATION", "SET");
+ }
+ /* ALTER SUBSCRIPTION <name> REFRESH PUBLICATION */
+ else if (HeadMatches3("ALTER", "SUBSCRIPTION", MatchAny) &&
+ TailMatches2("REFRESH", "PUBLICATION"))
+ {
+ COMPLETE_WITH_CONST("WITH (");
+ }
+ /* ALTER SUBSCRIPTION <name> REFRESH PUBLICATION WITH ( */
+ else if (HeadMatches3("ALTER", "SUBSCRIPTION", MatchAny) &&
+ TailMatches4("REFRESH", "PUBLICATION", "WITH", "("))
+ {
+ COMPLETE_WITH_CONST("copy_data");
+ }
+ /* ALTER SUBSCRIPTION <name> SET */
+ else if (Matches4("ALTER", "SUBSCRIPTION", MatchAny, "SET"))
+ {
+ COMPLETE_WITH_LIST2("(", "PUBLICATION");
+ }
+ /* ALTER SUBSCRIPTION <name> SET ( */
+ else if (HeadMatches3("ALTER", "SUBSCRIPTION", MatchAny) && TailMatches2("SET", "("))
+ {
+ COMPLETE_WITH_LIST2("slot_name", "synchronous_commit");
+ }
+ /* ALTER SUBSCRIPTION <name> SET PUBLICATION */
+ else if (HeadMatches3("ALTER", "SUBSCRIPTION", MatchAny) && TailMatches2("SET", "PUBLICATION"))
+ {
+ /* complete with nothing here as this refers to remote publications */
+ }
/* ALTER SCHEMA <name> */
else if (Matches3("ALTER", "SCHEMA", MatchAny))
COMPLETE_WITH_LIST2("OWNER TO", "RENAME TO");
else if (Matches3("DROP", "OWNED", "BY"))
COMPLETE_WITH_QUERY(Query_for_list_of_roles);
+ /* DROP NODE */
+ else if (Matches2("DROP", "NODE"))
+		COMPLETE_WITH_QUERY(Query_for_list_of_available_nodenames); /* Should verify that this completion is not confused with DROP NODE GROUP */
+
+ /* DROP NODE GROUP */
+ else if (Matches3("DROP", "NODE", "GROUP"))
+ COMPLETE_WITH_QUERY(Query_for_list_of_available_nodegroup_names);
+
+ /* EXECUTE DIRECT */
+ else if (Matches2("EXECUTE", "DIRECT"))
+ COMPLETE_WITH_CONST("ON");
+ else if (Matches3("EXECUTE", "DIRECT", "ON"))
+ COMPLETE_WITH_QUERY(Query_for_list_of_available_nodenames);
+
+ /* DROP TEXT SEARCH */
else if (Matches3("DROP", "TEXT", "SEARCH"))
COMPLETE_WITH_LIST4("CONFIGURATION", "DICTIONARY", "PARSER", "TEMPLATE");
Datum *index_values, bool *index_isnull);
extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
+ extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
+ extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
+ extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
+ uint32 lowmask, uint32 maxbucket);
+ extern void _hash_kill_items(IndexScanDesc scan);
/* hash.c */
- extern void hash_redo(XLogReaderState *record);
- extern void hash_desc(StringInfo buf, XLogReaderState *record);
- extern const char *hash_identify(uint8 info);
+ extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
+ Buffer bucket_buf, BlockNumber bucket_blkno,
+ BufferAccessStrategy bstrategy,
+ uint32 maxbucket, uint32 highmask, uint32 lowmask,
+ double *tuples_removed, double *num_index_tuples,
+ bool bucket_has_garbage,
+ IndexBulkDeleteCallback callback, void *callback_state);
+#ifdef PGXC
+extern Datum compute_hash(Oid type, Datum value, char locator);
+extern char *get_compute_hash_function(Oid type, char locator);
+#endif
+
#endif /* HASH_H */
* POSTGRES heap tuple definitions.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/access/htup.h
*/
/* symbol name, textual name, redo, desc, identify, startup, cleanup */
- PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL)
- PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL)
- PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL)
- PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL)
- PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL)
- PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL)
- PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL)
- PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL)
- PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL)
- PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL)
- PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL)
- PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL)
- PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL)
- PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup)
- PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup)
- PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL)
- PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup)
- PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL)
- PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL)
- PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL)
+ PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
+ PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
+ PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask)
+ PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask)
+ PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
+ PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)
+ PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask)
+ PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask)
+ PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask)
+ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL)
+ PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL)
+#ifdef PGXC
- PG_RMGR(RM_BARRIER_ID, "Barrier", barrier_redo, barrier_desc, NULL, NULL, NULL)
++PG_RMGR(RM_BARRIER_ID, "Barrier", barrier_redo, barrier_desc, barrier_identify, NULL, NULL, NULL)
+#endif
- PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL)
- PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL)
+ PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask)
+ PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL)
* postgres transaction access method support code
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/access/transam.h
*
* postgres transaction system definitions
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/access/xact.h
*
RECOVERY_TARGET_XID,
RECOVERY_TARGET_TIME,
RECOVERY_TARGET_NAME,
+#ifdef PGXC
+ RECOVERY_TARGET_BARRIER,
+#endif
+ RECOVERY_TARGET_LSN,
RECOVERY_TARGET_IMMEDIATE
} RecoveryTargetType;
* include file for the bootstrapping code
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/bootstrap/bootstrap.h
*
* prototypes for functions in backend/catalog/catalog.c
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/catalog/catalog.h
* Routines to support inter-object dependencies.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/catalog/dependency.h
*
extern void performMultipleDeletions(const ObjectAddresses *objects,
DropBehavior behavior, int flags);
-
- extern void deleteWhatDependsOn(const ObjectAddress *object,
- bool showNotices);
-
+#ifdef PGXC
+extern void performRename(const ObjectAddress *object,
+ const char *oldname,
+ const char *newname);
+#endif
extern void recordDependencyOnExpr(const ObjectAddress *depender,
Node *expr, List *rtable,
DependencyType behavior);
* prototypes for functions in backend/catalog/heap.c
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/catalog/heap.h
*
List *containing_rowtypes,
bool allow_system_table_mods);
+#ifdef PGXC
+/* Functions related to distribution data of relations */
+extern void AddRelationDistribution(Oid relid,
+ DistributeBy *distributeby,
+ PGXCSubCluster *subcluster,
+ List *parentOids,
+ TupleDesc descriptor);
+extern void GetRelationDistributionItems(Oid relid,
+ DistributeBy *distributeby,
+ TupleDesc descriptor,
+ char *locatortype,
+ int *hashalgorithm,
+ int *hashbuckets,
+ AttrNumber *attnum);
+extern Oid *GetRelationDistributionNodes(PGXCSubCluster *subcluster,
+ int *numnodes);
+extern Oid *BuildRelationDistributionNodes(List *nodes, int *numnodes);
+extern Oid *SortRelationDistributionNodes(Oid *nodeoids, int numnodes);
+#endif
+ /* pg_partitioned_table catalog manipulation functions */
+ extern void StorePartitionKey(Relation rel,
+ char strategy,
+ int16 partnatts,
+ AttrNumber *partattrs,
+ List *partexprs,
+ Oid *partopclass,
+ Oid *partcollation);
+ extern void RemovePartitionKeyByRelId(Oid relid);
+ extern void StorePartitionBound(Relation rel, Relation parent,
+ PartitionBoundSpec *bound);
#endif /* HEAP_H */
* on system catalogs
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/catalog/indexing.h
*
* prototypes for functions in backend/catalog/namespace.c
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/catalog/namespace.h
* along with the relation's initial contents.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/catalog/pg_namespace.h
* definition of the system "procedure" relation (pg_proc)
* along with the relation's initial contents.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/catalog/pg_proc.h
DATA(insert OID = 6014 ( pg_show_replication_origin_status PGNSP PGUID 12 1 100 0 0 f f f f f t v r 0 0 2249 "" "{26,25,3220,3220}" "{o,o,o,o}" "{local_id, external_id, remote_lsn, local_lsn}" _null_ _null_ pg_show_replication_origin_status _null_ _null_ _null_ ));
DESCR("get progress for all replication origins");
+#ifdef USE_MODULE_MSGIDS
+DATA(insert OID = 6015 ( pg_msgmodule_set PGNSP PGUID 12 1 1 0 0 f f f f t t i s 4 0 16 "20 20 20 25" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_set _null_ _null_ _null_ ));
+DESCR("set debugging level for module/file/msg");
+DATA(insert OID = 6016 ( pg_msgmodule_change PGNSP PGUID 12 1 1 0 0 f f f f t t i s 4 0 16 "20 20 20 20" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_change _null_ _null_ _null_ ));
+DESCR("change debugging level for module/file/msg");
+DATA(insert OID = 6017 ( pg_msgmodule_enable PGNSP PGUID 12 1 1 0 0 f f f f t t i s 1 0 16 "20" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_enable _null_ _null_ _null_ ));
+DESCR("pid to honour overriden log levels");
+DATA(insert OID = 6018 ( pg_msgmodule_disable PGNSP PGUID 12 1 1 0 0 f f f f t t i s 1 0 16 "20" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_disable _null_ _null_ _null_ ));
+DESCR("pid to ignore overriden log levels");
+DATA(insert OID = 6019 ( pg_msgmodule_enable_all PGNSP PGUID 12 1 1 0 0 f f f f t t i s 1 0 16 "16" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_enable_all _null_ _null_ _null_ ));
+DESCR("all current/future processes to honour overriden log levels");
+DATA(insert OID = 6020 ( pg_msgmodule_disable_all PGNSP PGUID 12 1 1 0 0 f f f f t t i s 0 0 16 "" _null_ _null_ _null_ _null_ _null_ pg_msgmodule_disable_all _null_ _null_ _null_ ));
+DESCR("all processes to ignore overriden log levels");
+#endif
+ /* publications */
+ DATA(insert OID = 6119 ( pg_get_publication_tables PGNSP PGUID 12 1 1000 0 0 f f t f t t s s 1 0 26 "25" "{25,26}" "{i,o}" "{pubname,relid}" _null_ _null_ pg_get_publication_tables _null_ _null_ _null_ ));
+ DESCR("get OIDs of tables in a publication");
/* rls */
DATA(insert OID = 3298 ( row_security_active PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 16 "26" _null_ _null_ _null_ _null_ _null_ row_security_active _null_ _null_ _null_ ));
* along with the relation's initial contents.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/catalog/pg_type.h
TimestampTz prepare_time; /* the time when the stmt was prepared */
} PreparedStatement;
+#ifdef PGXC
+typedef struct
+{
+ /* dynahash.c requires key to be first field */
+ char stmt_name[NAMEDATALEN];
+ int number_of_nodes; /* number of nodes where statement is active */
+ int dns_node_indices[0]; /* node ids where statement is active */
+} DatanodeStatement;
+#endif
/* Utility statements PREPARE, EXECUTE, DEALLOCATE, EXPLAIN EXECUTE */
- extern void PrepareQuery(PrepareStmt *stmt, const char *queryString);
+ extern void PrepareQuery(PrepareStmt *stmt, const char *queryString,
+ int stmt_location, int stmt_len);
extern void ExecuteQuery(ExecuteStmt *stmt, IntoClause *intoClause,
const char *queryString, ParamListInfo params,
DestReceiver *dest, char *completionTag);
#include "catalog/objectaddress.h"
#include "nodes/parsenodes.h"
- #ifdef PGXC
extern Oid CreateSchemaCommand(CreateSchemaStmt *parsetree,
- const char *queryString, bool is_top_level);
- #else
- extern Oid CreateSchemaCommand(CreateSchemaStmt *parsetree,
- const char *queryString);
- #endif
+ const char *queryString,
++ bool is_top_level,
+ int stmt_location, int stmt_len);
+
extern void RemoveSchemaById(Oid schemaOid);
extern ObjectAddress RenameSchema(const char *oldname, const char *newname);
* sequence.h
* prototypes for sequence.c.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/commands/sequence.h
#include "fmgr.h"
#include "lib/stringinfo.h"
#include "nodes/parsenodes.h"
+ #include "parser/parse_node.h"
#include "storage/relfilenode.h"
+#ifdef PGXC
+#include "utils/relcache.h"
+#include "gtm/gtm_c.h"
+#include "access/xact.h"
+#endif
- typedef struct FormData_pg_sequence
+ typedef struct FormData_pg_sequence_data
{
- NameData sequence_name;
int64 last_value;
- int64 start_value;
- int64 increment_by;
- int64 max_value;
- int64 min_value;
- int64 cache_value;
int64 log_cnt;
- bool is_cycled;
bool is_called;
- } FormData_pg_sequence;
+ } FormData_pg_sequence_data;
- typedef FormData_pg_sequence *Form_pg_sequence;
+ typedef FormData_pg_sequence_data *Form_pg_sequence_data;
/*
* Columns of a sequence relation
extern void seq_redo(XLogReaderState *rptr);
extern void seq_desc(StringInfo buf, XLogReaderState *rptr);
extern const char *seq_identify(uint8 info);
+ extern void seq_mask(char *pagedata, BlockNumber blkno);
+#ifdef XCP
+#define DEFAULT_CACHEVAL 1
+extern int SequenceRangeVal;
+#endif
+#ifdef PGXC
+/*
+ * List of actions that registered the callback.
+ * This is listed here and not in sequence.c because callback can also
+ * be registered in dependency.c and tablecmds.c as sequences can be dropped
+ * or renamed in cascade.
+ */
+typedef enum
+{
+ GTM_CREATE_SEQ,
+ GTM_DROP_SEQ
+} GTM_SequenceDropType;
+
+extern bool IsTempSequence(Oid relid);
+extern char *GetGlobalSeqName(Relation rel, const char *new_seqname, const char *new_schemaname);
+#endif
+
#endif /* SEQUENCE_H */
* header file for postgres vacuum cleaner and statistics analyzer
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/commands/vacuum.h
* variable.h
* Routines for handling specialized SET variables.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/commands/variable.h
* and related modules.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/executor/execdesc.h
EState *estate; /* executor's query-wide state */
PlanState *planstate; /* tree of per-plan-node state */
+#ifdef XCP
+ SharedQueue squeue; /* the shared memory queue to sent data to other
+ * nodes */
+ int myindex; /* -1 if locally executed subplan is producing
+ * data and distribute via squeue. Otherwise
+ * get local data from squeue */
+#endif
+ /* This field is set by ExecutorRun */
+ bool already_executed; /* true if previously executed */
/* This is always set NULL by the core system, but plugins can change it */
struct Instrumentation *totaltime; /* total time spent in ExecutorRun */
* support for the POSTGRES executor module
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/executor/executor.h
#define EXEC_FLAG_WITH_OIDS 0x0020 /* force OIDs in returned tuples */
#define EXEC_FLAG_WITHOUT_OIDS 0x0040 /* force no OIDs in returned tuples */
#define EXEC_FLAG_WITH_NO_DATA 0x0080 /* rel scannability doesn't matter */
+#ifdef XCP
+/* distributed executor may never execute the plan on this node */
+#define EXEC_FLAG_SUBPLAN 0x0100
+#endif
- /*
- * ExecEvalExpr was formerly a function containing a switch statement;
- * now it's just a macro invoking the function pointed to by an ExprState
- * node. Beware of double evaluation of the ExprState argument!
- */
- #define ExecEvalExpr(expr, econtext, isNull, isDone) \
- ((*(expr)->evalfunc) (expr, econtext, isNull, isDone))
-
-
/* Hook for plugins to get control in ExecutorStart() */
typedef void (*ExecutorStart_hook_type) (QueryDesc *queryDesc, int eflags);
extern PGDLLIMPORT ExecutorStart_hook_type ExecutorStart_hook;
* tuple table support stuff
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/executor/tuptable.h
* Over time, this has also become the preferred place for widely known
* resource-limitation stuff, such as work_mem and check_stack_depth().
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/miscadmin.h
* definitions for executor state nodes
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/nodes/execnodes.h
#include "utils/reltrigger.h"
#include "utils/sortsupport.h"
#include "utils/tuplestore.h"
+#include "pgxc/squeue.h"
#include "utils/tuplesort.h"
+ #include "nodes/tidbitmap.h"
+ #include "storage/condition_variable.h"
+
+
+ /* ----------------
+ * ExprState node
+ *
+ * ExprState is the top-level node for expression evaluation.
+ * It contains instructions (in ->steps) to evaluate the expression.
+ * ----------------
+ */
+ struct ExprState; /* forward references in this file */
+ struct ExprContext;
+ struct ExprEvalStep; /* avoid including execExpr.h everywhere */
+
+ typedef Datum (*ExprStateEvalFunc) (struct ExprState *expression,
+ struct ExprContext *econtext,
+ bool *isNull);
+
+ /* Bits in ExprState->flags (see also execExpr.h for private flag bits): */
+ /* expression is for use with ExecQual() */
+ #define EEO_FLAG_IS_QUAL (1 << 0)
+
+ typedef struct ExprState
+ {
+ Node tag;
+
+ uint8 flags; /* bitmask of EEO_FLAG_* bits, see above */
+
+ /*
+ * Storage for result value of a scalar expression, or for individual
+ * column results within expressions built by ExecBuildProjectionInfo().
+ */
+ bool resnull;
+ Datum resvalue;
+
+ /*
+ * If projecting a tuple result, this slot holds the result; else NULL.
+ */
+ TupleTableSlot *resultslot;
+
+ /*
+ * Instructions to compute expression's return value.
+ */
+ struct ExprEvalStep *steps;
+
+ /*
+ * Function that actually evaluates the expression. This can be set to
+ * different values depending on the complexity of the expression.
+ */
+ ExprStateEvalFunc evalfunc;
+
+ /* original expression tree, for debugging only */
+ Expr *expr;
+
+ /*
+ * XXX: following only needed during "compilation", could be thrown away.
+ */
+
+ int steps_len; /* number of steps currently */
+ int steps_alloc; /* allocated length of steps array */
+
+ Datum *innermost_caseval;
+ bool *innermost_casenull;
+
+ Datum *innermost_domainval;
+ bool *innermost_domainnull;
+ } ExprState;
/* ----------------
ResultRelInfo *es_result_relations; /* array of ResultRelInfos */
int es_num_result_relations; /* length of array */
ResultRelInfo *es_result_relation_info; /* currently active array elt */
+#ifdef PGXC
+#ifndef PGXC
+	/*
+	 * NOTE(review): this #ifndef PGXC can never be true inside the enclosing
+	 * #ifdef PGXC block, so es_result_remoterel is never compiled in.
+	 * Confirm whether the field was intentionally disabled during the merge
+	 * or whether the inner guard is a mistake.
+	 */
+	struct PlanState *es_result_remoterel; /* currently active remote rel */
+#endif
+#endif
+ /*
+ * Info about the target partitioned target table root(s) for
+ * update/delete queries. They required only to fire any per-statement
+ * triggers defined on the table. It exists separately from
+ * es_result_relations, because partitioned tables don't appear in the
+ * plan tree for the update/delete cases.
+ */
+ ResultRelInfo *es_root_result_relations; /* array of ResultRelInfos */
+ int es_num_root_result_relations; /* length of the array */
+
/* Stuff used for firing triggers: */
List *es_trig_target_relations; /* trigger-only ResultRelInfos */
TupleTableSlot *es_trig_tuple_slot; /* for trigger output tuples */
* Definitions for tagged nodes.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/nodes/nodes.h
*
T_FromExpr,
T_OnConflictExpr,
T_IntoClause,
+#ifdef PGXC
+ T_DistributeBy,
+ T_PGXCSubCluster,
+#endif
+ T_NextValueExpr,
/*
* TAGS FOR EXPRESSION STATE NODES (execnodes.h)
T_PlaceHolderInfo,
T_MinMaxAggInfo,
T_PlannerParamItem,
+#ifdef XCP
+ T_RemoteSubPath,
+#endif
+ T_RollupData,
+ T_GroupingSetData,
+ T_StatisticExtInfo,
+
/*
* TAGS FOR MEMORY NODES (memnodes.h)
*/
/*
* nodes/{outfuncs.c,print.c}
*/
- extern char *nodeToString(const void *obj);
-
+#ifdef XCP
+extern void set_portable_output(bool value);
+#endif
struct Bitmapset; /* not to include bitmapset.h here */
struct StringInfoData; /* not to include stringinfo.h here */
+
extern void outNode(struct StringInfoData *str, const void *obj);
extern void outToken(struct StringInfoData *str, const char *s);
extern void outBitmapset(struct StringInfoData *str,
* Support for finding the values associated with Param nodes.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/nodes/params.h
* This is a byte (not character) offset in the original source text, to be
* used for positioning an error cursor when there is an error related to
* the node. Access to the original source text is needed to make use of
- * the location.
+ * the location. At the topmost (statement) level, we also provide a
+ * statement length, likewise measured in bytes, for convenience in
+ * identifying statement boundaries in multi-statement source strings.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/nodes/parsenodes.h
*
#include "nodes/lockoptions.h"
#include "nodes/primnodes.h"
#include "nodes/value.h"
+#ifdef PGXC
+#include "access/tupdesc.h"
+#include "pgxc/locator.h"
+#endif
+ typedef enum OverridingKind
+ {
+ OVERRIDING_NOT_SET = 0,
+ OVERRIDING_USER_VALUE,
+ OVERRIDING_SYSTEM_VALUE
+ } OverridingKind;
+
/* Possible sources of a Query */
typedef enum QuerySource
{
RTE_SUBQUERY, /* subquery in FROM */
RTE_JOIN, /* join */
RTE_FUNCTION, /* function in FROM */
+ RTE_TABLEFUNC, /* TableFunc(.., column list) */
RTE_VALUES, /* VALUES (<exprlist>), (<exprlist>), ... */
- RTE_CTE /* common table expr (WITH list element) */
+#ifdef PGXC
- ,RTE_REMOTE_DUMMY /* RTEs created by remote plan reduction */
++ RTE_REMOTE_DUMMY, /* RTEs created by remote plan reduction */
+#endif /* PGXC */
+ RTE_CTE, /* common table expr (WITH list element) */
+ RTE_NAMEDTUPLESTORE /* tuplestore, e.g. for AFTER triggers */
} RTEKind;
typedef struct RangeTblEntry
RangeVar *sequence; /* the sequence to create */
List *options;
Oid ownerId; /* ID of owner, or InvalidOid for default */
+#ifdef PGXC
+ bool is_serial; /* Indicates if this sequence is part of SERIAL process */
+#endif
+ bool for_identity;
bool if_not_exists; /* just do nothing if it already exists? */
} CreateSeqStmt;
NodeTag type;
RangeVar *sequence; /* the sequence to alter */
List *options;
+ bool for_identity;
bool missing_ok; /* skip error if a role is missing? */
+#ifdef PGXC
+ bool is_serial; /* Indicates if this sequence is part of SERIAL process */
+#endif
} AlterSeqStmt;
/* ----------------------
bool missing_ok; /* for DROP - skip error if missing? */
} AlterTSConfigurationStmt;
+/* PGXC_BEGIN */
+/*
+ * EXECUTE DIRECT statement
+ */
+typedef struct ExecDirectStmt
+{
+ NodeTag type;
+ List *node_names;
+ char *query;
+} ExecDirectStmt;
+
+/*
+ * CLEAN CONNECTION statement
+ */
+typedef struct CleanConnStmt
+{
+ NodeTag type;
+ List *nodes; /* list of nodes dropped */
+ char *dbname; /* name of database to drop connections */
+ char *username; /* name of user whose connections are dropped */
+ bool is_coord; /* type of connections dropped */
+ bool is_force; /* option force */
+} CleanConnStmt;
+/* PGXC_END */
+ typedef struct CreatePublicationStmt
+ {
+ NodeTag type;
+ char *pubname; /* Name of the publication */
+ List *options; /* List of DefElem nodes */
+ List *tables; /* Optional list of tables to add */
+ bool for_all_tables; /* Special publication for all tables in db */
+ } CreatePublicationStmt;
+
+ typedef struct AlterPublicationStmt
+ {
+ NodeTag type;
+ char *pubname; /* Name of the publication */
+
+ /* parameters used for ALTER PUBLICATION ... WITH */
+ List *options; /* List of DefElem nodes */
+
+ /* parameters used for ALTER PUBLICATION ... ADD/DROP TABLE */
+ List *tables; /* List of tables to add/drop */
+ bool for_all_tables; /* Special publication for all tables in db */
+ DefElemAction tableAction; /* What action to perform with the tables */
+ } AlterPublicationStmt;
+
+ typedef struct CreateSubscriptionStmt
+ {
+ NodeTag type;
+ char *subname; /* Name of the subscription */
+ char *conninfo; /* Connection string to publisher */
+ List *publication; /* One or more publication to subscribe to */
+ List *options; /* List of DefElem nodes */
+ } CreateSubscriptionStmt;
+
+ typedef enum AlterSubscriptionType
+ {
+ ALTER_SUBSCRIPTION_OPTIONS,
+ ALTER_SUBSCRIPTION_CONNECTION,
+ ALTER_SUBSCRIPTION_PUBLICATION,
+ ALTER_SUBSCRIPTION_PUBLICATION_REFRESH,
+ ALTER_SUBSCRIPTION_REFRESH,
+ ALTER_SUBSCRIPTION_ENABLED
+ } AlterSubscriptionType;
+
+ typedef struct AlterSubscriptionStmt
+ {
+ NodeTag type;
+ AlterSubscriptionType kind; /* ALTER_SUBSCRIPTION_OPTIONS, etc */
+ char *subname; /* Name of the subscription */
+ char *conninfo; /* Connection string to publisher */
+ List *publication; /* One or more publication to subscribe to */
+ List *options; /* List of DefElem nodes */
+ } AlterSubscriptionStmt;
+
+ typedef struct DropSubscriptionStmt
+ {
+ NodeTag type;
+ char *subname; /* Name of the subscription */
+ bool missing_ok; /* Skip error if missing? */
+ DropBehavior behavior; /* RESTRICT or CASCADE behavior */
+ } DropSubscriptionStmt;
+
#endif /* PARSENODES_H */
* definitions for query plan nodes
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/nodes/plannodes.h
List *invalItems; /* other dependencies, as PlanInvalItems */
int nParamExec; /* number of PARAM_EXEC Params used */
+#ifdef XCP
+ int nParamRemote; /* number of params sent from the master node */
+
+ struct RemoteParam *remoteparams;/* parameter descriptors */
+
+ const char *pname; /* the portal name */
+
+ /* Parameters to filter out result rows */
+ char distributionType;
+ AttrNumber distributionKey;
+ List *distributionNodes;
+ List *distributionRestrict;
+#endif
+
+ Node *utilityStmt; /* non-null if this is utility stmt */
+
+ /* statement location in source string (copied from Query) */
+ int stmt_location; /* start location, or -1 if unknown */
+ int stmt_len; /* length in bytes; 0 means "rest of string" */
} PlannedStmt;
/* macro for fetching the Plan associated with a SubPlan node */
* and join trees.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/nodes/primnodes.h
*
* Definitions for planner's internal data structures.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/nodes/relation.h
* prototypes for costsize.c and clausesel.c.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/optimizer/cost.h
extern bool enable_material;
extern bool enable_mergejoin;
extern bool enable_hashjoin;
- #ifdef PGXC
+extern bool enable_fast_query_shipping;
-
- #endif
+ extern bool enable_gathermerge;
extern int constraint_exclusion;
extern double clamp_row_est(double nrows);
RelOptInfo *baserel, ParamPathInfo *param_info);
extern void cost_functionscan(Path *path, PlannerInfo *root,
RelOptInfo *baserel, ParamPathInfo *param_info);
+ extern void cost_tableexprscan(Path *path, PlannerInfo *root,
+ RelOptInfo *baserel, ParamPathInfo *param_info);
extern void cost_valuesscan(Path *path, PlannerInfo *root,
RelOptInfo *baserel, ParamPathInfo *param_info);
+#ifdef PGXC
+extern void cost_remotequery(Path *path, PlannerInfo *root, RelOptInfo *baserel);
+#endif
+ extern void cost_tablefuncscan(Path *path, PlannerInfo *root,
+ RelOptInfo *baserel, ParamPathInfo *param_info);
extern void cost_ctescan(Path *path, PlannerInfo *root,
RelOptInfo *baserel, ParamPathInfo *param_info);
+ extern void cost_namedtuplestorescan(Path *path, PlannerInfo *root,
+ RelOptInfo *baserel, ParamPathInfo *param_info);
extern void cost_recursive_union(Path *runion, Path *nrterm, Path *rterm);
extern void cost_sort(Path *path, PlannerInfo *root,
List *pathkeys, Cost input_cost, double tuples, int width,
* prototypes for pathnode.c, relnode.c.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/optimizer/pathnode.h
extern GatherPath *create_gather_path(PlannerInfo *root,
RelOptInfo *rel, Path *subpath, PathTarget *target,
Relids required_outer, double *rows);
+ extern GatherMergePath *create_gather_merge_path(PlannerInfo *root,
+ RelOptInfo *rel,
+ Path *subpath,
+ PathTarget *target,
+ List *pathkeys,
+ Relids required_outer,
+ double *rows);
extern SubqueryScanPath *create_subqueryscan_path(PlannerInfo *root,
- RelOptInfo *rel, Path *subpath,
- List *pathkeys, Relids required_outer);
+ RelOptInfo *rel, Path *subpath, List *pathkeys,
+ Relids required_outer, Distribution *distribution);
extern Path *create_functionscan_path(PlannerInfo *root, RelOptInfo *rel,
List *pathkeys, Relids required_outer);
+ extern Path *create_tablexprscan_path(PlannerInfo *root, RelOptInfo *rel,
+ List *pathkeys, Relids required_outer);
extern Path *create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel,
Relids required_outer);
+ extern Path *create_tablefuncscan_path(PlannerInfo *root, RelOptInfo *rel,
+ Relids required_outer);
extern Path *create_ctescan_path(PlannerInfo *root, RelOptInfo *rel,
Relids required_outer);
+ extern Path *create_namedtuplestorescan_path(PlannerInfo *root, RelOptInfo *rel,
+ Relids required_outer);
extern Path *create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel,
Relids required_outer);
extern ForeignPath *create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel,
* prototypes for various files in optimizer/plan
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/optimizer/planmain.h
* parse analysis for optimizable statements
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/parser/analyze.h
* by the PG_KEYWORD macro, which is not defined in this file; it can
* be defined by the caller for special purposes.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* IDENTIFICATION
* src/include/parser/kwlist.h
PG_KEYWORD("delimiters", DELIMITERS, UNRESERVED_KEYWORD)
PG_KEYWORD("depends", DEPENDS, UNRESERVED_KEYWORD)
PG_KEYWORD("desc", DESC, RESERVED_KEYWORD)
+ PG_KEYWORD("detach", DETACH, UNRESERVED_KEYWORD)
PG_KEYWORD("dictionary", DICTIONARY, UNRESERVED_KEYWORD)
+PG_KEYWORD("direct", DIRECT, UNRESERVED_KEYWORD)
PG_KEYWORD("disable", DISABLE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("discard", DISCARD, UNRESERVED_KEYWORD)
PG_KEYWORD("distinct", DISTINCT, RESERVED_KEYWORD)
PG_KEYWORD("national", NATIONAL, COL_NAME_KEYWORD)
PG_KEYWORD("natural", NATURAL, TYPE_FUNC_NAME_KEYWORD)
PG_KEYWORD("nchar", NCHAR, COL_NAME_KEYWORD)
+ PG_KEYWORD("new", NEW, UNRESERVED_KEYWORD)
PG_KEYWORD("next", NEXT, UNRESERVED_KEYWORD)
PG_KEYWORD("no", NO, UNRESERVED_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("node", NODE, UNRESERVED_KEYWORD)
+#endif
PG_KEYWORD("none", NONE, COL_NAME_KEYWORD)
PG_KEYWORD("not", NOT, RESERVED_KEYWORD)
PG_KEYWORD("nothing", NOTHING, UNRESERVED_KEYWORD)
PG_KEYWORD("procedural", PROCEDURAL, UNRESERVED_KEYWORD)
PG_KEYWORD("procedure", PROCEDURE, UNRESERVED_KEYWORD)
PG_KEYWORD("program", PROGRAM, UNRESERVED_KEYWORD)
+ PG_KEYWORD("publication", PUBLICATION, UNRESERVED_KEYWORD)
PG_KEYWORD("quote", QUOTE, UNRESERVED_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("randomly", RANDOMLY, UNRESERVED_KEYWORD)
+#endif
PG_KEYWORD("range", RANGE, UNRESERVED_KEYWORD)
PG_KEYWORD("read", READ, UNRESERVED_KEYWORD)
PG_KEYWORD("real", REAL, COL_NAME_KEYWORD)
* parse_agg.h
* handle aggregates and window functions in parser
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/parser/parse_agg.h
extern Oid LookupFuncName(List *funcname, int nargs, const Oid *argtypes,
bool noError);
- extern Oid LookupFuncNameTypeNames(List *funcname, List *argtypes,
- bool noError);
- extern Oid LookupAggNameTypeNames(List *aggname, List *argtypes,
- bool noError);
+ extern Oid LookupFuncWithArgs(ObjectWithArgs *func,
+ bool noError);
+ extern Oid LookupAggWithArgs(ObjectWithArgs *agg,
+ bool noError);
+
+ extern void check_srf_call_placement(ParseState *pstate, int location);
+extern void check_pg_get_expr_args(ParseState *pstate, Oid fnoid, List *args);
#endif /* PARSE_FUNC_H */
* parse analysis for utility commands
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/parser/parse_utilcmd.h
*
extern void transformRuleStmt(RuleStmt *stmt, const char *queryString,
List **actions, Node **whereClause);
extern List *transformCreateSchemaStmt(CreateSchemaStmt *stmt);
+#ifdef PGXC
+extern bool CheckLocalIndexColumn (char loctype, char *partcolname, char *indexcolname);
+#endif
+ extern PartitionBoundSpec *transformPartitionBound(ParseState *pstate, Relation parent,
+ PartitionBoundSpec *spec);
#endif /* PARSE_UTILCMD_H */
#define MEMSET_LOOP_LIMIT 1024
/* Define to the address where bug reports for this package should be sent. */
/* Define to the full name of this package. */
-#define PACKAGE_NAME "PostgreSQL"
+#define PACKAGE_NAME "Postgres-XL"
/* Define to the full name and version of this package. */
- #define PACKAGE_STRING "Postgres-XL 9.6alpha1"
-#define PACKAGE_STRING "PostgreSQL 10beta1"
++#define PACKAGE_STRING "Postgres-XL 10alpha1"
/* Define to the version of this package. */
- #define PACKAGE_VERSION "9.6alpha1"
+ #define PACKAGE_VERSION "10beta1"
/* Define to the name of a signed 128-bit integer type. */
#undef PG_INT128_TYPE
#define PG_INT64_TYPE long long int
/* PostgreSQL version as a string */
- #define PG_VERSION "9.6beta4"
+ #define PG_VERSION "10beta1"
/* PostgreSQL version as a number */
- #define PG_VERSION_NUM 90600
+ #define PG_VERSION_NUM 100000
/* Define to the one symbol short name of this package. */
-#define PACKAGE_TARNAME "postgresql"
+#define PACKAGE_TARNAME "postgres-xl"
+
+/* Postgres-XC version as a string */
+#define PGXC_VERSION "1.1devel"
+
+/* Postgres-XC version as a number */
+#define PGXC_VERSION_NUM 10100
/* Define to the name of the default PostgreSQL service principal in Kerberos.
(--with-krb-srvnam=NAME) */
*
* Definitions for the PostgreSQL statistics collector daemon.
*
- * Copyright (c) 2001-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Copyright (c) 2001-2017, PostgreSQL Global Development Group
*
* src/include/pgstat.h
* ----------
* Wait Classes
* ----------
*/
- typedef enum WaitClass
+ #define PG_WAIT_LWLOCK 0x01000000U
+ #define PG_WAIT_LOCK 0x03000000U
+ #define PG_WAIT_BUFFER_PIN 0x04000000U
+ #define PG_WAIT_ACTIVITY 0x05000000U
+ #define PG_WAIT_CLIENT 0x06000000U
+ #define PG_WAIT_EXTENSION 0x07000000U
+ #define PG_WAIT_IPC 0x08000000U
+ #define PG_WAIT_TIMEOUT 0x09000000U
+ #define PG_WAIT_IO 0x0A000000U
+
+ /* ----------
+ * Wait Events - Activity
+ *
+ * Use this category when a process is waiting because it has no work to do,
+ * unless the "Client" or "Timeout" category describes the situation better.
+ * Typically, this should only be used for background processes.
+ * ----------
+ */
+ typedef enum
{
- WAIT_UNDEFINED,
- WAIT_LWLOCK_NAMED,
- WAIT_LWLOCK_TRANCHE,
- WAIT_LOCK,
- WAIT_BUFFER_PIN
- } WaitClass;
+ WAIT_EVENT_ARCHIVER_MAIN = PG_WAIT_ACTIVITY,
+ WAIT_EVENT_AUTOVACUUM_MAIN,
+ WAIT_EVENT_BGWRITER_HIBERNATE,
+ WAIT_EVENT_BGWRITER_MAIN,
+ WAIT_EVENT_CHECKPOINTER_MAIN,
+ WAIT_EVENT_PGSTAT_MAIN,
+ WAIT_EVENT_RECOVERY_WAL_ALL,
+ WAIT_EVENT_RECOVERY_WAL_STREAM,
+ WAIT_EVENT_SYSLOGGER_MAIN,
+ WAIT_EVENT_WAL_RECEIVER_MAIN,
+ WAIT_EVENT_WAL_SENDER_MAIN,
+ WAIT_EVENT_WAL_WRITER_MAIN,
+ WAIT_EVENT_LOGICAL_LAUNCHER_MAIN,
- WAIT_EVENT_LOGICAL_APPLY_MAIN
++ WAIT_EVENT_LOGICAL_APPLY_MAIN,
++ WAIT_EVENT_CLUSTER_MONITOR_MAIN
+ } WaitEventActivity;
+ /* ----------
+ * Wait Events - Client
+ *
+ * Use this category when a process is waiting to send data to or receive data
+ * from the frontend process to which it is connected. This is never used for
+ * a background process, which has no client connection.
+ * ----------
+ */
+ typedef enum
+ {
+ WAIT_EVENT_CLIENT_READ = PG_WAIT_CLIENT,
+ WAIT_EVENT_CLIENT_WRITE,
+ WAIT_EVENT_SSL_OPEN_SERVER,
+ WAIT_EVENT_WAL_RECEIVER_WAIT_START,
+ WAIT_EVENT_LIBPQWALRECEIVER,
+ WAIT_EVENT_WAL_SENDER_WAIT_WAL,
+ WAIT_EVENT_WAL_SENDER_WRITE_DATA
+ } WaitEventClient;
+
+ /* ----------
+ * Wait Events - IPC
+ *
+ * Use this category when a process cannot complete the work it is doing because
+ * it is waiting for a notification from another process.
+ * ----------
+ */
+ typedef enum
+ {
+ WAIT_EVENT_BGWORKER_SHUTDOWN = PG_WAIT_IPC,
+ WAIT_EVENT_BGWORKER_STARTUP,
+ WAIT_EVENT_BTREE_PAGE,
+ WAIT_EVENT_EXECUTE_GATHER,
+ WAIT_EVENT_MQ_INTERNAL,
+ WAIT_EVENT_MQ_PUT_MESSAGE,
+ WAIT_EVENT_MQ_RECEIVE,
+ WAIT_EVENT_MQ_SEND,
+ WAIT_EVENT_PARALLEL_FINISH,
+ WAIT_EVENT_PARALLEL_BITMAP_SCAN,
+ WAIT_EVENT_PROCARRAY_GROUP_UPDATE,
+ WAIT_EVENT_SAFE_SNAPSHOT,
+ WAIT_EVENT_SYNC_REP,
+ WAIT_EVENT_LOGICAL_SYNC_DATA,
+ WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE
+ } WaitEventIPC;
+
+ /* ----------
+ * Wait Events - Timeout
+ *
+ * Use this category when a process is waiting for a timeout to expire.
+ * ----------
+ */
+ typedef enum
+ {
+ WAIT_EVENT_BASE_BACKUP_THROTTLE = PG_WAIT_TIMEOUT,
+ WAIT_EVENT_PG_SLEEP,
+ WAIT_EVENT_RECOVERY_APPLY_DELAY
+ } WaitEventTimeout;
+
+ /* ----------
+ * Wait Events - IO
+ *
+ * Use this category when a process is waiting for a IO.
+ * ----------
+ */
+ typedef enum
+ {
+ WAIT_EVENT_BUFFILE_READ = PG_WAIT_IO,
+ WAIT_EVENT_BUFFILE_WRITE,
+ WAIT_EVENT_CONTROL_FILE_READ,
+ WAIT_EVENT_CONTROL_FILE_SYNC,
+ WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE,
+ WAIT_EVENT_CONTROL_FILE_WRITE,
+ WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE,
+ WAIT_EVENT_COPY_FILE_READ,
+ WAIT_EVENT_COPY_FILE_WRITE,
+ WAIT_EVENT_DATA_FILE_EXTEND,
+ WAIT_EVENT_DATA_FILE_FLUSH,
+ WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC,
+ WAIT_EVENT_DATA_FILE_PREFETCH,
+ WAIT_EVENT_DATA_FILE_READ,
+ WAIT_EVENT_DATA_FILE_SYNC,
+ WAIT_EVENT_DATA_FILE_TRUNCATE,
+ WAIT_EVENT_DATA_FILE_WRITE,
+ WAIT_EVENT_DSM_FILL_ZERO_WRITE,
+ WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ,
+ WAIT_EVENT_LOCK_FILE_ADDTODATADIR_SYNC,
+ WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE,
+ WAIT_EVENT_LOCK_FILE_CREATE_READ,
+ WAIT_EVENT_LOCK_FILE_CREATE_SYNC,
+ WAIT_EVENT_LOCK_FILE_CREATE_WRITE,
+ WAIT_EVENT_LOCK_FILE_RECHECKDATADIR_READ,
+ WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC,
+ WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC,
+ WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE,
+ WAIT_EVENT_LOGICAL_REWRITE_SYNC,
+ WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE,
+ WAIT_EVENT_LOGICAL_REWRITE_WRITE,
+ WAIT_EVENT_RELATION_MAP_READ,
+ WAIT_EVENT_RELATION_MAP_SYNC,
+ WAIT_EVENT_RELATION_MAP_WRITE,
+ WAIT_EVENT_REORDER_BUFFER_READ,
+ WAIT_EVENT_REORDER_BUFFER_WRITE,
+ WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ,
+ WAIT_EVENT_REPLICATION_SLOT_READ,
+ WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC,
+ WAIT_EVENT_REPLICATION_SLOT_SYNC,
+ WAIT_EVENT_REPLICATION_SLOT_WRITE,
+ WAIT_EVENT_SLRU_FLUSH_SYNC,
+ WAIT_EVENT_SLRU_READ,
+ WAIT_EVENT_SLRU_SYNC,
+ WAIT_EVENT_SLRU_WRITE,
+ WAIT_EVENT_SNAPBUILD_READ,
+ WAIT_EVENT_SNAPBUILD_SYNC,
+ WAIT_EVENT_SNAPBUILD_WRITE,
+ WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC,
+ WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE,
+ WAIT_EVENT_TIMELINE_HISTORY_READ,
+ WAIT_EVENT_TIMELINE_HISTORY_SYNC,
+ WAIT_EVENT_TIMELINE_HISTORY_WRITE,
+ WAIT_EVENT_TWOPHASE_FILE_READ,
+ WAIT_EVENT_TWOPHASE_FILE_SYNC,
+ WAIT_EVENT_TWOPHASE_FILE_WRITE,
+ WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ,
+ WAIT_EVENT_WAL_BOOTSTRAP_SYNC,
+ WAIT_EVENT_WAL_BOOTSTRAP_WRITE,
+ WAIT_EVENT_WAL_COPY_READ,
+ WAIT_EVENT_WAL_COPY_SYNC,
+ WAIT_EVENT_WAL_COPY_WRITE,
+ WAIT_EVENT_WAL_INIT_SYNC,
+ WAIT_EVENT_WAL_INIT_WRITE,
+ WAIT_EVENT_WAL_READ,
+ WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN,
+ WAIT_EVENT_WAL_WRITE
+ } WaitEventIO;
/* ----------
* Command type for progress reporting purposes
/* port/pgmkdirp.c */
extern int pg_mkdir_p(char *path, int omode);
+#ifndef PGSIGFUNC
+#define PGSIGFUNC
/* port/pqsignal.c */
typedef void (*pqsigfunc) (int signo);
+#endif
+
extern pqsigfunc pqsignal(int signo, pqsigfunc func);
+ #ifndef WIN32
+ extern pqsigfunc pqsignal_no_restart(int signo, pqsigfunc func);
+ #else
+ #define pqsignal_no_restart(signo, func) pqsignal(signo, func)
+ #endif
/* port/quotes.c */
extern char *escape_single_quotes_ascii(const char *src);
* Client-side code should include postgres_fe.h instead.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1995, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/postgres.h
*
* header file for integrated autovacuum daemon
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/postmaster/autovacuum.h
*
#ifndef AUTOVACUUM_H
#define AUTOVACUUM_H
+ #include "storage/block.h"
+
+ /*
+ * Other processes can request specific work from autovacuum, identified by
+ * AutoVacuumWorkItem elements.
+ */
+ typedef enum
+ {
+ AVW_BRINSummarizeRange
+ } AutoVacuumWorkItemType;
+
+
+#ifdef PGXC /* PGXC_DATANODE */
+#define IsAutoVacuumAnalyzeWorker() (IsAutoVacuumWorkerProcess() && !(MyProc->vacuumFlags & PROC_IN_VACUUM))
+#endif
+
/* GUC variables */
extern bool autovacuum_start_daemon;
extern int autovacuum_max_workers;
* POSTGRES backend id communication definitions
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/backendid.h
* Lightweight lock manager
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/lwlock.h
LWTRANCHE_BUFFER_MAPPING,
LWTRANCHE_LOCK_MANAGER,
LWTRANCHE_PREDICATE_LOCK_MANAGER,
+ LWTRANCHE_SHARED_QUEUES,
+ LWTRANCHE_PARALLEL_QUERY_DSA,
+ LWTRANCHE_TBM,
LWTRANCHE_FIRST_USER_DEFINED
} BuiltinTrancheIds;
* per-process shared memory data structures
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/proc.h
BackendId backendId; /* This backend's backend ID (if assigned) */
Oid databaseId; /* OID of database this backend is using */
Oid roleId; /* OID of role using this backend */
+#ifdef XCP
+ Oid coordId; /* Oid of originating coordinator */
+ int coordPid; /* Pid of the originating session */
+ BackendId firstBackendId; /* Backend ID of the first backend of
+ * the distributed session */
+#endif
+ bool isBackgroundWorker; /* true if background worker. */
+
/*
* While in hot standby mode, shows that a conflict signal has been sent
* for the current transaction. Set/cleared while holding ProcArrayLock,
* Background writer, checkpointer and WAL writer run during normal operation.
* Startup process and WAL receiver also consume 2 slots, but WAL writer is
* launched only after startup has exited, so we only need 4 slots.
+ *
+ * PGXC needs another slot for the pool manager process
*/
+#ifdef PGXC
+#define NUM_AUXILIARY_PROCS 5
+#else
#define NUM_AUXILIARY_PROCS 4
+#endif
-
/* configurable options */
extern int DeadlockTimeout;
extern int StatementTimeout;
* POSTGRES process array definitions.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/storage/procarray.h
*
#include "utils/relcache.h"
#include "utils/snapshot.h"
+#ifdef XCP
+extern int GlobalSnapshotSource;
+
+typedef enum GlobalSnapshotSourceType
+{
+ GLOBAL_SNAPSHOT_SOURCE_GTM,
+ GLOBAL_SNAPSHOT_SOURCE_COORDINATOR
+} GlobalSnapshotSourceType;
+#endif
+ /*
+ * These are to implement PROCARRAY_FLAGS_XXX
+ *
+ * Note: These flags are cloned from PROC_XXX flags in src/include/storage/proc.h
+ * to avoid forcing to include proc.h when including procarray.h. So if you modify
+ * PROC_XXX flags, you need to modify these flags.
+ */
+ #define PROCARRAY_VACUUM_FLAG 0x02 /* currently running
+ * lazy vacuum */
+ #define PROCARRAY_ANALYZE_FLAG 0x04 /* currently running
+ * analyze */
+ #define PROCARRAY_LOGICAL_DECODING_FLAG 0x10 /* currently doing
+ * logical decoding
+ * outside xact */
+
+ #define PROCARRAY_SLOTS_XMIN 0x20 /* replication slot
+ * xmin, catalog_xmin */
+ /*
+ * Only flags in PROCARRAY_PROC_FLAGS_MASK are considered when matching
+ * PGXACT->vacuumFlags. Other flags are used for different purposes and
+ * have no corresponding PROC flag equivalent.
+ */
+ #define PROCARRAY_PROC_FLAGS_MASK (PROCARRAY_VACUUM_FLAG | \
+ PROCARRAY_ANALYZE_FLAG | \
+ PROCARRAY_LOGICAL_DECODING_FLAG)
+
+ /* Use the following flags as an input "flags" to GetOldestXmin function */
+ /* Consider all backends except for logical decoding ones which manage xmin separately */
+ #define PROCARRAY_FLAGS_DEFAULT PROCARRAY_LOGICAL_DECODING_FLAG
+ /* Ignore vacuum backends */
+ #define PROCARRAY_FLAGS_VACUUM PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG
+ /* Ignore analyze backends */
+ #define PROCARRAY_FLAGS_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_ANALYZE_FLAG
+ /* Ignore both vacuum and analyze backends */
+ #define PROCARRAY_FLAGS_VACUUM_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG | PROCARRAY_ANALYZE_FLAG
+
extern Size ProcArrayShmemSize(void);
extern void CreateSharedProcArray(void);
extern void ProcArrayAdd(PGPROC *proc);
extern bool TransactionIdIsInProgress(TransactionId xid);
extern bool TransactionIdIsActive(TransactionId xid);
- extern TransactionId GetOldestXmin(Relation rel, bool ignoreVacuum);
- extern TransactionId GetOldestXminInternal(Relation rel, bool ignoreVacuum,
+ extern TransactionId GetOldestXmin(Relation rel, int flags);
++extern TransactionId GetOldestXminInternal(Relation rel, int flags,
+ bool computeLocal, TransactionId lastGlobalXmin);
extern TransactionId GetOldestActiveTransactionId(void);
- extern TransactionId GetOldestSafeDecodingTransactionId(void);
+ extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids);
* Routines for interprocess signalling
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/procsignal.h
* storage manager switch public interface declarations.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/smgr.h
* calls in portal and cursor manipulations.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/tcop/dest.h
* prototypes for pquery.c.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/tcop/pquery.h
#include "nodes/plannodes.h"
#include "storage/procsignal.h"
#include "utils/guc.h"
+ #include "utils/queryenvironment.h"
+/* needed because of 'struct timeval' and 'struct rusage' */
+#include <sys/time.h>
+#include <sys/resource.h>
/* Required daylight between max_stack_depth and the kernel limit, in bytes */
#define STACK_DEPTH_SLOP (512 * 1024L)
extern int log_statement;
extern List *pg_parse_query(const char *query_string);
- extern List *pg_analyze_and_rewrite(Node *parsetree, const char *query_string,
- Oid *paramTypes, int numParams);
- extern List *pg_analyze_and_rewrite_params(Node *parsetree,
+extern List *pg_parse_query_get_source(const char *query_string, List **queries);
+ extern List *pg_analyze_and_rewrite(RawStmt *parsetree,
+ const char *query_string,
+ Oid *paramTypes, int numParams,
+ QueryEnvironment *queryEnv);
+ extern List *pg_analyze_and_rewrite_params(RawStmt *parsetree,
const char *query_string,
ParserSetupHook parserSetup,
- void *parserSetupArg);
+ void *parserSetupArg,
+ QueryEnvironment *queryEnv);
extern PlannedStmt *pg_plan_query(Query *querytree, int cursorOptions,
ParamListInfo boundParams);
extern List *pg_plan_queries(List *querytrees, int cursorOptions,
} ProcessUtilityContext;
/* Hook for plugins to get control in ProcessUtility() */
- typedef void (*ProcessUtility_hook_type) (Node *parsetree,
+ typedef void (*ProcessUtility_hook_type) (PlannedStmt *pstmt,
const char *queryString, ProcessUtilityContext context,
ParamListInfo params,
- DestReceiver *dest, char *completionTag);
+ QueryEnvironment *queryEnv,
- #ifdef PGXC
+ DestReceiver *dest,
- #endif /* PGXC */
+ bool sentToRemote,
+ char *completionTag);
extern PGDLLIMPORT ProcessUtility_hook_type ProcessUtility_hook;
- extern void ProcessUtility(Node *parsetree, const char *queryString,
+ extern void ProcessUtility(PlannedStmt *pstmt, const char *queryString,
ProcessUtilityContext context, ParamListInfo params,
- DestReceiver *dest, char *completionTag);
+ QueryEnvironment *queryEnv,
- #ifdef PGXC
+ DestReceiver *dest,
- #endif /* PGXC */
+ bool sentToRemote,
- extern void standard_ProcessUtility(Node *parsetree, const char *queryString,
+ char *completionTag);
+ extern void standard_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
ProcessUtilityContext context, ParamListInfo params,
- DestReceiver *dest, char *completionTag);
+ QueryEnvironment *queryEnv,
- #ifdef PGXC
+ DestReceiver *dest,
- #endif /* PGXC */
+ bool sentToRemote,
+ char *completionTag);
extern bool UtilityReturnsTuples(Node *parsetree);
extern LogStmtLevel GetCommandLogLevel(Node *parsetree);
- extern bool CommandIsReadOnly(Node *parsetree);
+ extern bool CommandIsReadOnly(PlannedStmt *pstmt);
+#ifdef PGXC
+extern bool pgxc_lock_for_utility_stmt(Node *parsetree);
+#endif
+
#endif /* UTILITY_H */
* Declarations for operations on built-in types.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/utils/builtins.h
#define BUILTINS_H
#include "fmgr.h"
- #include "lib/stringinfo.h"
+#include "nodes/parsenodes.h"
+#ifdef PGXC
+#include "lib/stringinfo.h"
+#endif
- #include "utils/sortsupport.h"
- /*
- * Defined in adt/
- */
-
- /* acl.c */
- extern Datum has_any_column_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_any_column_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_name_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_name_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_id_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_id_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_name_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_name_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_id_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_id_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_name_attnum(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_column_privilege_id_attnum(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_table_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_sequence_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_database_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_foreign_data_wrapper_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_function_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_language_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_schema_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_server_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_tablespace_privilege_id(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_name_name(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_name_id(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_id_name(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_id_id(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_name(PG_FUNCTION_ARGS);
- extern Datum has_type_privilege_id(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_name_name(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_name_id(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_id_name(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_id_id(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_name(PG_FUNCTION_ARGS);
- extern Datum pg_has_role_id(PG_FUNCTION_ARGS);
-
- /* amutils.c */
- extern Datum pg_indexam_has_property(PG_FUNCTION_ARGS);
- extern Datum pg_index_has_property(PG_FUNCTION_ARGS);
- extern Datum pg_index_column_has_property(PG_FUNCTION_ARGS);
+ #include "nodes/nodes.h"
+ #include "utils/fmgrprotos.h"
-
/* bool.c */
- extern Datum boolin(PG_FUNCTION_ARGS);
- extern Datum boolout(PG_FUNCTION_ARGS);
- extern Datum boolrecv(PG_FUNCTION_ARGS);
- extern Datum boolsend(PG_FUNCTION_ARGS);
- extern Datum booltext(PG_FUNCTION_ARGS);
- extern Datum booleq(PG_FUNCTION_ARGS);
- extern Datum boolne(PG_FUNCTION_ARGS);
- extern Datum boollt(PG_FUNCTION_ARGS);
- extern Datum boolgt(PG_FUNCTION_ARGS);
- extern Datum boolle(PG_FUNCTION_ARGS);
- extern Datum boolge(PG_FUNCTION_ARGS);
- extern Datum booland_statefunc(PG_FUNCTION_ARGS);
- extern Datum boolor_statefunc(PG_FUNCTION_ARGS);
- extern Datum bool_accum(PG_FUNCTION_ARGS);
- extern Datum bool_accum_inv(PG_FUNCTION_ARGS);
- extern Datum bool_alltrue(PG_FUNCTION_ARGS);
- extern Datum bool_anytrue(PG_FUNCTION_ARGS);
extern bool parse_bool(const char *value, bool *result);
extern bool parse_bool_with_len(const char *value, size_t len, bool *result);
extern int float4_cmp_internal(float4 a, float4 b);
extern int float8_cmp_internal(float8 a, float8 b);
- extern Datum float4in(PG_FUNCTION_ARGS);
- extern Datum float4out(PG_FUNCTION_ARGS);
- extern Datum float4recv(PG_FUNCTION_ARGS);
- extern Datum float4send(PG_FUNCTION_ARGS);
- extern Datum float8in(PG_FUNCTION_ARGS);
- extern Datum float8out(PG_FUNCTION_ARGS);
- extern Datum float8recv(PG_FUNCTION_ARGS);
- extern Datum float8send(PG_FUNCTION_ARGS);
- extern Datum float4abs(PG_FUNCTION_ARGS);
- extern Datum float4um(PG_FUNCTION_ARGS);
- extern Datum float4up(PG_FUNCTION_ARGS);
- extern Datum float4larger(PG_FUNCTION_ARGS);
- extern Datum float4smaller(PG_FUNCTION_ARGS);
- extern Datum float8abs(PG_FUNCTION_ARGS);
- extern Datum float8um(PG_FUNCTION_ARGS);
- extern Datum float8up(PG_FUNCTION_ARGS);
- extern Datum float8larger(PG_FUNCTION_ARGS);
- extern Datum float8smaller(PG_FUNCTION_ARGS);
- extern Datum float4pl(PG_FUNCTION_ARGS);
- extern Datum float4mi(PG_FUNCTION_ARGS);
- extern Datum float4mul(PG_FUNCTION_ARGS);
- extern Datum float4div(PG_FUNCTION_ARGS);
- extern Datum float8pl(PG_FUNCTION_ARGS);
- extern Datum float8mi(PG_FUNCTION_ARGS);
- extern Datum float8mul(PG_FUNCTION_ARGS);
- extern Datum float8div(PG_FUNCTION_ARGS);
- extern Datum float4eq(PG_FUNCTION_ARGS);
- extern Datum float4ne(PG_FUNCTION_ARGS);
- extern Datum float4lt(PG_FUNCTION_ARGS);
- extern Datum float4le(PG_FUNCTION_ARGS);
- extern Datum float4gt(PG_FUNCTION_ARGS);
- extern Datum float4ge(PG_FUNCTION_ARGS);
- extern Datum float8eq(PG_FUNCTION_ARGS);
- extern Datum float8ne(PG_FUNCTION_ARGS);
- extern Datum float8lt(PG_FUNCTION_ARGS);
- extern Datum float8le(PG_FUNCTION_ARGS);
- extern Datum float8gt(PG_FUNCTION_ARGS);
- extern Datum float8ge(PG_FUNCTION_ARGS);
- extern Datum ftod(PG_FUNCTION_ARGS);
- extern Datum i4tod(PG_FUNCTION_ARGS);
- extern Datum i2tod(PG_FUNCTION_ARGS);
- extern Datum dtof(PG_FUNCTION_ARGS);
- extern Datum dtoi4(PG_FUNCTION_ARGS);
- extern Datum dtoi2(PG_FUNCTION_ARGS);
- extern Datum i4tof(PG_FUNCTION_ARGS);
- extern Datum i2tof(PG_FUNCTION_ARGS);
- extern Datum ftoi4(PG_FUNCTION_ARGS);
- extern Datum ftoi2(PG_FUNCTION_ARGS);
- extern Datum dround(PG_FUNCTION_ARGS);
- extern Datum dceil(PG_FUNCTION_ARGS);
- extern Datum dfloor(PG_FUNCTION_ARGS);
- extern Datum dsign(PG_FUNCTION_ARGS);
- extern Datum dtrunc(PG_FUNCTION_ARGS);
- extern Datum dsqrt(PG_FUNCTION_ARGS);
- extern Datum dcbrt(PG_FUNCTION_ARGS);
- extern Datum dpow(PG_FUNCTION_ARGS);
- extern Datum dexp(PG_FUNCTION_ARGS);
- extern Datum dlog1(PG_FUNCTION_ARGS);
- extern Datum dlog10(PG_FUNCTION_ARGS);
- extern Datum dacos(PG_FUNCTION_ARGS);
- extern Datum dasin(PG_FUNCTION_ARGS);
- extern Datum datan(PG_FUNCTION_ARGS);
- extern Datum datan2(PG_FUNCTION_ARGS);
- extern Datum dcos(PG_FUNCTION_ARGS);
- extern Datum dcot(PG_FUNCTION_ARGS);
- extern Datum dsin(PG_FUNCTION_ARGS);
- extern Datum dtan(PG_FUNCTION_ARGS);
- extern Datum dacosd(PG_FUNCTION_ARGS);
- extern Datum dasind(PG_FUNCTION_ARGS);
- extern Datum datand(PG_FUNCTION_ARGS);
- extern Datum datan2d(PG_FUNCTION_ARGS);
- extern Datum dcosd(PG_FUNCTION_ARGS);
- extern Datum dcotd(PG_FUNCTION_ARGS);
- extern Datum dsind(PG_FUNCTION_ARGS);
- extern Datum dtand(PG_FUNCTION_ARGS);
- extern Datum degrees(PG_FUNCTION_ARGS);
- extern Datum dpi(PG_FUNCTION_ARGS);
- extern Datum radians(PG_FUNCTION_ARGS);
- extern Datum drandom(PG_FUNCTION_ARGS);
- extern Datum setseed(PG_FUNCTION_ARGS);
- extern Datum float8_combine(PG_FUNCTION_ARGS);
- extern Datum float8_accum(PG_FUNCTION_ARGS);
- extern Datum float4_accum(PG_FUNCTION_ARGS);
- extern Datum float8_avg(PG_FUNCTION_ARGS);
- extern Datum float8_var_pop(PG_FUNCTION_ARGS);
- extern Datum float8_var_samp(PG_FUNCTION_ARGS);
- extern Datum float8_stddev_pop(PG_FUNCTION_ARGS);
- extern Datum float8_stddev_samp(PG_FUNCTION_ARGS);
- extern Datum float8_regr_accum(PG_FUNCTION_ARGS);
- extern Datum float8_regr_combine(PG_FUNCTION_ARGS);
- extern Datum float8_regr_sxx(PG_FUNCTION_ARGS);
- extern Datum float8_regr_syy(PG_FUNCTION_ARGS);
- extern Datum float8_regr_sxy(PG_FUNCTION_ARGS);
- extern Datum float8_regr_avgx(PG_FUNCTION_ARGS);
- extern Datum float8_regr_avgy(PG_FUNCTION_ARGS);
- extern Datum float8_covar_pop(PG_FUNCTION_ARGS);
- extern Datum float8_covar_samp(PG_FUNCTION_ARGS);
- extern Datum float8_corr(PG_FUNCTION_ARGS);
- extern Datum float8_regr_r2(PG_FUNCTION_ARGS);
- extern Datum float8_regr_slope(PG_FUNCTION_ARGS);
- extern Datum float8_regr_intercept(PG_FUNCTION_ARGS);
- extern Datum float48pl(PG_FUNCTION_ARGS);
- extern Datum float48mi(PG_FUNCTION_ARGS);
- extern Datum float48mul(PG_FUNCTION_ARGS);
- extern Datum float48div(PG_FUNCTION_ARGS);
- extern Datum float84pl(PG_FUNCTION_ARGS);
- extern Datum float84mi(PG_FUNCTION_ARGS);
- extern Datum float84mul(PG_FUNCTION_ARGS);
- extern Datum float84div(PG_FUNCTION_ARGS);
- extern Datum float48eq(PG_FUNCTION_ARGS);
- extern Datum float48ne(PG_FUNCTION_ARGS);
- extern Datum float48lt(PG_FUNCTION_ARGS);
- extern Datum float48le(PG_FUNCTION_ARGS);
- extern Datum float48gt(PG_FUNCTION_ARGS);
- extern Datum float48ge(PG_FUNCTION_ARGS);
- extern Datum float84eq(PG_FUNCTION_ARGS);
- extern Datum float84ne(PG_FUNCTION_ARGS);
- extern Datum float84lt(PG_FUNCTION_ARGS);
- extern Datum float84le(PG_FUNCTION_ARGS);
- extern Datum float84gt(PG_FUNCTION_ARGS);
- extern Datum float84ge(PG_FUNCTION_ARGS);
- extern Datum width_bucket_float8(PG_FUNCTION_ARGS);
-
- /* dbsize.c */
- extern Datum pg_tablespace_size_oid(PG_FUNCTION_ARGS);
- extern Datum pg_tablespace_size_name(PG_FUNCTION_ARGS);
- extern Datum pg_database_size_oid(PG_FUNCTION_ARGS);
- extern Datum pg_database_size_name(PG_FUNCTION_ARGS);
- extern Datum pg_relation_size(PG_FUNCTION_ARGS);
- extern Datum pg_total_relation_size(PG_FUNCTION_ARGS);
- extern Datum pg_size_pretty(PG_FUNCTION_ARGS);
- extern Datum pg_size_pretty_numeric(PG_FUNCTION_ARGS);
- extern Datum pg_size_bytes(PG_FUNCTION_ARGS);
- extern Datum pg_table_size(PG_FUNCTION_ARGS);
- extern Datum pg_indexes_size(PG_FUNCTION_ARGS);
- extern Datum pg_relation_filenode(PG_FUNCTION_ARGS);
- extern Datum pg_filenode_relation(PG_FUNCTION_ARGS);
- extern Datum pg_relation_filepath(PG_FUNCTION_ARGS);
-
- /* genfile.c */
- extern Datum pg_stat_file(PG_FUNCTION_ARGS);
- extern Datum pg_stat_file_1arg(PG_FUNCTION_ARGS);
- extern Datum pg_read_file(PG_FUNCTION_ARGS);
- extern Datum pg_read_file_off_len(PG_FUNCTION_ARGS);
- extern Datum pg_read_file_all(PG_FUNCTION_ARGS);
- extern Datum pg_read_binary_file(PG_FUNCTION_ARGS);
- extern Datum pg_read_binary_file_off_len(PG_FUNCTION_ARGS);
- extern Datum pg_read_binary_file_all(PG_FUNCTION_ARGS);
- extern Datum pg_ls_dir(PG_FUNCTION_ARGS);
- extern Datum pg_ls_dir_1arg(PG_FUNCTION_ARGS);
-
- /* misc.c */
- extern Datum pg_num_nulls(PG_FUNCTION_ARGS);
- extern Datum pg_num_nonnulls(PG_FUNCTION_ARGS);
- extern Datum current_database(PG_FUNCTION_ARGS);
- extern Datum current_query(PG_FUNCTION_ARGS);
- extern Datum pg_cancel_backend(PG_FUNCTION_ARGS);
- extern Datum pg_terminate_backend(PG_FUNCTION_ARGS);
- extern Datum pg_reload_conf(PG_FUNCTION_ARGS);
- extern Datum pg_tablespace_databases(PG_FUNCTION_ARGS);
- extern Datum pg_tablespace_location(PG_FUNCTION_ARGS);
- extern Datum pg_rotate_logfile(PG_FUNCTION_ARGS);
- extern Datum pg_sleep(PG_FUNCTION_ARGS);
- extern Datum pg_get_keywords(PG_FUNCTION_ARGS);
- extern Datum pg_typeof(PG_FUNCTION_ARGS);
- extern Datum pg_collation_for(PG_FUNCTION_ARGS);
- extern Datum pg_relation_is_updatable(PG_FUNCTION_ARGS);
- extern Datum pg_column_is_updatable(PG_FUNCTION_ARGS);
- extern Datum parse_ident(PG_FUNCTION_ARGS);
-
/* oid.c */
- extern Datum oidin(PG_FUNCTION_ARGS);
- extern Datum oidout(PG_FUNCTION_ARGS);
- extern Datum oidrecv(PG_FUNCTION_ARGS);
- extern Datum oidsend(PG_FUNCTION_ARGS);
- extern Datum oideq(PG_FUNCTION_ARGS);
- extern Datum oidne(PG_FUNCTION_ARGS);
- extern Datum oidlt(PG_FUNCTION_ARGS);
- extern Datum oidle(PG_FUNCTION_ARGS);
- extern Datum oidge(PG_FUNCTION_ARGS);
- extern Datum oidgt(PG_FUNCTION_ARGS);
- extern Datum oidlarger(PG_FUNCTION_ARGS);
- extern Datum oidsmaller(PG_FUNCTION_ARGS);
- extern Datum oidvectorin(PG_FUNCTION_ARGS);
- extern Datum oidvectorout(PG_FUNCTION_ARGS);
- extern Datum oidvectorrecv(PG_FUNCTION_ARGS);
- extern Datum oidvectorsend(PG_FUNCTION_ARGS);
- extern Datum oidvectoreq(PG_FUNCTION_ARGS);
- extern Datum oidvectorne(PG_FUNCTION_ARGS);
- extern Datum oidvectorlt(PG_FUNCTION_ARGS);
- extern Datum oidvectorle(PG_FUNCTION_ARGS);
- extern Datum oidvectorge(PG_FUNCTION_ARGS);
- extern Datum oidvectorgt(PG_FUNCTION_ARGS);
extern oidvector *buildoidvector(const Oid *oids, int n);
extern Oid oidparse(Node *node);
-
- /* orderedsetaggs.c */
- extern Datum ordered_set_transition(PG_FUNCTION_ARGS);
- extern Datum ordered_set_transition_multi(PG_FUNCTION_ARGS);
- extern Datum percentile_disc_final(PG_FUNCTION_ARGS);
- extern Datum percentile_cont_float8_final(PG_FUNCTION_ARGS);
- extern Datum percentile_cont_interval_final(PG_FUNCTION_ARGS);
- extern Datum percentile_disc_multi_final(PG_FUNCTION_ARGS);
- extern Datum percentile_cont_float8_multi_final(PG_FUNCTION_ARGS);
- extern Datum percentile_cont_interval_multi_final(PG_FUNCTION_ARGS);
- extern Datum mode_final(PG_FUNCTION_ARGS);
- extern Datum hypothetical_rank_final(PG_FUNCTION_ARGS);
- extern Datum hypothetical_percent_rank_final(PG_FUNCTION_ARGS);
- extern Datum hypothetical_cume_dist_final(PG_FUNCTION_ARGS);
- extern Datum hypothetical_dense_rank_final(PG_FUNCTION_ARGS);
-
- /* pseudotypes.c */
- extern Datum cstring_in(PG_FUNCTION_ARGS);
- extern Datum cstring_out(PG_FUNCTION_ARGS);
- extern Datum cstring_recv(PG_FUNCTION_ARGS);
- extern Datum cstring_send(PG_FUNCTION_ARGS);
- extern Datum any_in(PG_FUNCTION_ARGS);
- extern Datum any_out(PG_FUNCTION_ARGS);
- extern Datum anyarray_in(PG_FUNCTION_ARGS);
- extern Datum anyarray_out(PG_FUNCTION_ARGS);
- extern Datum anyarray_recv(PG_FUNCTION_ARGS);
- extern Datum anyarray_send(PG_FUNCTION_ARGS);
- extern Datum anynonarray_in(PG_FUNCTION_ARGS);
- extern Datum anynonarray_out(PG_FUNCTION_ARGS);
- extern Datum anyenum_in(PG_FUNCTION_ARGS);
- extern Datum anyenum_out(PG_FUNCTION_ARGS);
- extern Datum anyrange_in(PG_FUNCTION_ARGS);
- extern Datum anyrange_out(PG_FUNCTION_ARGS);
- extern Datum void_in(PG_FUNCTION_ARGS);
- extern Datum void_out(PG_FUNCTION_ARGS);
- extern Datum void_recv(PG_FUNCTION_ARGS);
- extern Datum void_send(PG_FUNCTION_ARGS);
+#ifdef PGXC
+extern Datum pgxc_node_str (PG_FUNCTION_ARGS);
+extern Datum pgxc_lock_for_backup (PG_FUNCTION_ARGS);
+#endif
- extern Datum trigger_in(PG_FUNCTION_ARGS);
- extern Datum trigger_out(PG_FUNCTION_ARGS);
- extern Datum event_trigger_in(PG_FUNCTION_ARGS);
- extern Datum event_trigger_out(PG_FUNCTION_ARGS);
- extern Datum language_handler_in(PG_FUNCTION_ARGS);
- extern Datum language_handler_out(PG_FUNCTION_ARGS);
- extern Datum fdw_handler_in(PG_FUNCTION_ARGS);
- extern Datum fdw_handler_out(PG_FUNCTION_ARGS);
- extern Datum index_am_handler_in(PG_FUNCTION_ARGS);
- extern Datum index_am_handler_out(PG_FUNCTION_ARGS);
- extern Datum tsm_handler_in(PG_FUNCTION_ARGS);
- extern Datum tsm_handler_out(PG_FUNCTION_ARGS);
- extern Datum internal_in(PG_FUNCTION_ARGS);
- extern Datum internal_out(PG_FUNCTION_ARGS);
- extern Datum opaque_in(PG_FUNCTION_ARGS);
- extern Datum opaque_out(PG_FUNCTION_ARGS);
- extern Datum anyelement_in(PG_FUNCTION_ARGS);
- extern Datum anyelement_out(PG_FUNCTION_ARGS);
- extern Datum shell_in(PG_FUNCTION_ARGS);
- extern Datum shell_out(PG_FUNCTION_ARGS);
- extern Datum pg_node_tree_in(PG_FUNCTION_ARGS);
- extern Datum pg_node_tree_out(PG_FUNCTION_ARGS);
- extern Datum pg_node_tree_recv(PG_FUNCTION_ARGS);
- extern Datum pg_node_tree_send(PG_FUNCTION_ARGS);
- extern Datum pg_ddl_command_in(PG_FUNCTION_ARGS);
- extern Datum pg_ddl_command_out(PG_FUNCTION_ARGS);
- extern Datum pg_ddl_command_recv(PG_FUNCTION_ARGS);
- extern Datum pg_ddl_command_send(PG_FUNCTION_ARGS);
+ extern int oid_cmp(const void *p1, const void *p2);
/* regexp.c */
- extern Datum nameregexeq(PG_FUNCTION_ARGS);
- extern Datum nameregexne(PG_FUNCTION_ARGS);
- extern Datum textregexeq(PG_FUNCTION_ARGS);
- extern Datum textregexne(PG_FUNCTION_ARGS);
- extern Datum nameicregexeq(PG_FUNCTION_ARGS);
- extern Datum nameicregexne(PG_FUNCTION_ARGS);
- extern Datum texticregexeq(PG_FUNCTION_ARGS);
- extern Datum texticregexne(PG_FUNCTION_ARGS);
- extern Datum textregexsubstr(PG_FUNCTION_ARGS);
- extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS);
- extern Datum textregexreplace(PG_FUNCTION_ARGS);
- extern Datum similar_escape(PG_FUNCTION_ARGS);
- extern Datum regexp_matches(PG_FUNCTION_ARGS);
- extern Datum regexp_matches_no_flags(PG_FUNCTION_ARGS);
- extern Datum regexp_split_to_table(PG_FUNCTION_ARGS);
- extern Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS);
- extern Datum regexp_split_to_array(PG_FUNCTION_ARGS);
- extern Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS);
extern char *regexp_fixed_prefix(text *text_re, bool case_insensitive,
Oid collation, bool *exact);
- /* regproc.c */
- extern Datum regprocin(PG_FUNCTION_ARGS);
- extern Datum regprocout(PG_FUNCTION_ARGS);
- extern Datum to_regproc(PG_FUNCTION_ARGS);
- extern Datum to_regprocedure(PG_FUNCTION_ARGS);
- extern Datum regprocrecv(PG_FUNCTION_ARGS);
- extern Datum regprocsend(PG_FUNCTION_ARGS);
- extern Datum regprocedurein(PG_FUNCTION_ARGS);
- extern Datum regprocedureout(PG_FUNCTION_ARGS);
- extern Datum regprocedurerecv(PG_FUNCTION_ARGS);
- extern Datum regproceduresend(PG_FUNCTION_ARGS);
- extern Datum regoperin(PG_FUNCTION_ARGS);
- extern Datum regoperout(PG_FUNCTION_ARGS);
- extern Datum regoperrecv(PG_FUNCTION_ARGS);
- extern Datum regopersend(PG_FUNCTION_ARGS);
- extern Datum to_regoper(PG_FUNCTION_ARGS);
- extern Datum to_regoperator(PG_FUNCTION_ARGS);
- extern Datum regoperatorin(PG_FUNCTION_ARGS);
- extern Datum regoperatorout(PG_FUNCTION_ARGS);
- extern Datum regoperatorrecv(PG_FUNCTION_ARGS);
- extern Datum regoperatorsend(PG_FUNCTION_ARGS);
- extern Datum regclassin(PG_FUNCTION_ARGS);
- extern Datum regclassout(PG_FUNCTION_ARGS);
- extern Datum regclassrecv(PG_FUNCTION_ARGS);
- extern Datum regclasssend(PG_FUNCTION_ARGS);
- extern Datum to_regclass(PG_FUNCTION_ARGS);
- extern Datum regtypein(PG_FUNCTION_ARGS);
- extern Datum regtypeout(PG_FUNCTION_ARGS);
- extern Datum regtyperecv(PG_FUNCTION_ARGS);
- extern Datum regtypesend(PG_FUNCTION_ARGS);
- extern Datum to_regtype(PG_FUNCTION_ARGS);
- extern Datum regrolein(PG_FUNCTION_ARGS);
- extern Datum regroleout(PG_FUNCTION_ARGS);
- extern Datum regrolerecv(PG_FUNCTION_ARGS);
- extern Datum regrolesend(PG_FUNCTION_ARGS);
- extern Datum to_regrole(PG_FUNCTION_ARGS);
- extern Datum regnamespacein(PG_FUNCTION_ARGS);
- extern Datum regnamespaceout(PG_FUNCTION_ARGS);
- extern Datum regnamespacerecv(PG_FUNCTION_ARGS);
- extern Datum regnamespacesend(PG_FUNCTION_ARGS);
- extern Datum to_regnamespace(PG_FUNCTION_ARGS);
- extern Datum regconfigin(PG_FUNCTION_ARGS);
- extern Datum regconfigout(PG_FUNCTION_ARGS);
- extern Datum regconfigrecv(PG_FUNCTION_ARGS);
- extern Datum regconfigsend(PG_FUNCTION_ARGS);
- extern Datum regdictionaryin(PG_FUNCTION_ARGS);
- extern Datum regdictionaryout(PG_FUNCTION_ARGS);
- extern Datum regdictionaryrecv(PG_FUNCTION_ARGS);
- extern Datum regdictionarysend(PG_FUNCTION_ARGS);
- extern Datum text_regclass(PG_FUNCTION_ARGS);
- extern List *stringToQualifiedNameList(const char *string);
- extern char *format_procedure(Oid procedure_oid);
- extern char *format_procedure_qualified(Oid procedure_oid);
- extern void format_procedure_parts(Oid operator_oid, List **objnames,
- List **objargs);
- extern char *format_operator(Oid operator_oid);
- extern char *format_operator_qualified(Oid operator_oid);
- extern void format_operator_parts(Oid operator_oid, List **objnames,
- List **objargs);
-
- /* rowtypes.c */
- extern Datum record_in(PG_FUNCTION_ARGS);
- extern Datum record_out(PG_FUNCTION_ARGS);
- extern Datum record_recv(PG_FUNCTION_ARGS);
- extern Datum record_send(PG_FUNCTION_ARGS);
- extern Datum record_eq(PG_FUNCTION_ARGS);
- extern Datum record_ne(PG_FUNCTION_ARGS);
- extern Datum record_lt(PG_FUNCTION_ARGS);
- extern Datum record_gt(PG_FUNCTION_ARGS);
- extern Datum record_le(PG_FUNCTION_ARGS);
- extern Datum record_ge(PG_FUNCTION_ARGS);
- extern Datum btrecordcmp(PG_FUNCTION_ARGS);
- extern Datum record_image_eq(PG_FUNCTION_ARGS);
- extern Datum record_image_ne(PG_FUNCTION_ARGS);
- extern Datum record_image_lt(PG_FUNCTION_ARGS);
- extern Datum record_image_gt(PG_FUNCTION_ARGS);
- extern Datum record_image_le(PG_FUNCTION_ARGS);
- extern Datum record_image_ge(PG_FUNCTION_ARGS);
- extern Datum btrecordimagecmp(PG_FUNCTION_ARGS);
-
/* ruleutils.c */
extern bool quote_all_identifiers;
- extern Datum pg_get_ruledef(PG_FUNCTION_ARGS);
- extern Datum pg_get_ruledef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef_wrap(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef_name(PG_FUNCTION_ARGS);
- extern Datum pg_get_viewdef_name_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_indexdef(PG_FUNCTION_ARGS);
- extern Datum pg_get_indexdef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_triggerdef(PG_FUNCTION_ARGS);
- extern Datum pg_get_triggerdef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_constraintdef(PG_FUNCTION_ARGS);
- extern Datum pg_get_constraintdef_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_expr(PG_FUNCTION_ARGS);
- extern Datum pg_get_expr_ext(PG_FUNCTION_ARGS);
- extern Datum pg_get_userbyid(PG_FUNCTION_ARGS);
- extern Datum pg_get_serial_sequence(PG_FUNCTION_ARGS);
- extern Datum pg_get_functiondef(PG_FUNCTION_ARGS);
- extern Datum pg_get_function_arguments(PG_FUNCTION_ARGS);
- extern Datum pg_get_function_identity_arguments(PG_FUNCTION_ARGS);
- extern Datum pg_get_function_result(PG_FUNCTION_ARGS);
- extern Datum pg_get_function_arg_default(PG_FUNCTION_ARGS);
+#ifdef PGXC
+extern void get_query_def_from_valuesList(Query *query, StringInfo buf);
+extern void deparse_query(Query *query, StringInfo buf, List *parentnamespace,
+ bool finalise_aggs, bool sortgroup_colno);
+#endif
+#ifdef PGXC
+extern List *deparse_context_for_plan(Node *plan, List *ancestors,
+ List *rtable);
+#endif
extern const char *quote_identifier(const char *ident);
extern char *quote_qualified_identifier(const char *qualifier,
const char *ident);
extern int32 type_maximum_size(Oid type_oid, int32 typemod);
/* quote.c */
- extern Datum quote_ident(PG_FUNCTION_ARGS);
- extern Datum quote_literal(PG_FUNCTION_ARGS);
extern char *quote_literal_cstr(const char *rawstr);
- extern Datum quote_nullable(PG_FUNCTION_ARGS);
-
- /* guc.c */
- extern Datum show_config_by_name(PG_FUNCTION_ARGS);
- extern Datum show_config_by_name_missing_ok(PG_FUNCTION_ARGS);
- extern Datum set_config_by_name(PG_FUNCTION_ARGS);
- extern Datum show_all_settings(PG_FUNCTION_ARGS);
- extern Datum show_all_file_settings(PG_FUNCTION_ARGS);
-
- /* pg_config.c */
- extern Datum pg_config(PG_FUNCTION_ARGS);
-
- /* pg_controldata.c */
- extern Datum pg_control_checkpoint(PG_FUNCTION_ARGS);
- extern Datum pg_control_system(PG_FUNCTION_ARGS);
- extern Datum pg_control_init(PG_FUNCTION_ARGS);
- extern Datum pg_control_recovery(PG_FUNCTION_ARGS);
-
- /* rls.c */
- extern Datum row_security_active(PG_FUNCTION_ARGS);
- extern Datum row_security_active_name(PG_FUNCTION_ARGS);
-
- /* lockfuncs.c */
- extern Datum pg_lock_status(PG_FUNCTION_ARGS);
- extern Datum pg_blocking_pids(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_lock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_xact_lock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_lock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_xact_lock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_lock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_xact_lock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_lock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_xact_lock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_shared_int8(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_lock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_xact_lock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_lock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_xact_lock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_lock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_xact_lock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_lock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_try_advisory_xact_lock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_shared_int4(PG_FUNCTION_ARGS);
- extern Datum pg_advisory_unlock_all(PG_FUNCTION_ARGS);
-
- /* txid.c */
- extern Datum txid_snapshot_in(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_out(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_recv(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_send(PG_FUNCTION_ARGS);
- extern Datum txid_current(PG_FUNCTION_ARGS);
- extern Datum txid_current_snapshot(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_xmin(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_xmax(PG_FUNCTION_ARGS);
- extern Datum txid_snapshot_xip(PG_FUNCTION_ARGS);
- extern Datum txid_visible_in_snapshot(PG_FUNCTION_ARGS);
-
- /* uuid.c */
- extern Datum uuid_in(PG_FUNCTION_ARGS);
- extern Datum uuid_out(PG_FUNCTION_ARGS);
- extern Datum uuid_send(PG_FUNCTION_ARGS);
- extern Datum uuid_recv(PG_FUNCTION_ARGS);
- extern Datum uuid_lt(PG_FUNCTION_ARGS);
- extern Datum uuid_le(PG_FUNCTION_ARGS);
- extern Datum uuid_eq(PG_FUNCTION_ARGS);
- extern Datum uuid_ge(PG_FUNCTION_ARGS);
- extern Datum uuid_gt(PG_FUNCTION_ARGS);
- extern Datum uuid_ne(PG_FUNCTION_ARGS);
- extern Datum uuid_cmp(PG_FUNCTION_ARGS);
- extern Datum uuid_sortsupport(PG_FUNCTION_ARGS);
- extern Datum uuid_hash(PG_FUNCTION_ARGS);
-
- /* windowfuncs.c */
- extern Datum window_row_number(PG_FUNCTION_ARGS);
- extern Datum window_rank(PG_FUNCTION_ARGS);
- extern Datum window_dense_rank(PG_FUNCTION_ARGS);
- extern Datum window_percent_rank(PG_FUNCTION_ARGS);
- extern Datum window_cume_dist(PG_FUNCTION_ARGS);
- extern Datum window_ntile(PG_FUNCTION_ARGS);
- extern Datum window_lag(PG_FUNCTION_ARGS);
- extern Datum window_lag_with_offset(PG_FUNCTION_ARGS);
- extern Datum window_lag_with_offset_and_default(PG_FUNCTION_ARGS);
- extern Datum window_lead(PG_FUNCTION_ARGS);
- extern Datum window_lead_with_offset(PG_FUNCTION_ARGS);
- extern Datum window_lead_with_offset_and_default(PG_FUNCTION_ARGS);
- extern Datum window_first_value(PG_FUNCTION_ARGS);
- extern Datum window_last_value(PG_FUNCTION_ARGS);
- extern Datum window_nth_value(PG_FUNCTION_ARGS);
-
- /* access/spgist/spgquadtreeproc.c */
- extern Datum spg_quad_config(PG_FUNCTION_ARGS);
- extern Datum spg_quad_choose(PG_FUNCTION_ARGS);
- extern Datum spg_quad_picksplit(PG_FUNCTION_ARGS);
- extern Datum spg_quad_inner_consistent(PG_FUNCTION_ARGS);
- extern Datum spg_quad_leaf_consistent(PG_FUNCTION_ARGS);
-
- /* access/spgist/spgkdtreeproc.c */
- extern Datum spg_kd_config(PG_FUNCTION_ARGS);
- extern Datum spg_kd_choose(PG_FUNCTION_ARGS);
- extern Datum spg_kd_picksplit(PG_FUNCTION_ARGS);
- extern Datum spg_kd_inner_consistent(PG_FUNCTION_ARGS);
-
- /* access/spgist/spgtextproc.c */
- extern Datum spg_text_config(PG_FUNCTION_ARGS);
- extern Datum spg_text_choose(PG_FUNCTION_ARGS);
- extern Datum spg_text_picksplit(PG_FUNCTION_ARGS);
- extern Datum spg_text_inner_consistent(PG_FUNCTION_ARGS);
- extern Datum spg_text_leaf_consistent(PG_FUNCTION_ARGS);
-
- /* access/gin/ginarrayproc.c */
- extern Datum ginarrayextract(PG_FUNCTION_ARGS);
- extern Datum ginarrayextract_2args(PG_FUNCTION_ARGS);
- extern Datum ginqueryarrayextract(PG_FUNCTION_ARGS);
- extern Datum ginarrayconsistent(PG_FUNCTION_ARGS);
- extern Datum ginarraytriconsistent(PG_FUNCTION_ARGS);
-
- /* access/tablesample/bernoulli.c */
- extern Datum tsm_bernoulli_handler(PG_FUNCTION_ARGS);
-
- /* access/tablesample/system.c */
- extern Datum tsm_system_handler(PG_FUNCTION_ARGS);
-
- /* access/transam/twophase.c */
- extern Datum pg_prepared_xact(PG_FUNCTION_ARGS);
-
- /* access/transam/multixact.c */
- extern Datum pg_get_multixact_members(PG_FUNCTION_ARGS);
-
- /* access/transam/committs.c */
- extern Datum pg_xact_commit_timestamp(PG_FUNCTION_ARGS);
- extern Datum pg_last_committed_xact(PG_FUNCTION_ARGS);
-
- /* catalogs/dependency.c */
- extern Datum pg_describe_object(PG_FUNCTION_ARGS);
- extern Datum pg_identify_object(PG_FUNCTION_ARGS);
- extern Datum pg_identify_object_as_address(PG_FUNCTION_ARGS);
-
- /* catalog/objectaddress.c */
- extern Datum pg_get_object_address(PG_FUNCTION_ARGS);
-
- /* commands/constraint.c */
- extern Datum unique_key_recheck(PG_FUNCTION_ARGS);
-
- /* commands/event_trigger.c */
- extern Datum pg_event_trigger_dropped_objects(PG_FUNCTION_ARGS);
- extern Datum pg_event_trigger_table_rewrite_oid(PG_FUNCTION_ARGS);
- extern Datum pg_event_trigger_table_rewrite_reason(PG_FUNCTION_ARGS);
- extern Datum pg_event_trigger_ddl_commands(PG_FUNCTION_ARGS);
-
- /* commands/extension.c */
- extern Datum pg_available_extensions(PG_FUNCTION_ARGS);
- extern Datum pg_available_extension_versions(PG_FUNCTION_ARGS);
- extern Datum pg_extension_update_paths(PG_FUNCTION_ARGS);
- extern Datum pg_extension_config_dump(PG_FUNCTION_ARGS);
-
- /* commands/prepare.c */
- extern Datum pg_prepared_statement(PG_FUNCTION_ARGS);
-
- /* utils/mmgr/portalmem.c */
- extern Datum pg_cursor(PG_FUNCTION_ARGS);
+#ifdef PGXC
+/* backend/pgxc/pool/poolutils.c */
+extern Datum pgxc_pool_check(PG_FUNCTION_ARGS);
+extern Datum pgxc_pool_reload(PG_FUNCTION_ARGS);
+
+/* backend/access/transam/transam.c */
+extern Datum pgxc_is_committed(PG_FUNCTION_ARGS);
+extern Datum pgxc_is_inprogress(PG_FUNCTION_ARGS);
+#endif
+extern Datum pg_msgmodule_set(PG_FUNCTION_ARGS);
+extern Datum pg_msgmodule_change(PG_FUNCTION_ARGS);
+extern Datum pg_msgmodule_enable(PG_FUNCTION_ARGS);
+extern Datum pg_msgmodule_disable(PG_FUNCTION_ARGS);
+extern Datum pg_msgmodule_enable_all(PG_FUNCTION_ARGS);
+extern Datum pg_msgmodule_disable_all(PG_FUNCTION_ARGS);
#endif /* BUILTINS_H */
* External declarations pertaining to backend/utils/misc/guc.c and
* backend/utils/misc/guc-file.l
*
- * Copyright (c) 2000-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Copyright (c) 2000-2017, PostgreSQL Global Development Group
*
* src/include/utils/guc.h
extern bool log_statement_stats;
extern bool log_btree_build_stats;
+#ifdef XCP
+extern bool log_gtm_stats;
+extern bool log_remotesubplan_stats;
+#endif
+
extern PGDLLIMPORT bool check_function_bodies;
extern bool default_with_oids;
- extern bool SQL_inheritance;
extern int log_min_error_statement;
extern int log_min_messages;
* lsyscache.h
* Convenience routines for common queries in the system catalog cache.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/utils/lsyscache.h
extern bool type_is_collatable(Oid typid);
extern Oid getBaseType(Oid typid);
extern Oid getBaseTypeAndTypmod(Oid typid, int32 *typmod);
+#ifdef PGXC
+extern char *get_typename(Oid typid);
+extern char *get_pgxc_nodename(Oid nodeoid);
+extern Oid get_pgxc_nodeoid(const char *nodename);
+extern uint32 get_pgxc_node_id(Oid nodeid);
+extern char get_pgxc_nodetype(Oid nodeid);
+extern int get_pgxc_nodeport(Oid nodeid);
+extern char *get_pgxc_nodehost(Oid nodeid);
+extern bool is_pgxc_nodepreferred(Oid nodeid);
+extern bool is_pgxc_nodeprimary(Oid nodeid);
+extern Oid get_pgxc_groupoid(const char *groupname);
+extern int get_pgxc_groupmembers(Oid groupid, Oid **members);
+extern int get_pgxc_classnodes(Oid tableid, Oid **nodes);
+extern char *get_pgxc_groupname(Oid groupid);
+#endif
extern int32 get_typavgwidth(Oid typid, int32 typmod);
extern int32 get_attavgwidth(Oid relid, AttrNumber attnum);
- extern bool get_attstatsslot(HeapTuple statstuple,
- Oid atttype, int32 atttypmod,
- int reqkind, Oid reqop,
- Oid *actualop,
- Datum **values, int *nvalues,
- float4 **numbers, int *nnumbers);
- extern void free_attstatsslot(Oid atttype,
- Datum *values, int nvalues,
- float4 *numbers, int nnumbers);
+ extern bool get_attstatsslot(AttStatsSlot *sslot, HeapTuple statstuple,
+ int reqkind, Oid reqop, int flags);
+ extern void free_attstatsslot(AttStatsSlot *sslot);
extern char *get_namespace_name(Oid nspid);
+#ifdef XCP
+extern Oid get_namespaceid(const char *nspname);
+extern char *get_typ_name(Oid typid);
+extern Oid get_typ_namespace(Oid typid);
+extern Oid get_typname_typid(const char *typname, Oid typnamespace);
+extern Oid get_funcid(const char *funcname, oidvector *argtypes, Oid funcnsp);
+extern Oid get_opnamespace(Oid opno);
+extern Oid get_operid(const char *oprname, Oid oprleft, Oid oprright, Oid oprnsp);
+#endif
extern char *get_namespace_name_or_temp(Oid nspid);
extern Oid get_range_subtype(Oid rangeOid);
*
* See plancache.c for comments.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/utils/plancache.h
extern void InitPlanCache(void);
extern void ResetPlanCache(void);
- extern CachedPlanSource *CreateCachedPlan(Node *raw_parse_tree,
+ extern CachedPlanSource *CreateCachedPlan(struct RawStmt *raw_parse_tree,
const char *query_string,
+#ifdef PGXC
+ const char *stmt_name,
+#endif
const char *commandTag);
- extern CachedPlanSource *CreateOneShotCachedPlan(Node *raw_parse_tree,
+ extern CachedPlanSource *CreateOneShotCachedPlan(struct RawStmt *raw_parse_tree,
const char *query_string,
const char *commandTag);
extern void CompleteCachedPlan(CachedPlanSource *plansource,
extern CachedPlan *GetCachedPlan(CachedPlanSource *plansource,
ParamListInfo boundParams,
- bool useResOwner);
+ bool useResOwner,
+ QueryEnvironment *queryEnv);
extern void ReleaseCachedPlan(CachedPlan *plan, bool useResOwner);
+#ifdef XCP
+extern void SetRemoteSubplan(CachedPlanSource *plansource,
+ const char *plan_string);
+#endif
#endif /* PLANCACHE_H */
* to look like NO SCROLL cursors.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/utils/portal.h
const char *commandTag,
List *stmts,
CachedPlan *cplan);
- extern Node *PortalListGetPrimaryStmt(List *stmts);
+ extern PlannedStmt *PortalGetPrimaryStmt(Portal portal);
extern void PortalCreateHoldStore(Portal portal);
extern void PortalHashTableDeleteAll(void);
+#ifdef XCP
+extern void PortalCreateProducerStore(Portal portal);
+extern List *getProducingPortals(void);
+extern void addProducingPortal(Portal portal);
+extern void removeProducingPortal(Portal portal);
+extern bool portalIsProducing(Portal portal);
+#endif
extern bool ThereAreNoReadyPortals(void);
#endif /* PORTAL_H */
* POSTGRES relation descriptor (a/k/a relcache entry) definitions.
*
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/utils/rel.h
*
#include "access/xlog.h"
#include "catalog/pg_class.h"
#include "catalog/pg_index.h"
+ #include "catalog/pg_publication.h"
#include "fmgr.h"
#include "nodes/bitmapset.h"
+#include "pgxc/locator.h"
#include "rewrite/prs2lock.h"
#include "storage/block.h"
#include "storage/relfilenode.h"
* snapshot.h
* POSTGRES snapshot definition
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/utils/snapshot.h
*
* See also lsyscache.h, which provides convenience routines for
* common cache-lookup operations.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
* src/include/utils/syscache.h
*
OPEROID,
OPFAMILYAMNAMENSP,
OPFAMILYOID,
+#ifdef PGXC
+ PGXCCLASSRELID,
+ PGXCGROUPNAME,
+ PGXCGROUPOID,
+ PGXCNODENAME,
+ PGXCNODEOID,
+ PGXCNODEIDENTIFIER,
+#endif
+ PARTRELID,
PROCNAMEARGSNSP,
PROCOID,
RANGETYPE,
#define INTERVAL_PRECISION(t) ((t) & INTERVAL_PRECISION_MASK)
#define INTERVAL_RANGE(t) (((t) >> 16) & INTERVAL_RANGE_MASK)
- #ifdef HAVE_INT64_TIMESTAMP
#define TimestampTzPlusMilliseconds(tz,ms) ((tz) + ((ms) * (int64) 1000))
- #else
- #define TimestampTzPlusMilliseconds(tz,ms) ((tz) + ((ms) / 1000.0))
- #endif
+#ifdef PGXC
+#define InvalidGlobalTimestamp ((TimestampTz) 0)
+#define GlobalTimestampIsValid(timestamp) (((TimestampTz) (timestamp)) != InvalidGlobalTimestamp)
+#endif
/* Set at postmaster start */
extern TimestampTz PgStartTime;
* amounts are sorted using temporary files and a standard external sort
* algorithm.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/utils/tuplesort.h
* Also, we have changed the API to return tuples in TupleTableSlots,
* so that there is a check to prevent attempted access to system columns.
*
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/utils/tuplestore.h
#include "utils/rel.h"
#include "utils/snapmgr.h"
#include "utils/typcache.h"
+#ifdef XCP
+#include "pgxc/pgxc.h"
+#endif
+ #include "plpgsql.h"
+
typedef struct
{
Sort Key: (generate_series(1, 3)) DESC
InitPlan 1 (returns $0)
-> Limit
- -> Index Only Scan Backward using tenk1_unique2 on tenk1
- Index Cond: (unique2 IS NOT NULL)
+ -> Remote Subquery Scan on all
+ -> Limit
+ -> Index Only Scan Backward using tenk1_unique2 on tenk1
+ Index Cond: (unique2 IS NOT NULL)
- -> Result
- (9 rows)
+ -> ProjectSet
+ -> Result
-(8 rows)
+(10 rows)
select max(unique2), generate_series(1,3) as g from tenk1 order by g desc;
max | g
---
--- Cleanup resources
---
+ set client_min_messages to warning; -- suppress cascade notices
DROP FOREIGN DATA WRAPPER alt_fdw2 CASCADE;
+ERROR: foreign-data wrapper "alt_fdw2" does not exist
DROP FOREIGN DATA WRAPPER alt_fdw3 CASCADE;
+ERROR: foreign-data wrapper "alt_fdw3" does not exist
DROP LANGUAGE alt_lang2 CASCADE;
DROP LANGUAGE alt_lang3 CASCADE;
- DROP LANGUAGE alt_lang4 CASCADE;
- ERROR: language "alt_lang4" does not exist
DROP SCHEMA alt_nsp1 CASCADE;
- NOTICE: drop cascades to 26 other objects
- DETAIL: drop cascades to function alt_func3(integer)
- drop cascades to function alt_agg3(integer)
- drop cascades to function alt_func4(integer)
- drop cascades to function alt_func2(integer)
- drop cascades to function alt_agg4(integer)
- drop cascades to function alt_agg2(integer)
- drop cascades to conversion alt_conv3
- drop cascades to conversion alt_conv4
- drop cascades to conversion alt_conv2
- drop cascades to operator @+@(integer,integer)
- drop cascades to operator @-@(integer,integer)
- drop cascades to operator family alt_opf3 for access method hash
- drop cascades to operator family alt_opc1 for access method hash
- drop cascades to operator family alt_opc2 for access method hash
- drop cascades to operator family alt_opf4 for access method hash
- drop cascades to operator family alt_opf2 for access method hash
- drop cascades to text search dictionary alt_ts_dict3
- drop cascades to text search dictionary alt_ts_dict4
- drop cascades to text search dictionary alt_ts_dict2
- drop cascades to text search configuration alt_ts_conf3
- drop cascades to text search configuration alt_ts_conf4
- drop cascades to text search configuration alt_ts_conf2
- drop cascades to text search template alt_ts_temp3
- drop cascades to text search template alt_ts_temp2
- drop cascades to text search parser alt_ts_prs3
- drop cascades to text search parser alt_ts_prs2
DROP SCHEMA alt_nsp2 CASCADE;
- NOTICE: drop cascades to 9 other objects
- DETAIL: drop cascades to function alt_nsp2.alt_func2(integer)
- drop cascades to function alt_nsp2.alt_agg2(integer)
- drop cascades to conversion alt_conv2
- drop cascades to operator alt_nsp2.@-@(integer,integer)
- drop cascades to operator family alt_nsp2.alt_opf2 for access method hash
- drop cascades to text search dictionary alt_ts_dict2
- drop cascades to text search configuration alt_ts_conf2
- drop cascades to text search template alt_ts_temp2
- drop cascades to text search parser alt_ts_prs2
DROP USER regress_alter_user1;
DROP USER regress_alter_user2;
DROP USER regress_alter_user3;
ALTER TABLE tmp3 ADD CONSTRAINT IDENTITY check (b = boo(b)) NOT VALID;
NOTICE: merging constraint "identity" with inherited definition
ALTER TABLE tmp3 VALIDATE CONSTRAINT identity;
-NOTICE: boo: 16
-NOTICE: boo: 20
+ -- A NO INHERIT constraint should not be looked for in children during VALIDATE CONSTRAINT
+ create table parent_noinh_convalid (a int);
+ create table child_noinh_convalid () inherits (parent_noinh_convalid);
+ insert into parent_noinh_convalid values (1);
+ insert into child_noinh_convalid values (1);
+ alter table parent_noinh_convalid add constraint check_a_is_2 check (a = 2) no inherit not valid;
+ -- fail, because of the row in parent
+ alter table parent_noinh_convalid validate constraint check_a_is_2;
+ ERROR: check constraint "check_a_is_2" is violated by some row
+ delete from only parent_noinh_convalid;
+ -- ok (parent itself contains no violating rows)
+ alter table parent_noinh_convalid validate constraint check_a_is_2;
+ select convalidated from pg_constraint where conrelid = 'parent_noinh_convalid'::regclass and conname = 'check_a_is_2';
+ convalidated
+ --------------
+ t
+ (1 row)
+
+ -- cleanup
+ drop table parent_noinh_convalid, child_noinh_convalid;
-- Try (and fail) to create constraint from tmp5(a) to tmp4(a) - unique constraint on
-- tmp4 is a,b
ALTER TABLE tmp5 add constraint tmpconstr foreign key(a) references tmp4(a) match full;
-- As does this...
ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest2, ftest1)
references pktable(ptest1, ptest2);
-ERROR: foreign key constraint "fktable_ftest2_fkey" cannot be implemented
-DETAIL: Key columns "ftest2" and "ptest1" are of incompatible types: inet and integer.
+ERROR: Hash/Modulo distribution column does not refer to hash/modulo distribution column in referenced table.
+ DROP TABLE FKTABLE;
+ DROP TABLE PKTABLE;
+ -- Test that ALTER CONSTRAINT updates trigger deferrability properly
+ CREATE TEMP TABLE PKTABLE (ptest1 int primary key);
+ CREATE TEMP TABLE FKTABLE (ftest1 int);
+ ALTER TABLE FKTABLE ADD CONSTRAINT fknd FOREIGN KEY(ftest1) REFERENCES pktable
+ ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fkdd FOREIGN KEY(ftest1) REFERENCES pktable
+ ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fkdi FOREIGN KEY(ftest1) REFERENCES pktable
+ ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY IMMEDIATE;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fknd2 FOREIGN KEY(ftest1) REFERENCES pktable
+ ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED;
+ ALTER TABLE FKTABLE ALTER CONSTRAINT fknd2 NOT DEFERRABLE;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fkdd2 FOREIGN KEY(ftest1) REFERENCES pktable
+ ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE;
+ ALTER TABLE FKTABLE ALTER CONSTRAINT fkdd2 DEFERRABLE INITIALLY DEFERRED;
+ ALTER TABLE FKTABLE ADD CONSTRAINT fkdi2 FOREIGN KEY(ftest1) REFERENCES pktable
+ ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE;
+ ALTER TABLE FKTABLE ALTER CONSTRAINT fkdi2 DEFERRABLE INITIALLY IMMEDIATE;
+ SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred
+ FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint
+ WHERE tgrelid = 'pktable'::regclass
+ ORDER BY 1,2,3;
+ conname | tgfoid | tgtype | tgdeferrable | tginitdeferred
+ ---------+------------------------+--------+--------------+----------------
+ fkdd | "RI_FKey_cascade_del" | 9 | f | f
+ fkdd | "RI_FKey_noaction_upd" | 17 | t | t
+ fkdd2 | "RI_FKey_cascade_del" | 9 | f | f
+ fkdd2 | "RI_FKey_noaction_upd" | 17 | t | t
+ fkdi | "RI_FKey_cascade_del" | 9 | f | f
+ fkdi | "RI_FKey_noaction_upd" | 17 | t | f
+ fkdi2 | "RI_FKey_cascade_del" | 9 | f | f
+ fkdi2 | "RI_FKey_noaction_upd" | 17 | t | f
+ fknd | "RI_FKey_cascade_del" | 9 | f | f
+ fknd | "RI_FKey_noaction_upd" | 17 | f | f
+ fknd2 | "RI_FKey_cascade_del" | 9 | f | f
+ fknd2 | "RI_FKey_noaction_upd" | 17 | f | f
+ (12 rows)
+
+ SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred
+ FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint
+ WHERE tgrelid = 'fktable'::regclass
+ ORDER BY 1,2,3;
+ conname | tgfoid | tgtype | tgdeferrable | tginitdeferred
+ ---------+---------------------+--------+--------------+----------------
+ fkdd | "RI_FKey_check_ins" | 5 | t | t
+ fkdd | "RI_FKey_check_upd" | 17 | t | t
+ fkdd2 | "RI_FKey_check_ins" | 5 | t | t
+ fkdd2 | "RI_FKey_check_upd" | 17 | t | t
+ fkdi | "RI_FKey_check_ins" | 5 | t | f
+ fkdi | "RI_FKey_check_upd" | 17 | t | f
+ fkdi2 | "RI_FKey_check_ins" | 5 | t | f
+ fkdi2 | "RI_FKey_check_upd" | 17 | t | f
+ fknd | "RI_FKey_check_ins" | 5 | f | f
+ fknd | "RI_FKey_check_upd" | 17 | f | f
+ fknd2 | "RI_FKey_check_ins" | 5 | f | f
+ fknd2 | "RI_FKey_check_upd" | 17 | f | f
+ (12 rows)
+
-- temp tables should go away by themselves, need not drop them.
-- test check constraint adding
create table atacc1 ( test int );
(1 row)
+ -- nulls later in the bitmap
+ SELECT -1 != ALL(ARRAY(SELECT NULLIF(g.i, 900) FROM generate_series(1,1000) g(i)));
+ ?column?
+ ----------
+
+ (1 row)
+
-- test indexes on arrays
-create temp table arr_tbl (f1 int[] unique);
+-- PGXCTODO: related to feature request 3520520, this distribution type is changed
+-- to replication. As integer arrays are no available distribution types, this table
+-- should use roundrobin distribution if nothing is specified but roundrobin
+-- distribution cannot be safely used to check constraints on remote nodes.
+-- When global constraints are supported, this replication distribution should be removed.
+create temp table arr_tbl (f1 int[] unique) distribute by replication;
insert into arr_tbl values ('{1,2,3}');
insert into arr_tbl values ('{1,2}');
-- failure expected:
end $$ language plpgsql immutable;
alter table check_con_tbl add check (check_con_function(check_con_tbl.*));
\d+ check_con_tbl
- Table "public.check_con_tbl"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- f1 | integer | | plain | |
+ Table "public.check_con_tbl"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ f1 | integer | | | | plain | |
Check constraints:
"check_con_tbl_check" CHECK (check_con_function(check_con_tbl.*))
+Distribute By: HASH(f1)
+Location Nodes: ALL DATANODES
copy check_con_tbl from stdin;
-NOTICE: input = {"f1":1}
-NOTICE: input = {"f1":null}
copy check_con_tbl from stdin;
-NOTICE: input = {"f1":0}
ERROR: new row for relation "check_con_tbl" violates check constraint "check_con_tbl_check"
DETAIL: Failing row contains (0).
-CONTEXT: COPY check_con_tbl, line 1: "0"
select * from check_con_tbl;
f1
----
(2 rows)
+ -- test with RLS enabled.
+ CREATE ROLE regress_rls_copy_user;
+ CREATE ROLE regress_rls_copy_user_colperms;
+ CREATE TABLE rls_t1 (a int, b int, c int);
+ COPY rls_t1 (a, b, c) from stdin;
+ CREATE POLICY p1 ON rls_t1 FOR SELECT USING (a % 2 = 0);
+ ALTER TABLE rls_t1 ENABLE ROW LEVEL SECURITY;
+ ALTER TABLE rls_t1 FORCE ROW LEVEL SECURITY;
+ GRANT SELECT ON TABLE rls_t1 TO regress_rls_copy_user;
+ GRANT SELECT (a, b) ON TABLE rls_t1 TO regress_rls_copy_user_colperms;
+ -- all columns
+ COPY rls_t1 TO stdout;
+ 1 4 1
+ 2 3 2
+ 3 2 3
+ 4 1 4
+ COPY rls_t1 (a, b, c) TO stdout;
+ 1 4 1
+ 2 3 2
+ 3 2 3
+ 4 1 4
+ -- subset of columns
+ COPY rls_t1 (a) TO stdout;
+ 1
+ 2
+ 3
+ 4
+ COPY rls_t1 (a, b) TO stdout;
+ 1 4
+ 2 3
+ 3 2
+ 4 1
+ -- column reordering
+ COPY rls_t1 (b, a) TO stdout;
+ 4 1
+ 3 2
+ 2 3
+ 1 4
+ SET SESSION AUTHORIZATION regress_rls_copy_user;
+ -- all columns
+ COPY rls_t1 TO stdout;
+ 2 3 2
+ 4 1 4
+ COPY rls_t1 (a, b, c) TO stdout;
+ 2 3 2
+ 4 1 4
+ -- subset of columns
+ COPY rls_t1 (a) TO stdout;
+ 2
+ 4
+ COPY rls_t1 (a, b) TO stdout;
+ 2 3
+ 4 1
+ -- column reordering
+ COPY rls_t1 (b, a) TO stdout;
+ 3 2
+ 1 4
+ RESET SESSION AUTHORIZATION;
+ SET SESSION AUTHORIZATION regress_rls_copy_user_colperms;
+ -- attempt all columns (should fail)
+ COPY rls_t1 TO stdout;
+ ERROR: permission denied for relation rls_t1
+ COPY rls_t1 (a, b, c) TO stdout;
+ ERROR: permission denied for relation rls_t1
+ -- try to copy column with no privileges (should fail)
+ COPY rls_t1 (c) TO stdout;
+ ERROR: permission denied for relation rls_t1
+ -- subset of columns (should succeed)
+ COPY rls_t1 (a) TO stdout;
+ 2
+ 4
+ COPY rls_t1 (a, b) TO stdout;
+ 2 3
+ 4 1
+ RESET SESSION AUTHORIZATION;
+ -- test with INSTEAD OF INSERT trigger on a view
+ CREATE TABLE instead_of_insert_tbl(id serial, name text);
+ CREATE VIEW instead_of_insert_tbl_view AS SELECT ''::text AS str;
+ COPY instead_of_insert_tbl_view FROM stdin; -- fail
+ ERROR: cannot copy to view "instead_of_insert_tbl_view"
+ HINT: To enable copying to a view, provide an INSTEAD OF INSERT trigger.
+ CREATE FUNCTION fun_instead_of_insert_tbl() RETURNS trigger AS $$
+ BEGIN
+ INSERT INTO instead_of_insert_tbl (name) VALUES (NEW.str);
+ RETURN NULL;
+ END;
+ $$ LANGUAGE plpgsql;
+ CREATE TRIGGER trig_instead_of_insert_tbl_view
+ INSTEAD OF INSERT ON instead_of_insert_tbl_view
+ FOR EACH ROW EXECUTE PROCEDURE fun_instead_of_insert_tbl();
+ COPY instead_of_insert_tbl_view FROM stdin;
+ SELECT * FROM instead_of_insert_tbl;
+ id | name
+ ----+-------
+ 1 | test1
+ (1 row)
+
+ -- clean up
DROP TABLE forcetest;
DROP TABLE vistest;
+ERROR: table "vistest" does not exist
DROP FUNCTION truncate_in_subxact();
+ERROR: function truncate_in_subxact() does not exist
DROP TABLE x, y;
+ DROP TABLE rls_t1 CASCADE;
+ DROP ROLE regress_rls_copy_user;
+ DROP ROLE regress_rls_copy_user_colperms;
DROP FUNCTION fn_x_before();
DROP FUNCTION fn_x_after();
+ DROP TABLE instead_of_insert_tbl;
+ DROP VIEW instead_of_insert_tbl_view;
+ DROP FUNCTION fun_instead_of_insert_tbl();
-- maintenance_work_mem setting and fillfactor:
SET maintenance_work_mem = '1MB';
CREATE INDEX hash_tuplesort_idx ON tenk1 USING hash (stringu1 name_ops) WITH (fillfactor = 10);
- WARNING: hash indexes are not WAL-logged and their use is discouraged
EXPLAIN (COSTS OFF)
SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA';
- QUERY PLAN
--------------------------------------------------------
- Aggregate
- -> Bitmap Heap Scan on tenk1
- Recheck Cond: (stringu1 = 'TVAAAA'::name)
- -> Bitmap Index Scan on hash_tuplesort_idx
- Index Cond: (stringu1 = 'TVAAAA'::name)
-(5 rows)
+ QUERY PLAN
+-------------------------------------------------------------------
+ Finalize Aggregate
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Partial Aggregate
+ -> Bitmap Heap Scan on tenk1
+ Recheck Cond: (stringu1 = 'TVAAAA'::name)
+ -> Bitmap Index Scan on hash_tuplesort_idx
+ Index Cond: (stringu1 = 'TVAAAA'::name)
+(7 rows)
SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA';
count
DELETE FROM concur_heap WHERE f1 = 'b';
VACUUM FULL concur_heap;
\d concur_heap
- Table "public.concur_heap"
- Column | Type | Modifiers
- --------+------+-----------
- f1 | text |
- f2 | text |
+ Table "public.concur_heap"
+ Column | Type | Collation | Nullable | Default
+ --------+------+-----------+----------+---------
+ f1 | text | | |
+ f2 | text | | |
Indexes:
- "concur_index2" UNIQUE, btree (f1)
- "concur_index3" UNIQUE, btree (f2) INVALID
- "concur_heap_expr_idx" btree ((f2 || f1))
- "concur_index1" btree (f2, f1)
- "concur_index4" btree (f2) WHERE f1 = 'a'::text
- "concur_index5" btree (f2) WHERE f1 = 'x'::text
"std_index" btree (f2)
REINDEX TABLE concur_heap;
\d concur_heap
- Table "public.concur_heap"
- Column | Type | Modifiers
- --------+------+-----------
- f1 | text |
- f2 | text |
+ Table "public.concur_heap"
+ Column | Type | Collation | Nullable | Default
+ --------+------+-----------+----------+---------
+ f1 | text | | |
+ f2 | text | | |
Indexes:
- "concur_index2" UNIQUE, btree (f1)
- "concur_index3" UNIQUE, btree (f2)
- "concur_heap_expr_idx" btree ((f2 || f1))
- "concur_index1" btree (f2, f1)
- "concur_index4" btree (f2) WHERE f1 = 'a'::text
- "concur_index5" btree (f2) WHERE f1 = 'x'::text
"std_index" btree (f2)
--
ROLLBACK;
-- successes
DROP INDEX CONCURRENTLY IF EXISTS "concur_index3";
+NOTICE: index "concur_index3" does not exist, skipping
+ERROR: DROP INDEX CONCURRENTLY cannot run inside a transaction block
DROP INDEX CONCURRENTLY "concur_index4";
+ERROR: index "concur_index4" does not exist
DROP INDEX CONCURRENTLY "concur_index5";
+ERROR: index "concur_index5" does not exist
DROP INDEX CONCURRENTLY "concur_index1";
+ERROR: index "concur_index1" does not exist
DROP INDEX CONCURRENTLY "concur_heap_expr_idx";
+ERROR: index "concur_heap_expr_idx" does not exist
\d concur_heap
- Table "public.concur_heap"
- Column | Type | Modifiers
- --------+------+-----------
- f1 | text |
- f2 | text |
+ Table "public.concur_heap"
+ Column | Type | Collation | Nullable | Default
+ --------+------+-----------+----------+---------
+ f1 | text | | |
+ f2 | text | | |
Indexes:
"std_index" btree (f2)
--
explain (costs off)
select * from tenk1 where (thousand, tenthous) in ((1,1001), (null,null));
- QUERY PLAN
-------------------------------------------------------
- Index Scan using tenk1_thous_tenthous on tenk1
- Index Cond: ((thousand = 1) AND (tenthous = 1001))
-(2 rows)
+ QUERY PLAN
+------------------------------------------------------------
+ Remote Fast Query Execution
+ Node/s: datanode_1, datanode_2
+ -> Index Scan using tenk1_thous_tenthous on tenk1
+ Index Cond: ((thousand = 1) AND (tenthous = 1001))
+(4 rows)
+ --
+ -- Check matching of boolean index columns to WHERE conditions and sort keys
+ --
+ create temp table boolindex (b bool, i int, unique(b, i), junk float);
+ explain (costs off)
+ select * from boolindex order by b, i limit 10;
+ QUERY PLAN
+ -------------------------------------------------------
+ Limit
+ -> Index Scan using boolindex_b_i_key on boolindex
+ (2 rows)
+
+ explain (costs off)
+ select * from boolindex where b order by i limit 10;
+ QUERY PLAN
+ -------------------------------------------------------
+ Limit
+ -> Index Scan using boolindex_b_i_key on boolindex
+ Index Cond: (b = true)
+ Filter: b
+ (4 rows)
+
+ explain (costs off)
+ select * from boolindex where b = true order by i desc limit 10;
+ QUERY PLAN
+ ----------------------------------------------------------------
+ Limit
+ -> Index Scan Backward using boolindex_b_i_key on boolindex
+ Index Cond: (b = true)
+ Filter: b
+ (4 rows)
+
+ explain (costs off)
+ select * from boolindex where not b order by i limit 10;
+ QUERY PLAN
+ -------------------------------------------------------
+ Limit
+ -> Index Scan using boolindex_b_i_key on boolindex
+ Index Cond: (b = false)
+ Filter: (NOT b)
+ (4 rows)
+
--
-- REINDEX (VERBOSE)
--
(2 rows)
DROP TABLE inhg;
-CREATE TABLE inhg (x text, LIKE inhx INCLUDING INDEXES, y text); /* copies indexes */
+ CREATE TABLE test_like_id_1 (a int GENERATED ALWAYS AS IDENTITY, b text);
+ \d test_like_id_1
+ Table "public.test_like_id_1"
+ Column | Type | Collation | Nullable | Default
+ --------+---------+-----------+----------+------------------------------
+ a | integer | | not null | generated always as identity
+ b | text | | |
+
+ INSERT INTO test_like_id_1 (b) VALUES ('b1');
+ SELECT * FROM test_like_id_1;
+ a | b
+ ---+----
+ 1 | b1
+ (1 row)
+
+ CREATE TABLE test_like_id_2 (LIKE test_like_id_1);
+ \d test_like_id_2
+ Table "public.test_like_id_2"
+ Column | Type | Collation | Nullable | Default
+ --------+---------+-----------+----------+---------
+ a | integer | | not null |
+ b | text | | |
+
+ INSERT INTO test_like_id_2 (b) VALUES ('b2');
+ ERROR: null value in column "a" violates not-null constraint
+ DETAIL: Failing row contains (null, b2).
+ SELECT * FROM test_like_id_2; -- identity was not copied
+ a | b
+ ---+---
+ (0 rows)
+
+ CREATE TABLE test_like_id_3 (LIKE test_like_id_1 INCLUDING IDENTITY);
+ \d test_like_id_3
+ Table "public.test_like_id_3"
+ Column | Type | Collation | Nullable | Default
+ --------+---------+-----------+----------+------------------------------
+ a | integer | | not null | generated always as identity
+ b | text | | |
+
+ INSERT INTO test_like_id_3 (b) VALUES ('b3');
+ SELECT * FROM test_like_id_3; -- identity was copied and applied
+ a | b
+ ---+----
+ 1 | b3
+ (1 row)
+
+ DROP TABLE test_like_id_1, test_like_id_2, test_like_id_3;
+CREATE TABLE inhg (x text, LIKE inhx INCLUDING INDEXES, y text) DISTRIBUTE BY REPLICATION; /* copies indexes */
INSERT INTO inhg VALUES (5, 10);
INSERT INTO inhg VALUES (20, 10); -- should fail
ERROR: duplicate key value violates unique constraint "inhg_pkey"
union all
select ff + 4 as x from ec1) as ss2
where ss1.x = ec1.f1 and ss1.x = ss2.x and ec1.ff = 42::int8;
- QUERY PLAN
----------------------------------------------------------------------
+ QUERY PLAN
+-----------------------------------------------------------------------------
Nested Loop
- -> Nested Loop
- -> Index Scan using ec1_pkey on ec1
- Index Cond: (ff = '42'::bigint)
+ Join Filter: (x = x)
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
-> Append
- -> Index Scan using ec1_expr2 on ec1 ec1_1
- Index Cond: (((ff + 2) + 1) = ec1.f1)
- -> Index Scan using ec1_expr3 on ec1 ec1_2
- Index Cond: (((ff + 3) + 1) = ec1.f1)
- -> Index Scan using ec1_expr4 on ec1 ec1_3
- Index Cond: ((ff + 4) = ec1.f1)
- -> Append
- -> Index Scan using ec1_expr2 on ec1 ec1_4
- Index Cond: (((ff + 2) + 1) = (((ec1_1.ff + 2) + 1)))
- -> Index Scan using ec1_expr3 on ec1 ec1_5
- Index Cond: (((ff + 3) + 1) = (((ec1_1.ff + 2) + 1)))
- -> Index Scan using ec1_expr4 on ec1 ec1_6
- Index Cond: ((ff + 4) = (((ec1_1.ff + 2) + 1)))
-(18 rows)
-
--- let's try that as a mergejoin
-set enable_mergejoin = on;
-set enable_nestloop = off;
-explain (costs off)
- select * from ec1,
- (select ff + 1 as x from
- (select ff + 2 as ff from ec1
- union all
- select ff + 3 as ff from ec1) ss0
- union all
- select ff + 4 as x from ec1) as ss1,
- (select ff + 1 as x from
- (select ff + 2 as ff from ec1
- union all
- select ff + 3 as ff from ec1) ss0
- union all
- select ff + 4 as x from ec1) as ss2
- where ss1.x = ec1.f1 and ss1.x = ss2.x and ec1.ff = 42::int8;
- QUERY PLAN
------------------------------------------------------------------
- Merge Join
- Merge Cond: ((((ec1_4.ff + 2) + 1)) = (((ec1_1.ff + 2) + 1)))
- -> Merge Append
- Sort Key: (((ec1_4.ff + 2) + 1))
- -> Index Scan using ec1_expr2 on ec1 ec1_4
- -> Index Scan using ec1_expr3 on ec1 ec1_5
- -> Index Scan using ec1_expr4 on ec1 ec1_6
+ -> Seq Scan on ec1 ec1_4
+ -> Seq Scan on ec1 ec1_5
+ -> Seq Scan on ec1 ec1_6
-> Materialize
- -> Merge Join
- Merge Cond: ((((ec1_1.ff + 2) + 1)) = ec1.f1)
- -> Merge Append
- Sort Key: (((ec1_1.ff + 2) + 1))
- -> Index Scan using ec1_expr2 on ec1 ec1_1
- -> Index Scan using ec1_expr3 on ec1 ec1_2
- -> Index Scan using ec1_expr4 on ec1 ec1_3
- -> Sort
- Sort Key: ec1.f1 USING <
+ -> Nested Loop
+ Join Filter: (x = ec1.f1)
+ -> Remote Subquery Scan on all (datanode_1)
-> Index Scan using ec1_pkey on ec1
Index Cond: (ff = '42'::bigint)
+ -> Materialize
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Append
+ -> Seq Scan on ec1 ec1_1
+ -> Seq Scan on ec1 ec1_2
+ -> Seq Scan on ec1 ec1_3
(19 rows)
- -- excluding as XL does not support complex queries
- -- with 'union all'
+-- let's try that as a mergejoin
+set enable_mergejoin = on;
+set enable_nestloop = off;
-- check partially indexed scan
set enable_nestloop = on;
set enable_mergejoin = off;
create event trigger regress_event_trigger2 on ddl_command_start
when tag in ('create table', 'CREATE FUNCTION')
execute procedure test_event_trigger();
+ERROR: EVENT TRIGGER not yet supported in Postgres-XL
-- OK
comment on event trigger regress_event_trigger is 'test comment';
- -- should fail, event triggers are not schema objects
- comment on event trigger wrong.regress_event_trigger is 'test comment';
- ERROR: event trigger name cannot be qualified
+ERROR: event trigger "regress_event_trigger" does not exist
-- drop as non-superuser should fail
create role regress_evt_user;
set role regress_evt_user;
DROP FOREIGN DATA WRAPPER IF EXISTS nonexistent;
NOTICE: foreign-data wrapper "nonexistent" does not exist, skipping
\dew+
- List of foreign-data wrappers
- Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
-------------+---------------------------+---------+--------------------------+-------------------+------------------------------+-------------
- dummy | regress_foreign_data_user | - | - | | | useless
- foo | regress_test_role_super | - | - | | (b '3', c '4', a '2', d '5') |
- postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | |
-(3 rows)
+ List of foreign-data wrappers
+ Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
+------+-------+---------+-----------+-------------------+-------------+-------------
+(0 rows)
DROP ROLE regress_test_role_super; -- ERROR
-ERROR: role "regress_test_role_super" cannot be dropped because some objects depend on it
-DETAIL: owner of foreign-data wrapper foo
SET ROLE regress_test_role_super;
+ERROR: role "regress_test_role_super" does not exist
DROP FOREIGN DATA WRAPPER foo;
+ERROR: foreign-data wrapper "foo" does not exist
RESET ROLE;
DROP ROLE regress_test_role_super;
+ERROR: role "regress_test_role_super" does not exist
\dew+
- List of foreign-data wrappers
- Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
-------------+---------------------------+---------+--------------------------+-------------------+-------------+-------------
- dummy | regress_foreign_data_user | - | - | | | useless
- postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | |
-(2 rows)
+ List of foreign-data wrappers
+ Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
+------+-------+---------+-----------+-------------------+-------------+-------------
+(0 rows)
CREATE FOREIGN DATA WRAPPER foo;
+ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet
+DETAIL: The feature is not currently supported
CREATE SERVER s1 FOREIGN DATA WRAPPER foo;
+ERROR: Postgres-XL does not support SERVER yet
+DETAIL: The feature is not currently supported
COMMENT ON SERVER s1 IS 'foreign server';
+ERROR: server "s1" does not exist
CREATE USER MAPPING FOR current_user SERVER s1;
+ERROR: Postgres-XL does not support USER MAPPING yet
+DETAIL: The feature is not currently supported
+ CREATE USER MAPPING FOR current_user SERVER s1; -- ERROR
+ ERROR: user mapping for "regress_foreign_data_user" already exists for server s1
+ CREATE USER MAPPING IF NOT EXISTS FOR current_user SERVER s1; -- NOTICE
+ NOTICE: user mapping for "regress_foreign_data_user" already exists for server s1, skipping
\dew+
- List of foreign-data wrappers
- Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
-------------+---------------------------+---------+--------------------------+-------------------+-------------+-------------
- dummy | regress_foreign_data_user | - | - | | | useless
- foo | regress_foreign_data_user | - | - | | |
- postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | |
-(3 rows)
+ List of foreign-data wrappers
+ Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
+------+-------+---------+-----------+-------------------+-------------+-------------
+(0 rows)
\des+
- List of foreign servers
- Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW Options | Description
-------+---------------------------+----------------------+-------------------+------+---------+-------------+----------------
- s1 | regress_foreign_data_user | foo | | | | | foreign server
-(1 row)
+ List of foreign servers
+ Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW Options | Description
+------+-------+----------------------+-------------------+------+---------+-------------+-------------
+(0 rows)
\deu+
- List of user mappings
- Server | User name | FDW Options
---------+---------------------------+-------------
- s1 | regress_foreign_data_user |
-(1 row)
+ List of user mappings
+ Server | User name | FDW Options
+--------+-----------+-------------
+(0 rows)
DROP FOREIGN DATA WRAPPER foo; -- ERROR
-ERROR: cannot drop foreign-data wrapper foo because other objects depend on it
-DETAIL: server s1 depends on foreign-data wrapper foo
-user mapping for regress_foreign_data_user on server s1 depends on server s1
-HINT: Use DROP ... CASCADE to drop the dependent objects too.
+ERROR: foreign-data wrapper "foo" does not exist
SET ROLE regress_test_role;
DROP FOREIGN DATA WRAPPER foo CASCADE; -- ERROR
-ERROR: must be owner of foreign-data wrapper foo
+ERROR: foreign-data wrapper "foo" does not exist
RESET ROLE;
DROP FOREIGN DATA WRAPPER foo CASCADE;
-NOTICE: drop cascades to 2 other objects
-DETAIL: drop cascades to server s1
-drop cascades to user mapping for regress_foreign_data_user on server s1
+ERROR: foreign-data wrapper "foo" does not exist
\dew+
- List of foreign-data wrappers
- Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
-------------+---------------------------+---------+--------------------------+-------------------+-------------+-------------
- dummy | regress_foreign_data_user | - | - | | | useless
- postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | |
-(2 rows)
+ List of foreign-data wrappers
+ Name | Owner | Handler | Validator | Access privileges | FDW Options | Description
+------+-------+---------+-----------+-------------------+-------------+-------------
+(0 rows)
\des+
List of foreign servers
ERROR: permission denied to alter foreign-data wrapper "foo"
HINT: Must be superuser to alter a foreign-data wrapper.
DROP FOREIGN DATA WRAPPER foo; -- ERROR
-ERROR: must be owner of foreign-data wrapper foo
+ERROR: foreign-data wrapper "foo" does not exist
GRANT USAGE ON FOREIGN DATA WRAPPER postgresql TO regress_test_role; -- WARNING
-WARNING: no privileges were granted for "postgresql"
+ERROR: foreign-data wrapper "postgresql" does not exist
GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role;
+ERROR: foreign-data wrapper "foo" does not exist
CREATE SERVER s9 FOREIGN DATA WRAPPER postgresql;
+ERROR: Postgres-XL does not support SERVER yet
+DETAIL: The feature is not currently supported
ALTER SERVER s6 VERSION '0.5'; -- ERROR
-ERROR: must be owner of foreign server s6
+ERROR: server "s6" does not exist
DROP SERVER s6; -- ERROR
-ERROR: must be owner of foreign server s6
+ERROR: server "s6" does not exist
GRANT USAGE ON FOREIGN SERVER s6 TO regress_test_role; -- ERROR
-ERROR: permission denied for foreign server s6
+ERROR: server "s6" does not exist
GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role;
+ERROR: server "s9" does not exist
CREATE USER MAPPING FOR public SERVER s6; -- ERROR
-ERROR: must be owner of foreign server s6
+ERROR: Postgres-XL does not support USER MAPPING yet
+DETAIL: The feature is not currently supported
CREATE USER MAPPING FOR public SERVER s9;
+ERROR: Postgres-XL does not support USER MAPPING yet
+DETAIL: The feature is not currently supported
ALTER USER MAPPING FOR regress_test_role SERVER s6 OPTIONS (gotcha 'true'); -- ERROR
-ERROR: must be owner of foreign server s6
+ERROR: server "s6" does not exist
DROP USER MAPPING FOR regress_test_role SERVER s6; -- ERROR
-ERROR: must be owner of foreign server s6
+ERROR: server "s6" does not exist
RESET ROLE;
REVOKE USAGE ON FOREIGN DATA WRAPPER foo FROM regress_unprivileged_role; -- ERROR
-ERROR: dependent privileges exist
-HINT: Use CASCADE to revoke them too.
+ERROR: foreign-data wrapper "foo" does not exist
REVOKE USAGE ON FOREIGN DATA WRAPPER foo FROM regress_unprivileged_role CASCADE;
+ERROR: foreign-data wrapper "foo" does not exist
SET ROLE regress_unprivileged_role;
GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role; -- ERROR
-ERROR: permission denied for foreign-data wrapper foo
+ERROR: foreign-data wrapper "foo" does not exist
CREATE SERVER s10 FOREIGN DATA WRAPPER foo; -- ERROR
-ERROR: permission denied for foreign-data wrapper foo
+ERROR: Postgres-XL does not support SERVER yet
+DETAIL: The feature is not currently supported
ALTER SERVER s9 VERSION '1.1';
+ERROR: server "s9" does not exist
GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role;
+ERROR: server "s9" does not exist
CREATE USER MAPPING FOR current_user SERVER s9;
+ERROR: Postgres-XL does not support USER MAPPING yet
+DETAIL: The feature is not currently supported
DROP SERVER s9 CASCADE;
-NOTICE: drop cascades to 2 other objects
-DETAIL: drop cascades to user mapping for public on server s9
-drop cascades to user mapping for regress_unprivileged_role on server s9
+ERROR: server "s9" does not exist
RESET ROLE;
CREATE SERVER s9 FOREIGN DATA WRAPPER foo;
+ERROR: Postgres-XL does not support SERVER yet
+DETAIL: The feature is not currently supported
GRANT USAGE ON FOREIGN SERVER s9 TO regress_unprivileged_role;
+ERROR: server "s9" does not exist
SET ROLE regress_unprivileged_role;
ALTER SERVER s9 VERSION '1.2'; -- ERROR
-ERROR: must be owner of foreign server s9
+ERROR: server "s9" does not exist
GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role; -- WARNING
-WARNING: no privileges were granted for "s9"
+ERROR: server "s9" does not exist
CREATE USER MAPPING FOR current_user SERVER s9;
+ERROR: Postgres-XL does not support USER MAPPING yet
+DETAIL: The feature is not currently supported
DROP SERVER s9 CASCADE; -- ERROR
+ERROR: server "s9" does not exist
+ ERROR: must be owner of foreign server s9
+ -- Check visibility of user mapping data
+ SET ROLE regress_test_role;
+ CREATE SERVER s10 FOREIGN DATA WRAPPER foo;
+ CREATE USER MAPPING FOR public SERVER s10 OPTIONS (user 'secret');
+ GRANT USAGE ON FOREIGN SERVER s10 TO regress_unprivileged_role;
+ -- owner of server can see option fields
+ \deu+
+ List of user mappings
+ Server | User name | FDW Options
+ --------+---------------------------+-------------------
+ s10 | public | ("user" 'secret')
+ s4 | regress_foreign_data_user |
+ s5 | regress_test_role | (modified '1')
+ s6 | regress_test_role |
+ s8 | public |
+ s8 | regress_foreign_data_user |
+ s9 | regress_unprivileged_role |
+ t1 | public | (modified '1')
+ (8 rows)
+
+ RESET ROLE;
+ -- superuser can see option fields
+ \deu+
+ List of user mappings
+ Server | User name | FDW Options
+ --------+---------------------------+---------------------
+ s10 | public | ("user" 'secret')
+ s4 | regress_foreign_data_user |
+ s5 | regress_test_role | (modified '1')
+ s6 | regress_test_role |
+ s8 | public |
+ s8 | regress_foreign_data_user | (password 'public')
+ s9 | regress_unprivileged_role |
+ t1 | public | (modified '1')
+ (8 rows)
+
+ -- unprivileged user cannot see option fields
+ SET ROLE regress_unprivileged_role;
+ \deu+
+ List of user mappings
+ Server | User name | FDW Options
+ --------+---------------------------+-------------
+ s10 | public |
+ s4 | regress_foreign_data_user |
+ s5 | regress_test_role |
+ s6 | regress_test_role |
+ s8 | public |
+ s8 | regress_foreign_data_user |
+ s9 | regress_unprivileged_role |
+ t1 | public |
+ (8 rows)
+
RESET ROLE;
+ DROP SERVER s10 CASCADE;
+ NOTICE: drop cascades to user mapping for public on server s10
-- Triggers
CREATE FUNCTION dummy_trigger() RETURNS TRIGGER AS $$
BEGIN
ON foreign_schema.foreign_table_1
FOR EACH STATEMENT
EXECUTE PROCEDURE dummy_trigger();
+ERROR: Postgres-XL does not support TRIGGER yet
+DETAIL: The feature is not currently supported
+ CREATE TRIGGER trigtest_after_stmt_tt AFTER INSERT OR UPDATE OR DELETE -- ERROR
+ ON foreign_schema.foreign_table_1
+ REFERENCING NEW TABLE AS new_table
+ FOR EACH STATEMENT
+ EXECUTE PROCEDURE dummy_trigger();
+ ERROR: "foreign_table_1" is a foreign table
+ DETAIL: Triggers on foreign tables cannot have transition tables.
CREATE TRIGGER trigtest_before_row BEFORE INSERT OR UPDATE OR DELETE
ON foreign_schema.foreign_table_1
FOR EACH ROW
);
CREATE FOREIGN TABLE ft2 () INHERITS (pt1)
SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ERROR: server "s0" does not exist
\d+ pt1
- Table "public.pt1"
- Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
---------+---------+-----------+----------+---------+----------+--------------+-------------
- c1 | integer | | not null | | plain | |
- c2 | text | | | | extended | |
- c3 | date | | | | plain | |
-Child tables: ft2
+ Table "public.pt1"
+ Column | Type | Modifiers | Storage | Stats target | Description
+--------+---------+-----------+----------+--------------+-------------
+ c1 | integer | not null | plain | |
+ c2 | text | | extended | |
+ c3 | date | | plain | |
+Distribute By: HASH(c1)
+Location Nodes: ALL DATANODES
\d+ ft2
- Foreign table "public.ft2"
- Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
- c1 | integer | | not null | | | plain | |
- c2 | text | | | | | extended | |
- c3 | date | | | | | plain | |
-Server: s0
-FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
-Inherits: pt1
-
DROP FOREIGN TABLE ft2;
+ERROR: foreign table "ft2" does not exist
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- c1 | integer | not null | plain | |
- c2 | text | | extended | |
- c3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
+Distribute By: HASH(c1)
+Location Nodes: ALL DATANODES
CREATE FOREIGN TABLE ft2 (
c1 integer NOT NULL,
c2 text,
c3 date
) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ERROR: server "s0" does not exist
\d+ ft2
- Foreign table "public.ft2"
- Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
- c1 | integer | | not null | | | plain | |
- c2 | text | | | | | extended | |
- c3 | date | | | | | plain | |
-Server: s0
-FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
-
ALTER FOREIGN TABLE ft2 INHERIT pt1;
+ERROR: relation "ft2" does not exist
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- c1 | integer | not null | plain | |
- c2 | text | | extended | |
- c3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
-Child tables: ft2
+Distribute By: HASH(c1)
+Location Nodes: ALL DATANODES
\d+ ft2
+ Foreign table "public.ft2"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | | | | extended | |
+ c3 | date | | | | | plain | |
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+ Inherits: pt1
+
CREATE TABLE ct3() INHERITS(ft2);
+ERROR: relation "ft2" does not exist
CREATE FOREIGN TABLE ft3 (
c1 integer NOT NULL,
c2 text,
c2 text,
c3 date
) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ERROR: server "s0" does not exist
-- child must have parent's INHERIT constraints
ALTER FOREIGN TABLE ft2 INHERIT pt1; -- ERROR
-ERROR: child table is missing constraint "pt1chk2"
+ERROR: relation "ft2" does not exist
ALTER FOREIGN TABLE ft2 ADD CONSTRAINT pt1chk2 CHECK (c2 <> '');
+ERROR: relation "ft2" does not exist
ALTER FOREIGN TABLE ft2 INHERIT pt1;
+ERROR: relation "ft2" does not exist
-- child does not inherit NO INHERIT constraints
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- c1 | integer | not null | plain | 10000 |
- c2 | text | | extended | |
- c3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | 10000 |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
Check constraints:
"pt1chk1" CHECK (c1 > 0) NO INHERIT
"pt1chk2" CHECK (c2 <> ''::text)
INSERT INTO pt1 VALUES (1, 'pt1'::text, '1994-01-01'::date);
ALTER TABLE pt1 ADD CONSTRAINT pt1chk3 CHECK (c2 <> '') NOT VALID;
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- c1 | integer | not null | plain | 10000 |
- c2 | text | | extended | |
- c3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | 10000 |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
Check constraints:
"pt1chk3" CHECK (c2 <> ''::text) NOT VALID
-Child tables: ft2
+Distribute By: HASH(c1)
+Location Nodes: ALL DATANODES
\d+ ft2
- Foreign table "public.ft2"
- Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
- c1 | integer | | not null | | | plain | |
- c2 | text | | | | | extended | |
- c3 | date | | | | | plain | |
-Check constraints:
- "pt1chk2" CHECK (c2 <> ''::text)
- "pt1chk3" CHECK (c2 <> ''::text) NOT VALID
-Server: s0
-FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
-Inherits: pt1
-
-- VALIDATE CONSTRAINT need do nothing on foreign tables
ALTER TABLE pt1 VALIDATE CONSTRAINT pt1chk3;
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- c1 | integer | not null | plain | 10000 |
- c2 | text | | extended | |
- c3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | 10000 |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
Check constraints:
"pt1chk3" CHECK (c2 <> ''::text)
-Child tables: ft2
+Distribute By: HASH(c1)
+Location Nodes: ALL DATANODES
\d+ ft2
- Foreign table "public.ft2"
- Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
- c1 | integer | | not null | | | plain | |
- c2 | text | | | | | extended | |
- c3 | date | | | | | plain | |
-Check constraints:
- "pt1chk2" CHECK (c2 <> ''::text)
- "pt1chk3" CHECK (c2 <> ''::text)
-Server: s0
-FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
-Inherits: pt1
-
-- OID system column
ALTER TABLE pt1 SET WITH OIDS;
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- c1 | integer | not null | plain | 10000 |
- c2 | text | | extended | |
- c3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | 10000 |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
Check constraints:
"pt1chk3" CHECK (c2 <> ''::text)
-Child tables: ft2
Has OIDs: yes
+Distribute By: HASH(c1)
+Location Nodes: ALL DATANODES
\d+ ft2
- Foreign table "public.ft2"
- Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
- c1 | integer | | not null | | | plain | |
- c2 | text | | | | | extended | |
- c3 | date | | | | | plain | |
-Check constraints:
- "pt1chk2" CHECK (c2 <> ''::text)
- "pt1chk3" CHECK (c2 <> ''::text)
-Server: s0
-FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
-Inherits: pt1
-Has OIDs: yes
-
ALTER TABLE ft2 SET WITHOUT OIDS; -- ERROR
-ERROR: cannot drop inherited column "oid"
+ERROR: relation "ft2" does not exist
ALTER TABLE pt1 SET WITHOUT OIDS;
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- c1 | integer | not null | plain | 10000 |
- c2 | text | | extended | |
- c3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | 10000 |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
Check constraints:
"pt1chk3" CHECK (c2 <> ''::text)
-Child tables: ft2
+Distribute By: HASH(c1)
+Location Nodes: ALL DATANODES
\d+ ft2
- Foreign table "public.ft2"
- Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
- c1 | integer | | not null | | | plain | |
- c2 | text | | | | | extended | |
- c3 | date | | | | | plain | |
-Check constraints:
- "pt1chk2" CHECK (c2 <> ''::text)
- "pt1chk3" CHECK (c2 <> ''::text)
-Server: s0
-FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
-Inherits: pt1
-
-- changes name of an attribute recursively
ALTER TABLE pt1 RENAME COLUMN c1 TO f1;
ALTER TABLE pt1 RENAME COLUMN c2 TO f2;
-- changes name of a constraint recursively
ALTER TABLE pt1 RENAME CONSTRAINT pt1chk3 TO f2_check;
\d+ pt1
- Table "public.pt1"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+----------+--------------+-------------
- f1 | integer | not null | plain | 10000 |
- f2 | text | | extended | |
- f3 | date | | plain | |
+ Table "public.pt1"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ f1 | integer | | not null | | plain | 10000 |
+ f2 | text | | | | extended | |
+ f3 | date | | | | plain | |
Check constraints:
"f2_check" CHECK (f2 <> ''::text)
-Child tables: ft2
+Distribute By: HASH(f1)
+Location Nodes: ALL DATANODES
\d+ ft2
- Foreign table "public.ft2"
- Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
---------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
- f1 | integer | | not null | | | plain | |
- f2 | text | | | | | extended | |
- f3 | date | | | | | plain | |
-Check constraints:
- "f2_check" CHECK (f2 <> ''::text)
- "pt1chk2" CHECK (f2 <> ''::text)
-Server: s0
-FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
-Inherits: pt1
-
-- TRUNCATE doesn't work on foreign tables, either directly or recursively
TRUNCATE ft2; -- ERROR
-ERROR: "ft2" is not a table
+ERROR: relation "ft2" does not exist
TRUNCATE pt1; -- ERROR
-ERROR: "ft2" is not a table
DROP TABLE pt1 CASCADE;
-NOTICE: drop cascades to foreign table ft2
-- IMPORT FOREIGN SCHEMA
IMPORT FOREIGN SCHEMA s1 FROM SERVER s9 INTO public; -- ERROR
-ERROR: foreign-data wrapper "foo" has no handler
+ERROR: server "s9" does not exist
IMPORT FOREIGN SCHEMA s1 LIMIT TO (t1) FROM SERVER s9 INTO public; --ERROR
-ERROR: foreign-data wrapper "foo" has no handler
+ERROR: server "s9" does not exist
IMPORT FOREIGN SCHEMA s1 EXCEPT (t1) FROM SERVER s9 INTO public; -- ERROR
-ERROR: foreign-data wrapper "foo" has no handler
+ERROR: server "s9" does not exist
IMPORT FOREIGN SCHEMA s1 EXCEPT (t1, t2) FROM SERVER s9 INTO public
OPTIONS (option1 'value1', option2 'value2'); -- ERROR
-ERROR: foreign-data wrapper "foo" has no handler
+ERROR: server "s9" does not exist
-- DROP FOREIGN TABLE
DROP FOREIGN TABLE no_table; -- ERROR
ERROR: foreign table "no_table" does not exist
-- REASSIGN OWNED/DROP OWNED of foreign objects
REASSIGN OWNED BY regress_test_role TO regress_test_role2;
DROP OWNED BY regress_test_role2;
-ERROR: cannot drop desired object(s) because other objects depend on them
-DETAIL: user mapping for regress_test_role on server s5 depends on server s5
-HINT: Use DROP ... CASCADE to drop the dependent objects too.
DROP OWNED BY regress_test_role2 CASCADE;
+ NOTICE: drop cascades to user mapping for regress_test_role on server s5
+ -- Foreign partition DDL stuff
+ CREATE TABLE pt2 (
+ c1 integer NOT NULL,
+ c2 text,
+ c3 date
+ ) PARTITION BY LIST (c1);
+ CREATE FOREIGN TABLE pt2_1 PARTITION OF pt2 FOR VALUES IN (1)
+ SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ \d+ pt2
+ Table "public.pt2"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
+ Partition key: LIST (c1)
+ Partitions: pt2_1 FOR VALUES IN (1)
+
+ \d+ pt2_1
+ Foreign table "public.pt2_1"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | | | | extended | |
+ c3 | date | | | | | plain | |
+ Partition of: pt2 FOR VALUES IN (1)
+ Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1])))
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+
+ -- partition cannot have additional columns
+ DROP FOREIGN TABLE pt2_1;
+ CREATE FOREIGN TABLE pt2_1 (
+ c1 integer NOT NULL,
+ c2 text,
+ c3 date,
+ c4 char
+ ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ \d+ pt2_1
+ Foreign table "public.pt2_1"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+--------------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | | | | extended | |
+ c3 | date | | | | | plain | |
+ c4 | character(1) | | | | | extended | |
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -- ERROR
+ ERROR: table "pt2_1" contains column "c4" not found in parent "pt2"
+ DETAIL: New partition should contain only the columns present in parent.
+ DROP FOREIGN TABLE pt2_1;
+ \d+ pt2
+ Table "public.pt2"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
+ Partition key: LIST (c1)
+
+ CREATE FOREIGN TABLE pt2_1 (
+ c1 integer NOT NULL,
+ c2 text,
+ c3 date
+ ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value');
+ \d+ pt2_1
+ Foreign table "public.pt2_1"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | | | | extended | |
+ c3 | date | | | | | plain | |
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+
+ -- no attach partition validation occurs for foreign tables
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1);
+ \d+ pt2
+ Table "public.pt2"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
+ Partition key: LIST (c1)
+ Partitions: pt2_1 FOR VALUES IN (1)
+
+ \d+ pt2_1
+ Foreign table "public.pt2_1"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | | | | extended | |
+ c3 | date | | | | | plain | |
+ Partition of: pt2 FOR VALUES IN (1)
+ Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1])))
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+
+ -- cannot add column to a partition
+ ALTER TABLE pt2_1 ADD c4 char;
+ ERROR: cannot add column to a partition
+ -- ok to have a partition's own constraints though
+ ALTER TABLE pt2_1 ALTER c3 SET NOT NULL;
+ ALTER TABLE pt2_1 ADD CONSTRAINT p21chk CHECK (c2 <> '');
+ \d+ pt2
+ Table "public.pt2"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | | | extended | |
+ c3 | date | | | | plain | |
+ Partition key: LIST (c1)
+ Partitions: pt2_1 FOR VALUES IN (1)
+
+ \d+ pt2_1
+ Foreign table "public.pt2_1"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | | | | extended | |
+ c3 | date | | not null | | | plain | |
+ Partition of: pt2 FOR VALUES IN (1)
+ Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1])))
+ Check constraints:
+ "p21chk" CHECK (c2 <> ''::text)
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+
+ -- cannot drop inherited NOT NULL constraint from a partition
+ ALTER TABLE pt2_1 ALTER c1 DROP NOT NULL;
+ ERROR: column "c1" is marked NOT NULL in parent table
+ -- partition must have parent's constraints
+ ALTER TABLE pt2 DETACH PARTITION pt2_1;
+ ALTER TABLE pt2 ALTER c2 SET NOT NULL;
+ \d+ pt2
+ Table "public.pt2"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | not null | | extended | |
+ c3 | date | | | | plain | |
+ Partition key: LIST (c1)
+
+ \d+ pt2_1
+ Foreign table "public.pt2_1"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | | | | extended | |
+ c3 | date | | not null | | | plain | |
+ Check constraints:
+ "p21chk" CHECK (c2 <> ''::text)
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -- ERROR
+ ERROR: column "c2" in child table must be marked NOT NULL
+ ALTER FOREIGN TABLE pt2_1 ALTER c2 SET NOT NULL;
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1);
+ ALTER TABLE pt2 DETACH PARTITION pt2_1;
+ ALTER TABLE pt2 ADD CONSTRAINT pt2chk1 CHECK (c1 > 0);
+ \d+ pt2
+ Table "public.pt2"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+----------+--------------+-------------
+ c1 | integer | | not null | | plain | |
+ c2 | text | | not null | | extended | |
+ c3 | date | | | | plain | |
+ Partition key: LIST (c1)
+ Check constraints:
+ "pt2chk1" CHECK (c1 > 0)
+
+ \d+ pt2_1
+ Foreign table "public.pt2_1"
+ Column | Type | Collation | Nullable | Default | FDW Options | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+-------------+----------+--------------+-------------
+ c1 | integer | | not null | | | plain | |
+ c2 | text | | not null | | | extended | |
+ c3 | date | | not null | | | plain | |
+ Check constraints:
+ "p21chk" CHECK (c2 <> ''::text)
+ Server: s0
+ FDW Options: (delimiter ',', quote '"', "be quoted" 'value')
+
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -- ERROR
+ ERROR: child table is missing constraint "pt2chk1"
+ ALTER FOREIGN TABLE pt2_1 ADD CONSTRAINT pt2chk1 CHECK (c1 > 0);
+ ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1);
+ -- TRUNCATE doesn't work on foreign tables, either directly or recursively
+ TRUNCATE pt2_1; -- ERROR
+ ERROR: "pt2_1" is not a table
+ TRUNCATE pt2; -- ERROR
+ ERROR: "pt2_1" is not a table
+ DROP FOREIGN TABLE pt2_1;
+ DROP TABLE pt2;
-- Cleanup
DROP SCHEMA foreign_schema CASCADE;
DROP ROLE regress_test_role; -- ERROR
-- Test interaction of foreign-key optimization with rules (bug #14219)
--
create temp table t1 (a integer primary key, b text);
-create temp table t2 (a integer primary key, b integer references t1);
+create temp table t2 (a integer, b integer references t1) distribute by hash (b);
create rule r1 as on delete to t1 do delete from t2 where t2.b = old.a;
explain (costs off) delete from t1 where a = 1;
+ERROR: could not plan this distributed delete
+DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL.
+delete from t1 where a = 1;
+ERROR: could not plan this distributed delete
+DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL.
+drop rule r1 on t1;
+explain (costs off, nodes off) delete from t1 where a = 1;
QUERY PLAN
--------------------------------------------
- Delete on t2
- -> Nested Loop
+ Remote Fast Query Execution
+ -> Delete on t1
-> Index Scan using t1_pkey on t1
Index Cond: (a = 1)
- -> Seq Scan on t2
- Filter: (b = 1)
-
- Delete on t1
- -> Index Scan using t1_pkey on t1
- Index Cond: (a = 1)
-(10 rows)
+(4 rows)
delete from t1 where a = 1;
+ --
+ -- Test deferred FK check on a tuple deleted by a rolled-back subtransaction
+ --
+ create table pktable2(f1 int primary key);
+ create table fktable2(f1 int references pktable2 deferrable initially deferred);
+ insert into pktable2 values(1);
+ begin;
+ insert into fktable2 values(1);
+ savepoint x;
+ delete from fktable2;
+ rollback to x;
+ commit;
+ begin;
+ insert into fktable2 values(2);
+ savepoint x;
+ delete from fktable2;
+ rollback to x;
+ commit; -- fail
+ ERROR: insert or update on table "fktable2" violates foreign key constraint "fktable2_f1_fkey"
+ DETAIL: Key (f1)=(2) is not present in table "pktable2".
+ --
+ -- Test that we prevent dropping FK constraint with pending trigger events
+ --
+ begin;
+ insert into fktable2 values(2);
+ alter table fktable2 drop constraint fktable2_f1_fkey;
+ ERROR: cannot ALTER TABLE "fktable2" because it has pending trigger events
+ commit;
+ begin;
+ delete from pktable2 where f1 = 1;
+ alter table fktable2 drop constraint fktable2_f1_fkey;
+ ERROR: cannot ALTER TABLE "pktable2" because it has pending trigger events
+ commit;
+ drop table pktable2, fktable2;
ERROR: aggregate functions are not allowed in FROM clause of their own query level
LINE 3: lateral (select a, b, sum(v.x) from gstest_data(v.x) ...
^
- -- min max optimisation should still work with GROUP BY ()
+ -- min max optimization should still work with GROUP BY ()
explain (costs off)
select min(unique1) from tenk1 GROUP BY ();
- QUERY PLAN
-------------------------------------------------------------
+ QUERY PLAN
+------------------------------------------------------------------------
Result
InitPlan 1 (returns $0)
-> Limit
substring
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(two-compressed,1,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
+ ("one-toasted,one-null",1,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
(two-toasted,1,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345
("one-compressed,one-null",1,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
- ("one-toasted,one-null",1,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
(4 rows)
- -- modification without modifying asigned value
+ -- modification without modifying assigned value
UPDATE toasttest SET cnt = cnt +1, f1 = f1 RETURNING substring(toasttest::text, 1, 200);
substring
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
substring
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(two-compressed,4,-1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
+ ("one-toasted,one-null",4,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
(two-toasted,4,-1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234
("one-compressed,one-null",4,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
- ("one-toasted,one-null",4,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
(4 rows)
- -- check we didn't screw with main/toast tuple visiblity
+ -- check we didn't screw with main/toast tuple visibility
VACUUM FREEZE toasttest;
SELECT substring(toasttest::text, 1, 200) FROM toasttest;
substring
substring
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(two-compressed,5,-1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
+ ("one-toasted,one-null",5,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
(two-toasted,5,-1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234
("one-compressed,one-null",5,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
- ("one-toasted,one-null",5,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
(4 rows)
- -- modification without modifying asigned value
+ -- modification without modifying assigned value
UPDATE toasttest SET cnt = cnt +1, f1 = f1 RETURNING substring(toasttest::text, 1, 200);
substring
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
substring
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(two-compressed,8,--123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
- (two-toasted,8,--123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
- ("one-compressed,one-null",8,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
("one-toasted,one-null",8,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
("one-toasted,one-null, via indirect",0,1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
+ (two-toasted,8,--123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
+ ("one-compressed,one-null",8,,12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
(5 rows)
- -- check we didn't screw with main/toast tuple visiblity
+ -- check we didn't screw with main/toast tuple visibility
VACUUM FREEZE toasttest;
SELECT substring(toasttest::text, 1, 200) FROM toasttest;
substring
SET enable_seqscan TO on;
DROP INDEX inet_idx2;
+ -- check that spgist index works correctly
+ CREATE INDEX inet_idx3 ON inet_tbl using spgist (i);
+ SET enable_seqscan TO off;
+ SELECT * FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ ----------------+------------------
+ 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.0/24 | 192.168.1.255/25
+ 192.168.1.0/26 | 192.168.1.226
+ (3 rows)
+
+ SELECT * FROM inet_tbl WHERE i <<= '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ ----------------+------------------
+ 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.0/24 | 192.168.1.226/24
+ 192.168.1.0/24 | 192.168.1.255/24
+ 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.0/24 | 192.168.1.255/25
+ 192.168.1.0/26 | 192.168.1.226
+ (6 rows)
+
+ SELECT * FROM inet_tbl WHERE i && '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ ----------------+------------------
+ 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.0/24 | 192.168.1.226/24
+ 192.168.1.0/24 | 192.168.1.255/24
+ 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.0/24 | 192.168.1.255/25
+ 192.168.1.0/26 | 192.168.1.226
+ (6 rows)
+
+ SELECT * FROM inet_tbl WHERE i >>= '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ ----------------+------------------
+ 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.0/24 | 192.168.1.226/24
+ 192.168.1.0/24 | 192.168.1.255/24
+ (3 rows)
+
+ SELECT * FROM inet_tbl WHERE i >> '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ ---+---
+ (0 rows)
+
+ SELECT * FROM inet_tbl WHERE i < '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ -------------+-------------
+ 10.0.0.0/8 | 9.1.2.3/8
+ 10.0.0.0/32 | 10.1.2.3/8
+ 10.0.0.0/8 | 10.1.2.3/8
+ 10.0.0.0/8 | 10.1.2.3/8
+ 10.1.0.0/16 | 10.1.2.3/16
+ 10.1.2.0/24 | 10.1.2.3/24
+ 10.1.2.3/32 | 10.1.2.3
+ 10.0.0.0/8 | 11.1.2.3/8
+ (8 rows)
+
+ SELECT * FROM inet_tbl WHERE i <= '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ ----------------+----------------
+ 10.0.0.0/8 | 9.1.2.3/8
+ 10.0.0.0/8 | 10.1.2.3/8
+ 10.0.0.0/32 | 10.1.2.3/8
+ 10.0.0.0/8 | 10.1.2.3/8
+ 10.1.0.0/16 | 10.1.2.3/16
+ 10.1.2.0/24 | 10.1.2.3/24
+ 10.1.2.3/32 | 10.1.2.3
+ 10.0.0.0/8 | 11.1.2.3/8
+ 192.168.1.0/24 | 192.168.1.0/24
+ (9 rows)
+
+ SELECT * FROM inet_tbl WHERE i = '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ ----------------+----------------
+ 192.168.1.0/24 | 192.168.1.0/24
+ (1 row)
+
+ SELECT * FROM inet_tbl WHERE i >= '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ --------------------+------------------
+ 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.0/24 | 192.168.1.226/24
+ 192.168.1.0/24 | 192.168.1.255/24
+ 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.0/24 | 192.168.1.255/25
+ 192.168.1.0/26 | 192.168.1.226
+ ::ffff:1.2.3.4/128 | ::4.3.2.1/24
+ 10:23::f1/128 | 10:23::f1/64
+ 10:23::8000/113 | 10:23::ffff
+ (9 rows)
+
+ SELECT * FROM inet_tbl WHERE i > '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ --------------------+------------------
+ 192.168.1.0/24 | 192.168.1.226/24
+ 192.168.1.0/24 | 192.168.1.255/24
+ 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.0/24 | 192.168.1.255/25
+ 192.168.1.0/26 | 192.168.1.226
+ ::ffff:1.2.3.4/128 | ::4.3.2.1/24
+ 10:23::f1/128 | 10:23::f1/64
+ 10:23::8000/113 | 10:23::ffff
+ (8 rows)
+
+ SELECT * FROM inet_tbl WHERE i <> '192.168.1.0/24'::cidr ORDER BY i;
+ c | i
+ --------------------+------------------
+ 10.0.0.0/8 | 9.1.2.3/8
+ 10.0.0.0/8 | 10.1.2.3/8
+ 10.0.0.0/32 | 10.1.2.3/8
+ 10.0.0.0/8 | 10.1.2.3/8
+ 10.1.0.0/16 | 10.1.2.3/16
+ 10.1.2.0/24 | 10.1.2.3/24
+ 10.1.2.3/32 | 10.1.2.3
+ 10.0.0.0/8 | 11.1.2.3/8
+ 192.168.1.0/24 | 192.168.1.226/24
+ 192.168.1.0/24 | 192.168.1.255/24
+ 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.0/24 | 192.168.1.255/25
+ 192.168.1.0/26 | 192.168.1.226
+ ::ffff:1.2.3.4/128 | ::4.3.2.1/24
+ 10:23::f1/128 | 10:23::f1/64
+ 10:23::8000/113 | 10:23::ffff
+ (16 rows)
+
+ -- test index-only scans
+ EXPLAIN (COSTS OFF)
+ SELECT i FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+ QUERY PLAN
+ ---------------------------------------------------
+ Sort
+ Sort Key: i
+ -> Index Only Scan using inet_idx3 on inet_tbl
+ Index Cond: (i << '192.168.1.0/24'::inet)
+ (4 rows)
+
+ SELECT i FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+ i
+ ------------------
+ 192.168.1.0/25
+ 192.168.1.255/25
+ 192.168.1.226
+ (3 rows)
+
+ SET enable_seqscan TO on;
+ DROP INDEX inet_idx3;
-- simple tests of inet boolean and arithmetic operators
-SELECT i, ~i AS "~i" FROM inet_tbl;
+SELECT i, ~i AS "~i" FROM inet_tbl ORDER BY i;
i | ~i
------------------+--------------------------------------------
- 192.168.1.226/24 | 63.87.254.29/24
- 192.168.1.226 | 63.87.254.29
- 192.168.1.0/24 | 63.87.254.255/24
- 192.168.1.0/25 | 63.87.254.255/25
- 192.168.1.255/24 | 63.87.254.0/24
- 192.168.1.255/25 | 63.87.254.0/25
+ 9.1.2.3/8 | 246.254.253.252/8
10.1.2.3/8 | 245.254.253.252/8
10.1.2.3/8 | 245.254.253.252/8
- 10.1.2.3 | 245.254.253.252
- 10.1.2.3/24 | 245.254.253.252/24
- 10.1.2.3/16 | 245.254.253.252/16
10.1.2.3/8 | 245.254.253.252/8
+ 10.1.2.3/16 | 245.254.253.252/16
+ 10.1.2.3/24 | 245.254.253.252/24
+ 10.1.2.3 | 245.254.253.252
11.1.2.3/8 | 244.254.253.252/8
- 9.1.2.3/8 | 246.254.253.252/8
+ 192.168.1.0/24 | 63.87.254.255/24
+ 192.168.1.226/24 | 63.87.254.29/24
+ 192.168.1.255/24 | 63.87.254.0/24
+ 192.168.1.0/25 | 63.87.254.255/25
+ 192.168.1.255/25 | 63.87.254.0/25
+ 192.168.1.226 | 63.87.254.29
+ ::4.3.2.1/24 | ffff:ffff:ffff:ffff:ffff:ffff:fbfc:fdfe/24
10:23::f1/64 | ffef:ffdc:ffff:ffff:ffff:ffff:ffff:ff0e/64
10:23::ffff | ffef:ffdc:ffff:ffff:ffff:ffff:ffff:0
- ::4.3.2.1/24 | ffff:ffff:ffff:ffff:ffff:ffff:fbfc:fdfe/24
(17 rows)
-SELECT i, c, i & c AS "and" FROM inet_tbl;
+SELECT i, c, i & c AS "and" FROM inet_tbl ORDER BY i, c;
i | c | and
------------------+--------------------+----------------
- 192.168.1.226/24 | 192.168.1.0/24 | 192.168.1.0/24
- 192.168.1.226 | 192.168.1.0/26 | 192.168.1.0
- 192.168.1.0/24 | 192.168.1.0/24 | 192.168.1.0/24
- 192.168.1.0/25 | 192.168.1.0/24 | 192.168.1.0/25
- 192.168.1.255/24 | 192.168.1.0/24 | 192.168.1.0/24
- 192.168.1.255/25 | 192.168.1.0/24 | 192.168.1.0/25
+ 9.1.2.3/8 | 10.0.0.0/8 | 8.0.0.0/8
+ 10.1.2.3/8 | 10.0.0.0/8 | 10.0.0.0/8
10.1.2.3/8 | 10.0.0.0/8 | 10.0.0.0/8
10.1.2.3/8 | 10.0.0.0/32 | 10.0.0.0
- 10.1.2.3 | 10.1.2.3/32 | 10.1.2.3
- 10.1.2.3/24 | 10.1.2.0/24 | 10.1.2.0/24
10.1.2.3/16 | 10.1.0.0/16 | 10.1.0.0/16
- 10.1.2.3/8 | 10.0.0.0/8 | 10.0.0.0/8
+ 10.1.2.3/24 | 10.1.2.0/24 | 10.1.2.0/24
+ 10.1.2.3 | 10.1.2.3/32 | 10.1.2.3
11.1.2.3/8 | 10.0.0.0/8 | 10.0.0.0/8
- 9.1.2.3/8 | 10.0.0.0/8 | 8.0.0.0/8
+ 192.168.1.0/24 | 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.226/24 | 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.255/24 | 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.0/25 | 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.255/25 | 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.226 | 192.168.1.0/26 | 192.168.1.0
+ ::4.3.2.1/24 | ::ffff:1.2.3.4/128 | ::0.2.2.0
10:23::f1/64 | 10:23::f1/128 | 10:23::f1
10:23::ffff | 10:23::8000/113 | 10:23::8000
- ::4.3.2.1/24 | ::ffff:1.2.3.4/128 | ::0.2.2.0
(17 rows)
-SELECT i, c, i | c AS "or" FROM inet_tbl;
+SELECT i, c, i | c AS "or" FROM inet_tbl ORDER BY i, c;
i | c | or
------------------+--------------------+------------------
- 192.168.1.226/24 | 192.168.1.0/24 | 192.168.1.226/24
- 192.168.1.226 | 192.168.1.0/26 | 192.168.1.226
- 192.168.1.0/24 | 192.168.1.0/24 | 192.168.1.0/24
- 192.168.1.0/25 | 192.168.1.0/24 | 192.168.1.0/25
- 192.168.1.255/24 | 192.168.1.0/24 | 192.168.1.255/24
- 192.168.1.255/25 | 192.168.1.0/24 | 192.168.1.255/25
+ 9.1.2.3/8 | 10.0.0.0/8 | 11.1.2.3/8
+ 10.1.2.3/8 | 10.0.0.0/8 | 10.1.2.3/8
10.1.2.3/8 | 10.0.0.0/8 | 10.1.2.3/8
10.1.2.3/8 | 10.0.0.0/32 | 10.1.2.3
- 10.1.2.3 | 10.1.2.3/32 | 10.1.2.3
- 10.1.2.3/24 | 10.1.2.0/24 | 10.1.2.3/24
10.1.2.3/16 | 10.1.0.0/16 | 10.1.2.3/16
- 10.1.2.3/8 | 10.0.0.0/8 | 10.1.2.3/8
+ 10.1.2.3/24 | 10.1.2.0/24 | 10.1.2.3/24
+ 10.1.2.3 | 10.1.2.3/32 | 10.1.2.3
11.1.2.3/8 | 10.0.0.0/8 | 11.1.2.3/8
- 9.1.2.3/8 | 10.0.0.0/8 | 11.1.2.3/8
+ 192.168.1.0/24 | 192.168.1.0/24 | 192.168.1.0/24
+ 192.168.1.226/24 | 192.168.1.0/24 | 192.168.1.226/24
+ 192.168.1.255/24 | 192.168.1.0/24 | 192.168.1.255/24
+ 192.168.1.0/25 | 192.168.1.0/24 | 192.168.1.0/25
+ 192.168.1.255/25 | 192.168.1.0/24 | 192.168.1.255/25
+ 192.168.1.226 | 192.168.1.0/26 | 192.168.1.226
+ ::4.3.2.1/24 | ::ffff:1.2.3.4/128 | ::ffff:5.3.3.5
10:23::f1/64 | 10:23::f1/128 | 10:23::f1
10:23::ffff | 10:23::8000/113 | 10:23::ffff
- ::4.3.2.1/24 | ::ffff:1.2.3.4/128 | ::ffff:5.3.3.5
(17 rows)
-SELECT i, i + 500 AS "i+500" FROM inet_tbl;
+SELECT i, i + 500 AS "i+500" FROM inet_tbl ORDER BY i;
i | i+500
------------------+------------------
- 192.168.1.226/24 | 192.168.3.214/24
- 192.168.1.226 | 192.168.3.214
- 192.168.1.0/24 | 192.168.2.244/24
- 192.168.1.0/25 | 192.168.2.244/25
- 192.168.1.255/24 | 192.168.3.243/24
- 192.168.1.255/25 | 192.168.3.243/25
+ 9.1.2.3/8 | 9.1.3.247/8
10.1.2.3/8 | 10.1.3.247/8
10.1.2.3/8 | 10.1.3.247/8
- 10.1.2.3 | 10.1.3.247
- 10.1.2.3/24 | 10.1.3.247/24
- 10.1.2.3/16 | 10.1.3.247/16
10.1.2.3/8 | 10.1.3.247/8
+ 10.1.2.3/16 | 10.1.3.247/16
+ 10.1.2.3/24 | 10.1.3.247/24
+ 10.1.2.3 | 10.1.3.247
11.1.2.3/8 | 11.1.3.247/8
- 9.1.2.3/8 | 9.1.3.247/8
+ 192.168.1.0/24 | 192.168.2.244/24
+ 192.168.1.226/24 | 192.168.3.214/24
+ 192.168.1.255/24 | 192.168.3.243/24
+ 192.168.1.0/25 | 192.168.2.244/25
+ 192.168.1.255/25 | 192.168.3.243/25
+ 192.168.1.226 | 192.168.3.214
+ ::4.3.2.1/24 | ::4.3.3.245/24
10:23::f1/64 | 10:23::2e5/64
10:23::ffff | 10:23::1:1f3
- ::4.3.2.1/24 | ::4.3.3.245/24
(17 rows)
-SELECT i, i - 500 AS "i-500" FROM inet_tbl;
+SELECT i, i - 500 AS "i-500" FROM inet_tbl ORDER BY i;
i | i-500
------------------+----------------------------------------
- 192.168.1.226/24 | 192.167.255.238/24
- 192.168.1.226 | 192.167.255.238
- 192.168.1.0/24 | 192.167.255.12/24
- 192.168.1.0/25 | 192.167.255.12/25
- 192.168.1.255/24 | 192.168.0.11/24
- 192.168.1.255/25 | 192.168.0.11/25
+ 9.1.2.3/8 | 9.1.0.15/8
10.1.2.3/8 | 10.1.0.15/8
10.1.2.3/8 | 10.1.0.15/8
- 10.1.2.3 | 10.1.0.15
- 10.1.2.3/24 | 10.1.0.15/24
- 10.1.2.3/16 | 10.1.0.15/16
10.1.2.3/8 | 10.1.0.15/8
+ 10.1.2.3/16 | 10.1.0.15/16
+ 10.1.2.3/24 | 10.1.0.15/24
+ 10.1.2.3 | 10.1.0.15
11.1.2.3/8 | 11.1.0.15/8
- 9.1.2.3/8 | 9.1.0.15/8
+ 192.168.1.0/24 | 192.167.255.12/24
+ 192.168.1.226/24 | 192.167.255.238/24
+ 192.168.1.255/24 | 192.168.0.11/24
+ 192.168.1.0/25 | 192.167.255.12/25
+ 192.168.1.255/25 | 192.168.0.11/25
+ 192.168.1.226 | 192.167.255.238
+ ::4.3.2.1/24 | ::4.3.0.13/24
10:23::f1/64 | 10:22:ffff:ffff:ffff:ffff:ffff:fefd/64
10:23::ffff | 10:23::fe0b
- ::4.3.2.1/24 | ::4.3.0.13/24
(17 rows)
-SELECT i, c, i - c AS "minus" FROM inet_tbl;
+SELECT i, c, i - c AS "minus" FROM inet_tbl ORDER BY i, c;
i | c | minus
------------------+--------------------+------------------
- 192.168.1.226/24 | 192.168.1.0/24 | 226
- 192.168.1.226 | 192.168.1.0/26 | 226
- 192.168.1.0/24 | 192.168.1.0/24 | 0
- 192.168.1.0/25 | 192.168.1.0/24 | 0
- 192.168.1.255/24 | 192.168.1.0/24 | 255
- 192.168.1.255/25 | 192.168.1.0/24 | 255
+ 9.1.2.3/8 | 10.0.0.0/8 | -16711165
+ 10.1.2.3/8 | 10.0.0.0/8 | 66051
10.1.2.3/8 | 10.0.0.0/8 | 66051
10.1.2.3/8 | 10.0.0.0/32 | 66051
- 10.1.2.3 | 10.1.2.3/32 | 0
- 10.1.2.3/24 | 10.1.2.0/24 | 3
10.1.2.3/16 | 10.1.0.0/16 | 515
- 10.1.2.3/8 | 10.0.0.0/8 | 66051
+ 10.1.2.3/24 | 10.1.2.0/24 | 3
+ 10.1.2.3 | 10.1.2.3/32 | 0
11.1.2.3/8 | 10.0.0.0/8 | 16843267
- 9.1.2.3/8 | 10.0.0.0/8 | -16711165
+ 192.168.1.0/24 | 192.168.1.0/24 | 0
+ 192.168.1.226/24 | 192.168.1.0/24 | 226
+ 192.168.1.255/24 | 192.168.1.0/24 | 255
+ 192.168.1.0/25 | 192.168.1.0/24 | 0
+ 192.168.1.255/25 | 192.168.1.0/24 | 255
+ 192.168.1.226 | 192.168.1.0/26 | 226
+ ::4.3.2.1/24 | ::ffff:1.2.3.4/128 | -281470631346435
10:23::f1/64 | 10:23::f1/128 | 0
10:23::ffff | 10:23::8000/113 | 32767
- ::4.3.2.1/24 | ::ffff:1.2.3.4/128 | -281470631346435
(17 rows)
SELECT '127.0.0.1'::inet + 257;
ERROR: cannot rename inherited column "aa"
ALTER TABLE inhts RENAME d TO dd;
\d+ inhts
- Table "public.inhts"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- aa | integer | | plain | |
- b | integer | | plain | |
- c | integer | | plain | |
- dd | integer | | plain | |
+ Table "public.inhts"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ aa | integer | | | | plain | |
+ b | integer | | | | plain | |
+ c | integer | | | | plain | |
+ dd | integer | | | | plain | |
Inherits: inht1,
inhs1
+Distribute By: HASH(aa)
+Location Nodes: ALL DATANODES
DROP TABLE inhts;
-- Test for renaming in diamond inheritance
NOTICE: merging multiple inherited definitions of column "b"
ALTER TABLE inht1 RENAME aa TO aaa;
\d+ inht4
- Table "public.inht4"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- aaa | integer | | plain | |
- b | integer | | plain | |
- x | integer | | plain | |
- y | integer | | plain | |
- z | integer | | plain | |
+ Table "public.inht4"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ aaa | integer | | | | plain | |
+ b | integer | | | | plain | |
+ x | integer | | | | plain | |
+ y | integer | | | | plain | |
+ z | integer | | | | plain | |
Inherits: inht2,
inht3
+Distribute By: HASH(aaa)
+Location Nodes: ALL DATANODES
CREATE TABLE inhts (d int) INHERITS (inht2, inhs1);
NOTICE: merging multiple inherited definitions of column "b"
ALTER TABLE inht1 RENAME b TO bb; -- to be failed
ERROR: cannot rename inherited column "b"
\d+ inhts
- Table "public.inhts"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- aaaa | integer | | plain | |
- b | integer | | plain | |
- x | integer | | plain | |
- c | integer | | plain | |
- d | integer | | plain | |
+ Table "public.inhts"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ aaaa | integer | | | | plain | |
+ b | integer | | | | plain | |
+ x | integer | | | | plain | |
+ c | integer | | | | plain | |
+ d | integer | | | | plain | |
Inherits: inht2,
inhs1
+Distribute By: HASH(aaaa)
+Location Nodes: ALL DATANODES
WITH RECURSIVE r AS (
SELECT 'inht1'::regclass AS inhrelid
ALTER TABLE ONLY test_constraints DROP CONSTRAINT test_constraints_val1_val2_key;
\d+ test_constraints
- Table "public.test_constraints"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+-------------------+-----------+----------+--------------+-------------
- id | integer | | plain | |
- val1 | character varying | | extended | |
- val2 | integer | | plain | |
+ Table "public.test_constraints"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+-------------------+-----------+----------+---------+----------+--------------+-------------
+ id | integer | | | | plain | |
+ val1 | character varying | | | | extended | |
+ val2 | integer | | | | plain | |
Child tables: test_constraints_inh
+Distribute By: HASH(val1)
+Location Nodes: ALL DATANODES
\d+ test_constraints_inh
- Table "public.test_constraints_inh"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+-------------------+-----------+----------+--------------+-------------
- id | integer | | plain | |
- val1 | character varying | | extended | |
- val2 | integer | | plain | |
+ Table "public.test_constraints_inh"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+-------------------+-----------+----------+---------+----------+--------------+-------------
+ id | integer | | | | plain | |
+ val1 | character varying | | | | extended | |
+ val2 | integer | | | | plain | |
Inherits: test_constraints
+Distribute By: HASH(val1)
+Location Nodes: ALL DATANODES
DROP TABLE test_constraints_inh;
DROP TABLE test_constraints;
ALTER TABLE test_ex_constraints DROP CONSTRAINT test_ex_constraints_c_excl;
\d+ test_ex_constraints
- Table "public.test_ex_constraints"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+--------+-----------+---------+--------------+-------------
- c | circle | | plain | |
+ Table "public.test_ex_constraints"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+--------+-----------+----------+---------+---------+--------------+-------------
+ c | circle | | | | plain | |
Child tables: test_ex_constraints_inh
+Distribute By: ROUND ROBIN
+Location Nodes: ALL DATANODES
\d+ test_ex_constraints_inh
- Table "public.test_ex_constraints_inh"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+--------+-----------+---------+--------------+-------------
- c | circle | | plain | |
+ Table "public.test_ex_constraints_inh"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+--------+-----------+----------+---------+---------+--------------+-------------
+ c | circle | | | | plain | |
Inherits: test_ex_constraints
+Distribute By: ROUND ROBIN
+Location Nodes: ALL DATANODES
DROP TABLE test_ex_constraints_inh;
DROP TABLE test_ex_constraints;
"test_primary_constraints_pkey" PRIMARY KEY, btree (id)
Referenced by:
TABLE "test_foreign_constraints" CONSTRAINT "test_foreign_constraints_id1_fkey" FOREIGN KEY (id1) REFERENCES test_primary_constraints(id)
+Distribute By: HASH(id)
+Location Nodes: ALL DATANODES
\d+ test_foreign_constraints
- Table "public.test_foreign_constraints"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- id1 | integer | | plain | |
+ Table "public.test_foreign_constraints"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ id1 | integer | | | | plain | |
Foreign-key constraints:
"test_foreign_constraints_id1_fkey" FOREIGN KEY (id1) REFERENCES test_primary_constraints(id)
Child tables: test_foreign_constraints_inh
ALTER TABLE test_foreign_constraints DROP CONSTRAINT test_foreign_constraints_id1_fkey;
\d+ test_foreign_constraints
- Table "public.test_foreign_constraints"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- id1 | integer | | plain | |
+ Table "public.test_foreign_constraints"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ id1 | integer | | | | plain | |
Child tables: test_foreign_constraints_inh
+Distribute By: HASH(id1)
+Location Nodes: ALL DATANODES
\d+ test_foreign_constraints_inh
- Table "public.test_foreign_constraints_inh"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- id1 | integer | | plain | |
+ Table "public.test_foreign_constraints_inh"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ id1 | integer | | | | plain | |
Inherits: test_foreign_constraints
+Distribute By: HASH(id1)
+Location Nodes: ALL DATANODES
DROP TABLE test_foreign_constraints_inh;
DROP TABLE test_foreign_constraints;
LINE 1: ...xx1 using lateral (select * from int4_tbl where f1 = x1) ss;
^
HINT: There is an entry for table "xx1", but it cannot be referenced from this part of the query.
+-- demonstrate problem with extremely slow join
+CREATE TABLE testr (a int, b int) DISTRIBUTE BY REPLICATION;
+INSERT INTO testr SELECT generate_series(1, 10000), generate_series(5001, 15000);
+CREATE TABLE testh (a int, b int);
+INSERT INTO testh SELECT generate_series(1, 10000), generate_series(8001, 18000);
+set enable_mergejoin TO false;
+set enable_hashjoin TO false;
+EXPLAIN (VERBOSE, COSTS OFF) SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b);
+ QUERY PLAN
+-----------------------------------------------------------------------------------
+ Finalize Aggregate
+ Output: count(*)
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
+ Output: PARTIAL count(*)
+ -> Partial Aggregate
+ Output: PARTIAL count(*)
+ -> Nested Loop Anti Join
+ Join Filter: (testr.b = testh.b)
+ -> Remote Subquery Scan on all (datanode_1)
+ Output: testr.b
+ Distribute results by H: b
+ -> Seq Scan on public.testr
+ Output: testr.b
+ -> Materialize
+ Output: testh.b
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
+ Output: testh.b
+ Distribute results by H: b
+ -> Seq Scan on public.testh
+ Output: testh.b
+(20 rows)
+
+SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b);
+ count
+-------
+ 3000
+(1 row)
+
+
+ --
+ -- test planner's ability to mark joins as unique
+ --
+ create table j1 (id int primary key);
+ create table j2 (id int primary key);
+ create table j3 (id int);
+ insert into j1 values(1),(2),(3);
+ insert into j2 values(1),(2),(3);
+ insert into j3 values(1),(1);
+ analyze j1;
+ analyze j2;
+ analyze j3;
+ -- ensure join is properly marked as unique
+ explain (verbose, costs off)
+ select * from j1 inner join j2 on j1.id = j2.id;
+ QUERY PLAN
+ -----------------------------------
+ Hash Join
+ Output: j1.id, j2.id
+ Inner Unique: true
+ Hash Cond: (j1.id = j2.id)
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Hash
+ Output: j2.id
+ -> Seq Scan on public.j2
+ Output: j2.id
+ (10 rows)
+
+ -- ensure join is not unique when not an equi-join
+ explain (verbose, costs off)
+ select * from j1 inner join j2 on j1.id > j2.id;
+ QUERY PLAN
+ -----------------------------------
+ Nested Loop
+ Output: j1.id, j2.id
+ Join Filter: (j1.id > j2.id)
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Materialize
+ Output: j2.id
+ -> Seq Scan on public.j2
+ Output: j2.id
+ (9 rows)
+
+ -- ensure non-unique rel is not chosen as inner
+ explain (verbose, costs off)
+ select * from j1 inner join j3 on j1.id = j3.id;
+ QUERY PLAN
+ -----------------------------------
+ Hash Join
+ Output: j1.id, j3.id
+ Inner Unique: true
+ Hash Cond: (j3.id = j1.id)
+ -> Seq Scan on public.j3
+ Output: j3.id
+ -> Hash
+ Output: j1.id
+ -> Seq Scan on public.j1
+ Output: j1.id
+ (10 rows)
+
+ -- ensure left join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 left join j2 on j1.id = j2.id;
+ QUERY PLAN
+ -----------------------------------
+ Hash Left Join
+ Output: j1.id, j2.id
+ Inner Unique: true
+ Hash Cond: (j1.id = j2.id)
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Hash
+ Output: j2.id
+ -> Seq Scan on public.j2
+ Output: j2.id
+ (10 rows)
+
+ -- ensure right join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 right join j2 on j1.id = j2.id;
+ QUERY PLAN
+ -----------------------------------
+ Hash Left Join
+ Output: j1.id, j2.id
+ Inner Unique: true
+ Hash Cond: (j2.id = j1.id)
+ -> Seq Scan on public.j2
+ Output: j2.id
+ -> Hash
+ Output: j1.id
+ -> Seq Scan on public.j1
+ Output: j1.id
+ (10 rows)
+
+ -- ensure full join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 full join j2 on j1.id = j2.id;
+ QUERY PLAN
+ -----------------------------------
+ Hash Full Join
+ Output: j1.id, j2.id
+ Inner Unique: true
+ Hash Cond: (j1.id = j2.id)
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Hash
+ Output: j2.id
+ -> Seq Scan on public.j2
+ Output: j2.id
+ (10 rows)
+
+ -- a clauseless (cross) join can't be unique
+ explain (verbose, costs off)
+ select * from j1 cross join j2;
+ QUERY PLAN
+ -----------------------------------
+ Nested Loop
+ Output: j1.id, j2.id
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Materialize
+ Output: j2.id
+ -> Seq Scan on public.j2
+ Output: j2.id
+ (8 rows)
+
+ -- ensure a natural join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 natural join j2;
+ QUERY PLAN
+ -----------------------------------
+ Hash Join
+ Output: j1.id
+ Inner Unique: true
+ Hash Cond: (j1.id = j2.id)
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Hash
+ Output: j2.id
+ -> Seq Scan on public.j2
+ Output: j2.id
+ (10 rows)
+
+ -- ensure a distinct clause allows the inner to become unique
+ explain (verbose, costs off)
+ select * from j1
+ inner join (select distinct id from j3) j3 on j1.id = j3.id;
+ QUERY PLAN
+ -----------------------------------------------
+ Nested Loop
+ Output: j1.id, j3.id
+ Inner Unique: true
+ Join Filter: (j1.id = j3.id)
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Materialize
+ Output: j3.id
+ -> Unique
+ Output: j3.id
+ -> Sort
+ Output: j3.id
+ Sort Key: j3.id
+ -> Seq Scan on public.j3
+ Output: j3.id
+ (15 rows)
+
+ -- ensure group by clause allows the inner to become unique
+ explain (verbose, costs off)
+ select * from j1
+ inner join (select id from j3 group by id) j3 on j1.id = j3.id;
+ QUERY PLAN
+ -----------------------------------------------
+ Nested Loop
+ Output: j1.id, j3.id
+ Inner Unique: true
+ Join Filter: (j1.id = j3.id)
+ -> Seq Scan on public.j1
+ Output: j1.id
+ -> Materialize
+ Output: j3.id
+ -> Group
+ Output: j3.id
+ Group Key: j3.id
+ -> Sort
+ Output: j3.id
+ Sort Key: j3.id
+ -> Seq Scan on public.j3
+ Output: j3.id
+ (16 rows)
+
+ drop table j1;
+ drop table j2;
+ drop table j3;
+ -- test more complex permutations of unique joins
+ create table j1 (id1 int, id2 int, primary key(id1,id2));
+ create table j2 (id1 int, id2 int, primary key(id1,id2));
+ create table j3 (id1 int, id2 int, primary key(id1,id2));
+ insert into j1 values(1,1),(1,2);
+ insert into j2 values(1,1);
+ insert into j3 values(1,1);
+ analyze j1;
+ analyze j2;
+ analyze j3;
+ -- ensure there's no unique join when not all columns which are part of the
+ -- unique index are seen in the join clause
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1;
+ QUERY PLAN
+ ------------------------------------------
+ Nested Loop
+ Output: j1.id1, j1.id2, j2.id1, j2.id2
+ Join Filter: (j1.id1 = j2.id1)
+ -> Seq Scan on public.j2
+ Output: j2.id1, j2.id2
+ -> Seq Scan on public.j1
+ Output: j1.id1, j1.id2
+ (7 rows)
+
+ -- ensure proper unique detection with multiple join quals
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2;
+ QUERY PLAN
+ ----------------------------------------------------------
+ Nested Loop
+ Output: j1.id1, j1.id2, j2.id1, j2.id2
+ Inner Unique: true
+ Join Filter: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2))
+ -> Seq Scan on public.j1
+ Output: j1.id1, j1.id2
+ -> Materialize
+ Output: j2.id1, j2.id2
+ -> Seq Scan on public.j2
+ Output: j2.id1, j2.id2
+ (10 rows)
+
+ -- ensure we don't detect the join to be unique when quals are not part of the
+ -- join condition
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1 where j1.id2 = 1;
+ QUERY PLAN
+ ------------------------------------------
+ Nested Loop
+ Output: j1.id1, j1.id2, j2.id1, j2.id2
+ Join Filter: (j1.id1 = j2.id1)
+ -> Seq Scan on public.j1
+ Output: j1.id1, j1.id2
+ Filter: (j1.id2 = 1)
+ -> Seq Scan on public.j2
+ Output: j2.id1, j2.id2
+ (8 rows)
+
+ -- as above, but for left joins.
+ explain (verbose, costs off)
+ select * from j1
+ left join j2 on j1.id1 = j2.id1 where j1.id2 = 1;
+ QUERY PLAN
+ ------------------------------------------
+ Nested Loop Left Join
+ Output: j1.id1, j1.id2, j2.id1, j2.id2
+ Join Filter: (j1.id1 = j2.id1)
+ -> Seq Scan on public.j1
+ Output: j1.id1, j1.id2
+ Filter: (j1.id2 = 1)
+ -> Seq Scan on public.j2
+ Output: j2.id1, j2.id2
+ (8 rows)
+
+ -- validate logic in merge joins which skips mark and restore.
+ -- it should only do this if all quals which were used to detect the unique
+ -- are present as join quals, and not plain quals.
+ set enable_nestloop to 0;
+ set enable_hashjoin to 0;
+ set enable_sort to 0;
+ -- create an index that will be preferred over the PK to perform the join
+ create index j1_id1_idx on j1 (id1) where id1 % 1000 = 1;
+ explain (costs off) select * from j1 j1
+ inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
+ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
+ QUERY PLAN
+ --------------------------------------------
+ Merge Join
+ Merge Cond: (j1.id1 = j2.id1)
+ Join Filter: (j1.id2 = j2.id2)
+ -> Index Scan using j1_id1_idx on j1
+ -> Index Scan using j1_id1_idx on j1 j2
+ (5 rows)
+
+ select * from j1 j1
+ inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
+ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
+ id1 | id2 | id1 | id2
+ -----+-----+-----+-----
+ 1 | 1 | 1 | 1
+ 1 | 2 | 1 | 2
+ (2 rows)
+
+ reset enable_nestloop;
+ reset enable_hashjoin;
+ reset enable_sort;
+ drop table j1;
+ drop table j2;
+ drop table j3;
+ -- check that semijoin inner is not seen as unique for a portion of the outerrel
+ explain (verbose, costs off)
+ select t1.unique1, t2.hundred
+ from onek t1, tenk1 t2
+ where exists (select 1 from tenk1 t3
+ where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred)
+ and t1.unique1 < 1;
+ QUERY PLAN
+ ---------------------------------------------------------------------------------
+ Nested Loop
+ Output: t1.unique1, t2.hundred
+ -> Hash Join
+ Output: t1.unique1, t3.tenthous
+ Hash Cond: (t3.thousand = t1.unique1)
+ -> HashAggregate
+ Output: t3.thousand, t3.tenthous
+ Group Key: t3.thousand, t3.tenthous
+ -> Index Only Scan using tenk1_thous_tenthous on public.tenk1 t3
+ Output: t3.thousand, t3.tenthous
+ -> Hash
+ Output: t1.unique1
+ -> Index Only Scan using onek_unique1 on public.onek t1
+ Output: t1.unique1
+ Index Cond: (t1.unique1 < 1)
+ -> Index Only Scan using tenk1_hundred on public.tenk1 t2
+ Output: t2.hundred
+ Index Cond: (t2.hundred = t3.tenthous)
+ (18 rows)
+
+ -- ... unless it actually is unique
+ create table j3 as select unique1, tenthous from onek;
+ vacuum analyze j3;
+ create unique index on j3(unique1, tenthous);
+ explain (verbose, costs off)
+ select t1.unique1, t2.hundred
+ from onek t1, tenk1 t2
+ where exists (select 1 from j3
+ where j3.unique1 = t1.unique1 and j3.tenthous = t2.hundred)
+ and t1.unique1 < 1;
+ QUERY PLAN
+ ------------------------------------------------------------------------
+ Nested Loop
+ Output: t1.unique1, t2.hundred
+ -> Nested Loop
+ Output: t1.unique1, j3.tenthous
+ -> Index Only Scan using onek_unique1 on public.onek t1
+ Output: t1.unique1
+ Index Cond: (t1.unique1 < 1)
+ -> Index Only Scan using j3_unique1_tenthous_idx on public.j3
+ Output: j3.unique1, j3.tenthous
+ Index Cond: (j3.unique1 = t1.unique1)
+ -> Index Only Scan using tenk1_hundred on public.tenk1 t2
+ Output: t2.hundred
+ Index Cond: (t2.hundred = j3.tenthous)
+ (13 rows)
+
+ drop table j3;
-$12,345,678,901,234,567.00
(1 row)
+ SELECT (-12345678901234567)::numeric::money;
+ money
+ -----------------------------
+ -$12,345,678,901,234,567.00
+ (1 row)
+
+ -- Cast from money
+ SELECT '12345678901234567'::money::numeric;
+ numeric
+ ----------------------
+ 12345678901234567.00
+ (1 row)
+
+ SELECT '-12345678901234567'::money::numeric;
+ numeric
+ -----------------------
+ -12345678901234567.00
+ (1 row)
+
+INSERT INTO money_data VALUES ('$223.459');
+INSERT INTO money_data VALUES ('$323.459');
+INSERT INTO money_data VALUES ('$423.459');
+INSERT INTO money_data VALUES ('$523.459');
+SELECT sum(m) FROM money_data;
+ sum
+-----------
+ $1,617.30
+(1 row)
+
+CREATE TABLE money_data2 (a int, m money);
+INSERT INTO money_data2 VALUES (1, '$123.459');
+INSERT INTO money_data2 VALUES (2, '$223.459');
+INSERT INTO money_data2 VALUES (3, '$323.459');
+INSERT INTO money_data2 VALUES (4, '$423.459');
+INSERT INTO money_data2 VALUES (5, '$523.459');
+SELECT sum(m) FROM money_data2;
+ sum
+-----------
+ $1,617.30
+(1 row)
+
+DROP TABLE money_data2;
-- event trigger
('policy', '{addr_nsp, gentable, genpol}', '{}'),
('transform', '{int}', '{sql}'),
- ('access method', '{btree}', '{}')
+ ('access method', '{btree}', '{}'),
+ ('publication', '{addr_pub}', '{}'),
+ ('publication relation', '{addr_nsp, gentable}', '{addr_pub}'),
+ ('subscription', '{addr_sub}', '{}'),
+ ('statistics object', '{addr_nsp, gentable_stat}', '{}')
)
- SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.subobjid)).*,
+ SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)).*,
-- test roundtrip through pg_identify_object_as_address
- ROW(pg_identify_object(addr1.classid, addr1.objid, addr1.subobjid)) =
- ROW(pg_identify_object(addr2.classid, addr2.objid, addr2.subobjid))
+ ROW(pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)) =
+ ROW(pg_identify_object(addr2.classid, addr2.objid, addr2.objsubid))
FROM objects, pg_get_object_address(type, name, args) addr1,
- pg_identify_object_as_address(classid, objid, subobjid) ioa(typ,nms,args),
+ pg_identify_object_as_address(classid, objid, objsubid) ioa(typ,nms,args),
pg_get_object_address(typ, nms, ioa.args) as addr2
- ORDER BY addr1.classid, addr1.objid, addr1.objsubid;
- type | schema | name | identity | ?column?
----------------------------+------------+-------------------+----------------------------------------------------------------------+----------
- default acl | | | for role regress_addr_user in schema public on tables | t
- default acl | | | for role regress_addr_user on tables | t
- type | pg_catalog | _int4 | integer[] | t
- type | addr_nsp | gencomptype | addr_nsp.gencomptype | t
- type | addr_nsp | genenum | addr_nsp.genenum | t
- type | addr_nsp | gendomain | addr_nsp.gendomain | t
- function | pg_catalog | | pg_catalog.pg_identify_object(pg_catalog.oid,pg_catalog.oid,integer) | t
- aggregate | addr_nsp | | addr_nsp.genaggr(integer) | t
- sequence | addr_nsp | gentable_a_seq | addr_nsp.gentable_a_seq | t
- table | addr_nsp | gentable | addr_nsp.gentable | t
- table column | addr_nsp | gentable | addr_nsp.gentable.b | t
- index | addr_nsp | gentable_pkey | addr_nsp.gentable_pkey | t
- view | addr_nsp | genview | addr_nsp.genview | t
- materialized view | addr_nsp | genmatview | addr_nsp.genmatview | t
- foreign table | addr_nsp | genftable | addr_nsp.genftable | t
- foreign table column | addr_nsp | genftable | addr_nsp.genftable.a | t
- role | | regress_addr_user | regress_addr_user | t
- server | | addr_fserv | addr_fserv | t
- user mapping | | | regress_addr_user on server integer | t
- foreign-data wrapper | | addr_fdw | addr_fdw | t
- access method | | btree | btree | t
- operator of access method | | | operator 1 (integer, integer) of pg_catalog.integer_ops USING btree | t
- function of access method | | | function 2 (integer, integer) of pg_catalog.integer_ops USING btree | t
- default value | | | for addr_nsp.gentable.b | t
- cast | | | (bigint AS integer) | t
- table constraint | addr_nsp | | a_chk on addr_nsp.gentable | t
- domain constraint | addr_nsp | | domconstr on addr_nsp.gendomain | t
- conversion | pg_catalog | ascii_to_mic | pg_catalog.ascii_to_mic | t
- language | | plpgsql | plpgsql | t
- schema | | addr_nsp | addr_nsp | t
- operator class | pg_catalog | int4_ops | pg_catalog.int4_ops USING btree | t
- operator | pg_catalog | | pg_catalog.+(integer,integer) | t
- rule | | | "_RETURN" on addr_nsp.genview | t
- trigger | | | t on addr_nsp.gentable | t
- operator family | pg_catalog | integer_ops | pg_catalog.integer_ops USING btree | t
- policy | | | genpol on addr_nsp.gentable | t
- statistics object | addr_nsp | gentable_stat | addr_nsp.gentable_stat | t
- collation | pg_catalog | "default" | pg_catalog."default" | t
- transform | | | for integer on language sql | t
- text search dictionary | addr_nsp | addr_ts_dict | addr_nsp.addr_ts_dict | t
- text search parser | addr_nsp | addr_ts_prs | addr_nsp.addr_ts_prs | t
- text search configuration | addr_nsp | addr_ts_conf | addr_nsp.addr_ts_conf | t
- text search template | addr_nsp | addr_ts_temp | addr_nsp.addr_ts_temp | t
- subscription | | addr_sub | addr_sub | t
- publication | | addr_pub | addr_pub | t
- publication relation | | | gentable in publication addr_pub | t
-(46 rows)
-
+ ORDER BY addr1.classid, addr1.objid, addr1.subobjid;
+ERROR: relation "addr_nsp.genftable" does not exist
---
--- Cleanup resources
---
SET client_min_messages TO 'warning';
DROP FOREIGN DATA WRAPPER addr_fdw CASCADE;
+ERROR: foreign-data wrapper "addr_fdw" does not exist
+ DROP PUBLICATION addr_pub;
+ DROP SUBSCRIPTION addr_sub;
DROP SCHEMA addr_nsp CASCADE;
DROP OWNED BY regress_addr_user;
DROP USER regress_addr_user;
get diagnostics rc = row_count;
raise notice '% %', found, rc;
return query execute 'values(10),(20)';
- get diagnostics rc = row_count;
- raise notice '% %', found, rc;
+ -- just for fun, let's use array elements as targets
+ get diagnostics rca[1] = row_count;
+ raise notice '% %', found, rca[1];
return query execute 'select * from (values(10),(20)) f(a) where false';
- get diagnostics rc = row_count;
- raise notice '% %', found, rc;
+ get diagnostics rca[2] = row_count;
+ raise notice '% %', found, rca[2];
end;
$$ language plpgsql;
-select * from rttest();
+select * from rttest() order by 1;
NOTICE: t 2
NOTICE: f 0
NOTICE: t 2
rollback;
-- Commit table creation
COMMIT PREPARED 'regress-one';
+ERROR: prepared transaction with identifier "regress-one" does not exist
\d pxtest2
+ Table "public.pxtest2"
+ Column | Type | Collation | Nullable | Default
+ --------+---------+-----------+----------+---------
+ a | integer | | |
+
SELECT * FROM pxtest2;
- a
----
- 1
- 3
-(2 rows)
-
+ERROR: relation "pxtest2" does not exist
+LINE 1: SELECT * FROM pxtest2;
+ ^
-- There should be one prepared transaction
-SELECT gid FROM pg_prepared_xacts;
+SELECT gid FROM pg_prepared_xacts ORDER BY 1;
gid
-------------
regress-two
DROP TABLE atestc;
DROP TABLE atestp1;
DROP TABLE atestp2;
- SELECT lo_unlink(oid) FROM pg_largeobject_metadata;
+ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3000 ORDER BY oid;
lo_unlink
-----------
- 1
- 1
- 1
- 1
- 1
-(5 rows)
+(0 rows)
DROP GROUP regress_group1;
DROP GROUP regress_group2;
4 | 44 | 1 | regress_rls_bob | my first manga
6 | 22 | 1 | regress_rls_carol | great science fiction
8 | 44 | 1 | regress_rls_carol | great manga
- (4 rows)
+ 9 | 22 | 1 | regress_rls_dave | awesome science fiction
+ (5 rows)
SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle) ORDER BY did;
-NOTICE: f_leak => my first novel
-NOTICE: f_leak => my first manga
-NOTICE: f_leak => great science fiction
-NOTICE: f_leak => great manga
-NOTICE: f_leak => awesome science fiction
- cid | did | dlevel | dauthor | dtitle | cname
------+-----+--------+-------------------+-------------------------+-----------------
- 11 | 1 | 1 | regress_rls_bob | my first novel | novel
- 44 | 4 | 1 | regress_rls_bob | my first manga | manga
- 22 | 6 | 1 | regress_rls_carol | great science fiction | science fiction
- 44 | 8 | 1 | regress_rls_carol | great manga | manga
- 22 | 9 | 1 | regress_rls_dave | awesome science fiction | science fiction
-(5 rows)
+ cid | did | dlevel | dauthor | dtitle | cname
+-----+-----+--------+-------------------+-----------------------+-----------------
+ 11 | 1 | 1 | regress_rls_bob | my first novel | novel
+ 44 | 4 | 1 | regress_rls_bob | my first manga | manga
+ 22 | 6 | 1 | regress_rls_carol | great science fiction | science fiction
+ 44 | 8 | 1 | regress_rls_carol | great manga | manga
+(4 rows)
-- try a sampled version
SELECT * FROM document TABLESAMPLE BERNOULLI(50) REPEATABLE(0)
6 | 22 | 1 | regress_rls_carol | great science fiction
7 | 33 | 2 | regress_rls_carol | great technology book
8 | 44 | 1 | regress_rls_carol | great manga
- (8 rows)
+ 9 | 22 | 1 | regress_rls_dave | awesome science fiction
+ 10 | 33 | 2 | regress_rls_dave | awesome technology book
+ (10 rows)
SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle) ORDER BY did;
-NOTICE: f_leak => my first novel
-NOTICE: f_leak => my second novel
-NOTICE: f_leak => my science fiction
-NOTICE: f_leak => my first manga
-NOTICE: f_leak => my second manga
-NOTICE: f_leak => great science fiction
-NOTICE: f_leak => great technology book
-NOTICE: f_leak => great manga
-NOTICE: f_leak => awesome science fiction
-NOTICE: f_leak => awesome technology book
- cid | did | dlevel | dauthor | dtitle | cname
------+-----+--------+-------------------+-------------------------+-----------------
- 11 | 1 | 1 | regress_rls_bob | my first novel | novel
- 11 | 2 | 2 | regress_rls_bob | my second novel | novel
- 22 | 3 | 2 | regress_rls_bob | my science fiction | science fiction
- 44 | 4 | 1 | regress_rls_bob | my first manga | manga
- 44 | 5 | 2 | regress_rls_bob | my second manga | manga
- 22 | 6 | 1 | regress_rls_carol | great science fiction | science fiction
- 33 | 7 | 2 | regress_rls_carol | great technology book | technology
- 44 | 8 | 1 | regress_rls_carol | great manga | manga
- 22 | 9 | 1 | regress_rls_dave | awesome science fiction | science fiction
- 33 | 10 | 2 | regress_rls_dave | awesome technology book | technology
-(10 rows)
+ cid | did | dlevel | dauthor | dtitle | cname
+-----+-----+--------+-------------------+-----------------------+-----------------
+ 11 | 1 | 1 | regress_rls_bob | my first novel | novel
+ 11 | 2 | 2 | regress_rls_bob | my second novel | novel
+ 22 | 3 | 2 | regress_rls_bob | my science fiction | science fiction
+ 44 | 4 | 1 | regress_rls_bob | my first manga | manga
+ 44 | 5 | 2 | regress_rls_bob | my second manga | manga
+ 22 | 6 | 1 | regress_rls_carol | great science fiction | science fiction
+ 33 | 7 | 2 | regress_rls_carol | great technology book | technology
+ 44 | 8 | 1 | regress_rls_carol | great manga | manga
+(8 rows)
-- try a sampled version
SELECT * FROM document TABLESAMPLE BERNOULLI(50) REPEATABLE(0)
WHERE f_leak(dtitle) ORDER BY did;
-NOTICE: f_leak => my first manga
-NOTICE: f_leak => my second manga
-NOTICE: f_leak => great science fiction
-NOTICE: f_leak => great manga
-NOTICE: f_leak => awesome science fiction
- did | cid | dlevel | dauthor | dtitle
------+-----+--------+-------------------+-------------------------
- 4 | 44 | 1 | regress_rls_bob | my first manga
- 5 | 44 | 2 | regress_rls_bob | my second manga
+ did | cid | dlevel | dauthor | dtitle
+-----+-----+--------+-------------------+-----------------------
6 | 22 | 1 | regress_rls_carol | great science fiction
8 | 44 | 1 | regress_rls_carol | great manga
- 9 | 22 | 1 | regress_rls_dave | awesome science fiction
-(5 rows)
+(2 rows)
EXPLAIN (COSTS OFF) SELECT * FROM document WHERE f_leak(dtitle);
- QUERY PLAN
-----------------------------------------------------
- Seq Scan on document
- Filter: ((dlevel <= $0) AND f_leak(dtitle))
- InitPlan 1 (returns $0)
- -> Index Scan using uaccount_pkey on uaccount
- Index Cond: (pguser = CURRENT_USER)
-(5 rows)
-
-EXPLAIN (COSTS OFF) SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle);
- QUERY PLAN
------------------------------------------------------------
- Hash Join
- Hash Cond: (category.cid = document.cid)
- InitPlan 1 (returns $0)
- -> Index Scan using uaccount_pkey on uaccount
- Index Cond: (pguser = CURRENT_USER)
- -> Seq Scan on category
- -> Hash
- -> Seq Scan on document
- Filter: ((dlevel <= $0) AND f_leak(dtitle))
+ QUERY PLAN
+-------------------------------------------------------------------------
+ Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Subquery Scan on document
+ Filter: f_leak(document.dtitle)
+ -> Seq Scan on document document_1
+ Filter: (dlevel <= $0)
+ InitPlan 1 (returns $0)
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Index Scan using uaccount_pkey on uaccount
+ Index Cond: (pguser = "current_user"())
(9 rows)
--- viewpoint from regress_rls_dave
-SET SESSION AUTHORIZATION regress_rls_dave;
-SELECT * FROM document WHERE f_leak(dtitle) ORDER BY did;
-NOTICE: f_leak => my first novel
-NOTICE: f_leak => my second novel
-NOTICE: f_leak => my science fiction
-NOTICE: f_leak => great science fiction
-NOTICE: f_leak => great technology book
-NOTICE: f_leak => awesome science fiction
-NOTICE: f_leak => awesome technology book
- did | cid | dlevel | dauthor | dtitle
------+-----+--------+-------------------+-------------------------
- 1 | 11 | 1 | regress_rls_bob | my first novel
- 2 | 11 | 2 | regress_rls_bob | my second novel
- 3 | 22 | 2 | regress_rls_bob | my science fiction
- 6 | 22 | 1 | regress_rls_carol | great science fiction
- 7 | 33 | 2 | regress_rls_carol | great technology book
- 9 | 22 | 1 | regress_rls_dave | awesome science fiction
- 10 | 33 | 2 | regress_rls_dave | awesome technology book
-(7 rows)
-
-SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle) ORDER BY did;
-NOTICE: f_leak => my first novel
-NOTICE: f_leak => my second novel
-NOTICE: f_leak => my science fiction
-NOTICE: f_leak => great science fiction
-NOTICE: f_leak => great technology book
-NOTICE: f_leak => awesome science fiction
-NOTICE: f_leak => awesome technology book
- cid | did | dlevel | dauthor | dtitle | cname
------+-----+--------+-------------------+-------------------------+-----------------
- 11 | 1 | 1 | regress_rls_bob | my first novel | novel
- 11 | 2 | 2 | regress_rls_bob | my second novel | novel
- 22 | 3 | 2 | regress_rls_bob | my science fiction | science fiction
- 22 | 6 | 1 | regress_rls_carol | great science fiction | science fiction
- 33 | 7 | 2 | regress_rls_carol | great technology book | technology
- 22 | 9 | 1 | regress_rls_dave | awesome science fiction | science fiction
- 33 | 10 | 2 | regress_rls_dave | awesome technology book | technology
-(7 rows)
-
-EXPLAIN (COSTS OFF) SELECT * FROM document WHERE f_leak(dtitle);
- QUERY PLAN
-----------------------------------------------------------------------------------------------
- Seq Scan on document
- Filter: ((cid <> 44) AND (cid <> 44) AND (cid < 50) AND (dlevel <= $0) AND f_leak(dtitle))
- InitPlan 1 (returns $0)
- -> Index Scan using uaccount_pkey on uaccount
- Index Cond: (pguser = CURRENT_USER)
-(5 rows)
-
EXPLAIN (COSTS OFF) SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle);
- QUERY PLAN
-----------------------------------------------------------------------------------------------------------
- Hash Join
- Hash Cond: (category.cid = document.cid)
- InitPlan 1 (returns $0)
- -> Index Scan using uaccount_pkey on uaccount
- Index Cond: (pguser = CURRENT_USER)
- -> Seq Scan on category
- -> Hash
- -> Seq Scan on document
- Filter: ((cid <> 44) AND (cid <> 44) AND (cid < 50) AND (dlevel <= $0) AND f_leak(dtitle))
-(9 rows)
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Hash Join
+ Hash Cond: (category.cid = document.cid)
+ -> Seq Scan on category
+ -> Hash
+ -> Subquery Scan on document
+ Filter: f_leak(document.dtitle)
+ -> Seq Scan on document document_1
+ Filter: (dlevel <= $0)
+ InitPlan 1 (returns $0)
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Index Scan using uaccount_pkey on uaccount
+ Index Cond: (pguser = "current_user"())
+(13 rows)
+ -- 44 would technically fail for both p2r and p1r, but we should get an error
+ -- back from p1r for this because it sorts first
+ INSERT INTO document VALUES (100, 44, 1, 'regress_rls_dave', 'testing sorting of policies'); -- fail
+ ERROR: new row violates row-level security policy "p1r" for table "document"
+ -- Just to see a p2r error
+ INSERT INTO document VALUES (100, 55, 1, 'regress_rls_dave', 'testing sorting of policies'); -- fail
+ ERROR: new row violates row-level security policy "p2r" for table "document"
-- only owner can change policies
ALTER POLICY p1 ON document USING (true); --fail
ERROR: must be owner of relation document
ALTER TABLE category ENABLE ROW LEVEL SECURITY;
-- cannot delete PK referenced by invisible FK
SET SESSION AUTHORIZATION regress_rls_bob;
- SELECT * FROM document d FULL OUTER JOIN category c on d.cid = c.cid;
+ SELECT * FROM document d FULL OUTER JOIN category c on d.cid = c.cid ORDER BY d.did, c.cid;
did | cid | dlevel | dauthor | dtitle | cid | cname
-----+-----+--------+-----------------+--------------------+-----+------------
- 1 | 11 | 1 | regress_rls_bob | my first novel | 11 | novel
+ 4 | 44 | 1 | regress_rls_bob | my first manga | |
+ 5 | 44 | 2 | regress_rls_bob | my second manga | |
2 | 11 | 2 | regress_rls_bob | my second novel | 11 | novel
+ 1 | 11 | 1 | regress_rls_bob | my first novel | 11 | novel
+ | | | | | 33 | technology
3 | 22 | 2 | regress_rls_bob | my science fiction | |
+ 4 | 44 | 1 | regress_rls_bob | my first manga | |
+ 5 | 44 | 2 | regress_rls_bob | my second manga | |
+ | | | | | 33 | technology
(6 rows)
DELETE FROM category WHERE cid = 33; -- fails with FK violation
DETAIL: Key is still referenced from table "document".
-- can insert FK referencing invisible PK
SET SESSION AUTHORIZATION regress_rls_carol;
- SELECT * FROM document d FULL OUTER JOIN category c on d.cid = c.cid;
+ SELECT * FROM document d FULL OUTER JOIN category c on d.cid = c.cid ORDER BY d.did, c.cid;
did | cid | dlevel | dauthor | dtitle | cid | cname
-----+-----+--------+-------------------+-----------------------+-----+-----------------
+ 8 | 44 | 1 | regress_rls_carol | great manga | 44 | manga
6 | 22 | 1 | regress_rls_carol | great science fiction | 22 | science fiction
7 | 33 | 2 | regress_rls_carol | great technology book | |
+ 8 | 44 | 1 | regress_rls_carol | great manga | 44 | manga
(3 rows)
- INSERT INTO document VALUES (10, 33, 1, current_user, 'hoge');
+ INSERT INTO document VALUES (11, 33, 1, current_user, 'hoge');
-- UNIQUE or PRIMARY KEY constraint violation DOES reveal presence of row
SET SESSION AUTHORIZATION regress_rls_bob;
INSERT INTO document VALUES (8, 44, 1, 'regress_rls_bob', 'my third manga'); -- Must fail with unique violation, revealing presence of did we can't see
RESET SESSION AUTHORIZATION;
SET row_security TO ON;
SELECT * FROM document;
- did | cid | dlevel | dauthor | dtitle
- -----+-----+--------+-------------------+-----------------------
+ did | cid | dlevel | dauthor | dtitle
+ -----+-----+--------+-------------------+-------------------------
1 | 11 | 1 | regress_rls_bob | my first novel
2 | 11 | 2 | regress_rls_bob | my second novel
- 3 | 22 | 2 | regress_rls_bob | my science fiction
- 4 | 44 | 1 | regress_rls_bob | my first manga
5 | 44 | 2 | regress_rls_bob | my second manga
6 | 22 | 1 | regress_rls_carol | great science fiction
- 7 | 33 | 2 | regress_rls_carol | great technology book
8 | 44 | 1 | regress_rls_carol | great manga
- 9 | 22 | 1 | regress_rls_dave | awesome science fiction
- 10 | 33 | 2 | regress_rls_dave | awesome technology book
- 11 | 33 | 1 | regress_rls_carol | hoge
-(11 rows)
+ 3 | 22 | 2 | regress_rls_bob | my science fiction
+ 4 | 44 | 1 | regress_rls_bob | my first manga
+ 7 | 33 | 2 | regress_rls_carol | great technology book
+ 10 | 33 | 1 | regress_rls_carol | hoge
+(9 rows)
SELECT * FROM category;
cid | cname
RESET SESSION AUTHORIZATION;
SET row_security TO OFF;
SELECT * FROM document;
- did | cid | dlevel | dauthor | dtitle
- -----+-----+--------+-------------------+-----------------------
+ did | cid | dlevel | dauthor | dtitle
+ -----+-----+--------+-------------------+-------------------------
1 | 11 | 1 | regress_rls_bob | my first novel
2 | 11 | 2 | regress_rls_bob | my second novel
- 3 | 22 | 2 | regress_rls_bob | my science fiction
- 4 | 44 | 1 | regress_rls_bob | my first manga
5 | 44 | 2 | regress_rls_bob | my second manga
6 | 22 | 1 | regress_rls_carol | great science fiction
- 7 | 33 | 2 | regress_rls_carol | great technology book
8 | 44 | 1 | regress_rls_carol | great manga
- 9 | 22 | 1 | regress_rls_dave | awesome science fiction
- 10 | 33 | 2 | regress_rls_dave | awesome technology book
- 11 | 33 | 1 | regress_rls_carol | hoge
-(11 rows)
+ 3 | 22 | 2 | regress_rls_bob | my science fiction
+ 4 | 44 | 1 | regress_rls_bob | my first manga
+ 7 | 33 | 2 | regress_rls_carol | great technology book
+ 10 | 33 | 1 | regress_rls_carol | hoge
+(9 rows)
SELECT * FROM category;
cid | cname
SET SESSION AUTHORIZATION regress_rls_exempt_user;
SET row_security TO OFF;
SELECT * FROM document;
- did | cid | dlevel | dauthor | dtitle
- -----+-----+--------+-------------------+-----------------------
+ did | cid | dlevel | dauthor | dtitle
+ -----+-----+--------+-------------------+-------------------------
1 | 11 | 1 | regress_rls_bob | my first novel
2 | 11 | 2 | regress_rls_bob | my second novel
- 3 | 22 | 2 | regress_rls_bob | my science fiction
- 4 | 44 | 1 | regress_rls_bob | my first manga
5 | 44 | 2 | regress_rls_bob | my second manga
6 | 22 | 1 | regress_rls_carol | great science fiction
- 7 | 33 | 2 | regress_rls_carol | great technology book
8 | 44 | 1 | regress_rls_carol | great manga
- 9 | 22 | 1 | regress_rls_dave | awesome science fiction
- 10 | 33 | 2 | regress_rls_dave | awesome technology book
- 11 | 33 | 1 | regress_rls_carol | hoge
-(11 rows)
+ 3 | 22 | 2 | regress_rls_bob | my science fiction
+ 4 | 44 | 1 | regress_rls_bob | my first manga
+ 7 | 33 | 2 | regress_rls_carol | great technology book
+ 10 | 33 | 1 | regress_rls_carol | hoge
+(9 rows)
SELECT * FROM category;
cid | cname
SET SESSION AUTHORIZATION regress_rls_alice;
SET row_security TO ON;
SELECT * FROM document;
- did | cid | dlevel | dauthor | dtitle
- -----+-----+--------+-------------------+-----------------------
+ did | cid | dlevel | dauthor | dtitle
+ -----+-----+--------+-------------------+-------------------------
1 | 11 | 1 | regress_rls_bob | my first novel
2 | 11 | 2 | regress_rls_bob | my second novel
- 3 | 22 | 2 | regress_rls_bob | my science fiction
- 4 | 44 | 1 | regress_rls_bob | my first manga
5 | 44 | 2 | regress_rls_bob | my second manga
6 | 22 | 1 | regress_rls_carol | great science fiction
- 7 | 33 | 2 | regress_rls_carol | great technology book
8 | 44 | 1 | regress_rls_carol | great manga
- 9 | 22 | 1 | regress_rls_dave | awesome science fiction
- 10 | 33 | 2 | regress_rls_dave | awesome technology book
- 11 | 33 | 1 | regress_rls_carol | hoge
-(11 rows)
+ 3 | 22 | 2 | regress_rls_bob | my science fiction
+ 4 | 44 | 1 | regress_rls_bob | my first manga
+ 7 | 33 | 2 | regress_rls_carol | great technology book
+ 10 | 33 | 1 | regress_rls_carol | hoge
+(9 rows)
SELECT * FROM category;
cid | cname
SET SESSION AUTHORIZATION regress_rls_alice;
SET row_security TO OFF;
SELECT * FROM document;
- did | cid | dlevel | dauthor | dtitle
- -----+-----+--------+-------------------+-----------------------
+ did | cid | dlevel | dauthor | dtitle
+ -----+-----+--------+-------------------+-------------------------
1 | 11 | 1 | regress_rls_bob | my first novel
2 | 11 | 2 | regress_rls_bob | my second novel
- 3 | 22 | 2 | regress_rls_bob | my science fiction
- 4 | 44 | 1 | regress_rls_bob | my first manga
5 | 44 | 2 | regress_rls_bob | my second manga
6 | 22 | 1 | regress_rls_carol | great science fiction
- 7 | 33 | 2 | regress_rls_carol | great technology book
8 | 44 | 1 | regress_rls_carol | great manga
- 9 | 22 | 1 | regress_rls_dave | awesome science fiction
- 10 | 33 | 2 | regress_rls_dave | awesome technology book
- 11 | 33 | 1 | regress_rls_carol | hoge
-(11 rows)
+ 3 | 22 | 2 | regress_rls_bob | my science fiction
+ 4 | 44 | 1 | regress_rls_bob | my first manga
+ 7 | 33 | 2 | regress_rls_carol | great technology book
+ 10 | 33 | 1 | regress_rls_carol | hoge
+(9 rows)
SELECT * FROM category;
cid | cname
(2 rows)
EXPLAIN (COSTS OFF) SELECT * FROM rls_view;
- QUERY PLAN
------------------------------------------
- Seq Scan on z1
- Filter: (((a % 2) = 0) AND f_leak(b))
-(2 rows)
-
+ QUERY PLAN
+-----------------------------------------------------
+ Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Subquery Scan on z1
+ Filter: f_leak(z1.b)
+ -> Seq Scan on z1 z1_1
+ Filter: ((a % 2) = 0)
+(5 rows)
-
-- Query as role that is not owner of table but is owner of view.
-- Should return records based on view owner policies.
SET SESSION AUTHORIZATION regress_rls_bob;
shoename char(10), -- primary key
sh_avail integer, -- available # of pairs
slcolor char(10), -- preferred shoelace color
- slminlen float, -- miminum shoelace length
+ slminlen float, -- minimum shoelace length
slmaxlen float, -- maximum shoelace length
slunit char(8) -- length unit
-);
+) distribute by roundrobin;
CREATE TABLE shoelace_data (
sl_name char(10), -- primary key
sl_avail integer, -- available # of pairs
(12 rows)
create rule r3 as on delete to rules_src do notify rules_src_deletion;
+ERROR: Rule may not use NOTIFY, it is not yet supported
\d+ rules_src
- Table "public.rules_src"
- Column | Type | Modifiers | Storage | Stats target | Description
- --------+---------+-----------+---------+--------------+-------------
- f1 | integer | | plain | |
- f2 | integer | | plain | |
+ Table "public.rules_src"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+ --------+---------+-----------+----------+---------+---------+--------------+-------------
+ f1 | integer | | | | plain | |
+ f2 | integer | | | | plain | |
Rules:
r1 AS
ON UPDATE TO rules_src DO INSERT INTO rules_log (f1, f2, tag) VALUES (old.f1,old.f2,'old'::text), (new.f1,new.f2,'new'::text)
line_tbl|f
log_table|f
lseg_tbl|f
-main_table|f
+main_table|t
+ mlparted|f
+ mlparted1|f
+ mlparted11|f
+ mlparted12|f
+ mlparted2|f
+ mlparted3|f
+ mlparted4|f
money_data|f
num_data|f
num_exp_add|t
pg_ts_template|t
pg_type|t
pg_user_mapping|t
+pgxc_class|t
+pgxc_group|t
+pgxc_node|t
point_tbl|t
polygon_tbl|t
+ quad_box_tbl|t
quad_point_tbl|t
radix_text_tbl|t
ramp|f
494 | 11 | 0 | 2 | 4 | 14 | 4 | 94 | 94 | 494 | 494 | 8 | 9 | ATAAAA | LAAAAA | VVVVxx
(1 row)
+ -- actually run the query with an analyze to use the partial index
+ explain (costs off, analyze on, timing off, summary off)
+ select * from onek2 where unique2 = 11 and stringu1 = 'ATAAAA';
+ QUERY PLAN
+ -----------------------------------------------------------------
+ Index Scan using onek2_u2_prtl on onek2 (actual rows=1 loops=1)
+ Index Cond: (unique2 = 11)
+ Filter: (stringu1 = 'ATAAAA'::name)
+ (3 rows)
+
explain (costs off)
select unique2 from onek2 where unique2 = 11 and stringu1 = 'ATAAAA';
- QUERY PLAN
------------------------------------------
- Index Scan using onek2_u2_prtl on onek2
- Index Cond: (unique2 = 11)
- Filter: (stringu1 = 'ATAAAA'::name)
-(3 rows)
+ QUERY PLAN
+-----------------------------------------------
+ Remote Fast Query Execution
+ Node/s: datanode_1, datanode_2
+ -> Index Scan using onek2_u2_prtl on onek2
+ Index Cond: (unique2 = 11)
+ Filter: (stringu1 = 'ATAAAA'::name)
+(5 rows)
select unique2 from onek2 where unique2 = 11 and stringu1 = 'ATAAAA';
unique2
explain (costs off)
select sum(parallel_restricted(unique1)) from tenk1
group by(parallel_restricted(unique1));
+ QUERY PLAN
+-----------------------------------------------------------
+ HashAggregate
+ Group Key: parallel_restricted(unique1)
+ -> Remote Subquery Scan on all (datanode_1,datanode_2)
+ -> Index Only Scan using tenk1_unique1 on tenk1
+(4 rows)
+
+ QUERY PLAN
+ -------------------------------------------------------------------
+ HashAggregate
+ Group Key: parallel_restricted(unique1)
+ -> Gather
+ Workers Planned: 4
+ -> Parallel Index Only Scan using tenk1_unique1 on tenk1
+ (5 rows)
+
+ -- test parallel plans for queries containing un-correlated subplans.
+ alter table tenk2 set (parallel_workers = 0);
+ explain (costs off)
+ select count(*) from tenk1 where (two, four) not in
+ (select hundred, thousand from tenk2 where thousand > 100);
+ QUERY PLAN
+ ------------------------------------------------------
+ Finalize Aggregate
+ -> Gather
+ Workers Planned: 4
+ -> Partial Aggregate
+ -> Parallel Seq Scan on tenk1
+ Filter: (NOT (hashed SubPlan 1))
+ SubPlan 1
+ -> Seq Scan on tenk2
+ Filter: (thousand > 100)
+ (9 rows)
+
+ select count(*) from tenk1 where (two, four) not in
+ (select hundred, thousand from tenk2 where thousand > 100);
+ count
+ -------
+ 10000
+ (1 row)
+
+ -- this is not parallel-safe due to use of random() within SubLink's testexpr:
+ explain (costs off)
+ select * from tenk1 where (unique1 + random())::integer not in
+ (select ten from tenk2);
+ QUERY PLAN
+ ------------------------------------
+ Seq Scan on tenk1
+ Filter: (NOT (hashed SubPlan 1))
+ SubPlan 1
+ -> Seq Scan on tenk2
+ (4 rows)
+
+ alter table tenk2 reset (parallel_workers);
+ -- test parallel index scans.
+ set enable_seqscan to off;
+ set enable_bitmapscan to off;
+ explain (costs off)
+ select count((unique1)) from tenk1 where hundred > 1;
+ QUERY PLAN
+ --------------------------------------------------------------------
+ Finalize Aggregate
+ -> Gather
+ Workers Planned: 4
+ -> Partial Aggregate
+ -> Parallel Index Scan using tenk1_hundred on tenk1
+ Index Cond: (hundred > 1)
+ (6 rows)
+
+ select count((unique1)) from tenk1 where hundred > 1;
+ count
+ -------
+ 9800
+ (1 row)
+
+ -- test parallel index-only scans.
+ explain (costs off)
+ select count(*) from tenk1 where thousand > 95;
+ QUERY PLAN
+ --------------------------------------------------------------------------------
+ Finalize Aggregate
+ -> Gather
+ Workers Planned: 4
+ -> Partial Aggregate
+ -> Parallel Index Only Scan using tenk1_thous_tenthous on tenk1
+ Index Cond: (thousand > 95)
+ (6 rows)
+
+ select count(*) from tenk1 where thousand > 95;
+ count
+ -------
+ 9040
+ (1 row)
+
+ reset enable_seqscan;
+ reset enable_bitmapscan;
+ -- test parallel bitmap heap scan.
+ set enable_seqscan to off;
+ set enable_indexscan to off;
+ set enable_hashjoin to off;
+ set enable_mergejoin to off;
+ set enable_material to off;
+ -- test prefetching, if the platform allows it
+ DO $$
+ BEGIN
+ SET effective_io_concurrency = 50;
+ EXCEPTION WHEN invalid_parameter_value THEN
+ END $$;
+ set work_mem='64kB'; --set small work mem to force lossy pages
+ explain (costs off)
+ select count(*) from tenk1, tenk2 where tenk1.hundred > 1 and tenk2.thousand=0;
+ QUERY PLAN
+ ------------------------------------------------------------
+ Aggregate
+ -> Nested Loop
+ -> Seq Scan on tenk2
+ Filter: (thousand = 0)
+ -> Gather
+ Workers Planned: 4
+ -> Parallel Bitmap Heap Scan on tenk1
+ Recheck Cond: (hundred > 1)
+ -> Bitmap Index Scan on tenk1_hundred
+ Index Cond: (hundred > 1)
+ (10 rows)
+
+ select count(*) from tenk1, tenk2 where tenk1.hundred > 1 and tenk2.thousand=0;
+ count
+ -------
+ 98000
+ (1 row)
+
+ create table bmscantest (a int, t text);
+ insert into bmscantest select r, 'fooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' FROM generate_series(1,100000) r;
+ create index i_bmtest ON bmscantest(a);
+ select count(*) from bmscantest where a>1;
+ count
+ -------
+ 99999
+ (1 row)
+
+ reset enable_seqscan;
+ reset enable_indexscan;
+ reset enable_hashjoin;
+ reset enable_mergejoin;
+ reset enable_material;
+ reset effective_io_concurrency;
+ reset work_mem;
+ drop table bmscantest;
+ -- test parallel merge join path.
+ set enable_hashjoin to off;
+ set enable_nestloop to off;
+ explain (costs off)
+ select count(*) from tenk1, tenk2 where tenk1.unique1 = tenk2.unique1;
+ QUERY PLAN
+ -------------------------------------------------------------------------------
+ Finalize Aggregate
+ -> Gather
+ Workers Planned: 4
+ -> Partial Aggregate
+ -> Merge Join
+ Merge Cond: (tenk1.unique1 = tenk2.unique1)
+ -> Parallel Index Only Scan using tenk1_unique1 on tenk1
+ -> Index Only Scan using tenk2_unique1 on tenk2
+ (8 rows)
+
+ select count(*) from tenk1, tenk2 where tenk1.unique1 = tenk2.unique1;
+ count
+ -------
+ 10000
+ (1 row)
+
+ reset enable_hashjoin;
+ reset enable_nestloop;
+ --test gather merge
+ set enable_hashagg to off;
+ explain (costs off)
+ select string4, count((unique2)) from tenk1 group by string4 order by string4;
+ QUERY PLAN
+ ----------------------------------------------------
+ Finalize GroupAggregate
+ Group Key: string4
+ -> Gather Merge
+ Workers Planned: 4
+ -> Partial GroupAggregate
+ Group Key: string4
+ -> Sort
+ Sort Key: string4
+ -> Parallel Seq Scan on tenk1
+ (9 rows)
+
+ select string4, count((unique2)) from tenk1 group by string4 order by string4;
+ string4 | count
+ ---------+-------
+ AAAAxx | 2500
+ HHHHxx | 2500
+ OOOOxx | 2500
+ VVVVxx | 2500
+ (4 rows)
+
+ reset enable_hashagg;
set force_parallel_mode=1;
explain (costs off)
select stringu1::int2 from tenk1 where unique1 = 1;
- QUERY PLAN
------------------------------------------------
- Gather
- Workers Planned: 1
- Single Copy: true
- -> Index Scan using tenk1_unique1 on tenk1
- Index Cond: (unique1 = 1)
-(5 rows)
+ QUERY PLAN
+-----------------------------------------------------
+ Remote Fast Query Execution
+ Node/s: datanode_1
+ -> Gather
+ Workers Planned: 1
+ Single Copy: true
+ -> Index Scan using tenk1_unique1 on tenk1
+ Index Cond: (unique1 = 1)
+(7 rows)
+
+do $$begin
+ -- Provoke error, possibly in worker. If this error happens to occur in
+ -- the worker, there will be a CONTEXT line which must be hidden.
+ perform stringu1::int2 from tenk1 where unique1 = 1;
+ exception
+ when others then
+ raise 'SQLERRM: %', sqlerrm;
+end$$;
+ERROR: Internal subtransactions not supported in Postgres-XL
+CONTEXT: PL/pgSQL function inline_code_block line 1 during statement block entry
+
+ -- to increase the parallel query test coverage
+ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1;
+ QUERY PLAN
+ -------------------------------------------------------------
+ Gather (actual rows=10000 loops=1)
+ Workers Planned: 4
+ Workers Launched: 4
+ -> Parallel Seq Scan on tenk1 (actual rows=2000 loops=5)
+ (4 rows)
+
+ -- provoke error in worker
+ select stringu1::int2 from tenk1 where unique1 = 1;
+ ERROR: invalid input syntax for integer: "BAAAAA"
+ CONTEXT: parallel worker
rollback;
Whitlock Creek | [(-121.74683,37.91276),(-121.733107,37)] | Oakland
Whitlock Creek | [(-121.74683,37.91276),(-121.733107,37)] | Oakland
Willimet Way | [(-122.0964,37.517),(-122.0949,37.493)] | Oakland
- Wisconsin St | [(-122.1994,37.017),(-122.1975,37.998),(-122.1971,37.994)] | Oakland
Wisconsin St | [(-122.1994,37.017),(-122.1975,37.998),(-122.1971,37.994)] | Berkeley
+ Wisconsin St | [(-122.1994,37.017),(-122.1975,37.998),(-122.1971,37.994)] | Oakland
Wp Railroad | [(-122.254,37.902),(-122.2506,37.891)] | Berkeley
- 100th Ave | [(-122.1657,37.429),(-122.1647,37.432)] | Oakland
- 107th Ave | [(-122.1555,37.403),(-122.1531,37.41)] | Oakland
- 14th St | [(-122.299,37.147),(-122.3,37.148)] | Lafayette
- 19th Ave | [(-122.2366,37.897),(-122.2359,37.905)] | Berkeley
- 1st St | [(-121.75508,37.89294),(-121.753581,37.90031)] | Oakland
- 5th St | [(-122.278,37),(-122.2792,37.005),(-122.2803,37.009)] | Lafayette
- 5th St | [(-122.296,37.615),(-122.2953,37.598)] | Berkeley
- 82nd Ave | [(-122.1695,37.596),(-122.1681,37.603)] | Berkeley
- 85th Ave | [(-122.1877,37.466),(-122.186,37.476)] | Oakland
- 89th Ave | [(-122.1822,37.459),(-122.1803,37.471)] | Oakland
- 98th Ave | [(-122.1568,37.498),(-122.1558,37.502)] | Oakland
- 98th Ave | [(-122.1693,37.438),(-122.1682,37.444)] | Oakland
- 98th Ave | [(-122.2001,37.258),(-122.1974,37.27)] | Lafayette
(333 rows)
- SELECT name, #thepath FROM iexit ORDER BY 1, 2;
- name | ?column?
- ------+----------
- (0 rows)
-SELECT name, #thepath FROM iexit ORDER BY name COLLATE "C", 2;
- name | ?column?
-------------------------------------+----------
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 2
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 3
- I- 580 | 4
- I- 580 | 4
- I- 580 | 4
- I- 580 | 4
- I- 580 | 5
- I- 580 | 5
- I- 580 | 5
- I- 580 | 5
- I- 580 | 5
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 6
- I- 580 | 7
- I- 580 | 7
- I- 580 | 7
- I- 580 | 7
- I- 580 | 7
- I- 580 | 7
- I- 580 | 7
- I- 580 | 8
- I- 580 | 8
- I- 580 | 8
- I- 580 | 8
- I- 580 | 8
- I- 580 | 8
- I- 580 | 8
- I- 580 | 8
- I- 580 | 8
- I- 580 | 9
- I- 580 | 9
- I- 580 | 9
- I- 580 | 9
- I- 580 | 9
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 12
- I- 580 | 13
- I- 580 | 13
- I- 580 | 13
- I- 580 | 13
- I- 580 | 13
- I- 580 | 13
- I- 580 | 14
- I- 580 | 14
- I- 580 | 14
- I- 580 | 14
- I- 580 | 14
- I- 580 | 14
- I- 580 | 14
- I- 580 | 14
- I- 580 | 18
- I- 580 | 18
- I- 580 | 18
- I- 580 | 18
- I- 580 | 18
- I- 580 | 18
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 21
- I- 580 | 22
- I- 580 | 22
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 2
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 3
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 4
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 5
- I- 580 Ramp | 6
- I- 580 Ramp | 6
- I- 580 Ramp | 6
- I- 580 Ramp | 7
- I- 580 Ramp | 8
- I- 580 Ramp | 8
- I- 580 Ramp | 8
- I- 580 Ramp | 8
- I- 580 Ramp | 8
- I- 580 Ramp | 8
- I- 580/I-680 Ramp | 2
- I- 580/I-680 Ramp | 2
- I- 580/I-680 Ramp | 2
- I- 580/I-680 Ramp | 2
- I- 580/I-680 Ramp | 2
- I- 580/I-680 Ramp | 2
- I- 580/I-680 Ramp | 4
- I- 580/I-680 Ramp | 4
- I- 580/I-680 Ramp | 4
- I- 580/I-680 Ramp | 4
- I- 580/I-680 Ramp | 5
- I- 580/I-680 Ramp | 6
- I- 580/I-680 Ramp | 6
- I- 580/I-680 Ramp | 6
- I- 680 | 2
- I- 680 | 2
- I- 680 | 2
- I- 680 | 2
- I- 680 | 2
- I- 680 | 2
- I- 680 | 2
- I- 680 | 3
- I- 680 | 3
- I- 680 | 3
- I- 680 | 4
- I- 680 | 4
- I- 680 | 4
- I- 680 | 5
- I- 680 | 5
- I- 680 | 5
- I- 680 | 7
- I- 680 | 7
- I- 680 | 7
- I- 680 | 7
- I- 680 | 8
- I- 680 | 8
- I- 680 | 8
- I- 680 | 8
- I- 680 | 10
- I- 680 | 10
- I- 680 | 10
- I- 680 | 10
- I- 680 | 10
- I- 680 | 10
- I- 680 | 10
- I- 680 | 16
- I- 680 | 16
- I- 680 | 16
- I- 680 | 16
- I- 680 | 16
- I- 680 | 16
- I- 680 | 16
- I- 680 | 16
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 2
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 3
- I- 680 Ramp | 4
- I- 680 Ramp | 4
- I- 680 Ramp | 4
- I- 680 Ramp | 5
- I- 680 Ramp | 5
- I- 680 Ramp | 5
- I- 680 Ramp | 5
- I- 680 Ramp | 5
- I- 680 Ramp | 5
- I- 680 Ramp | 6
- I- 680 Ramp | 6
- I- 680 Ramp | 6
- I- 680 Ramp | 6
- I- 680 Ramp | 7
- I- 680 Ramp | 7
- I- 680 Ramp | 7
- I- 680 Ramp | 7
- I- 680 Ramp | 8
- I- 680 Ramp | 8
- I- 680 Ramp | 8
- I- 680 Ramp | 8
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 2
- I- 80 | 3
- I- 80 | 3
- I- 80 | 3
- I- 80 | 4
- I- 80 | 4
- I- 80 | 4
- I- 80 | 4
- I- 80 | 4
- I- 80 | 5
- I- 80 | 5
- I- 80 | 5
- I- 80 | 5
- I- 80 | 5
- I- 80 | 5
- I- 80 | 5
- I- 80 | 5
- I- 80 | 5
- I- 80 | 11
- I- 80 | 11
- I- 80 | 11
- I- 80 | 11
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 2
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 3
- I- 80 Ramp | 4
- I- 80 Ramp | 4
- I- 80 Ramp | 4
- I- 80 Ramp | 4
- I- 80 Ramp | 5
- I- 80 Ramp | 5
- I- 80 Ramp | 5
- I- 80 Ramp | 5
- I- 80 Ramp | 5
- I- 80 Ramp | 5
- I- 80 Ramp | 5
- I- 80 Ramp | 7
- I- 80 Ramp | 7
- I- 80 Ramp | 7
- I- 80 Ramp | 7
- I- 880 | 2
- I- 880 | 2
- I- 880 | 2
- I- 880 | 2
- I- 880 | 2
- I- 880 | 5
- I- 880 | 5
- I- 880 | 5
- I- 880 | 5
- I- 880 | 5
- I- 880 | 5
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 6
- I- 880 | 7
- I- 880 | 7
- I- 880 | 7
- I- 880 | 7
- I- 880 | 7
- I- 880 | 7
- I- 880 | 7
- I- 880 | 9
- I- 880 | 9
- I- 880 | 9
- I- 880 | 9
- I- 880 | 9
- I- 880 | 9
- I- 880 | 9
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 10
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 12
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 13
- I- 880 | 14
- I- 880 | 14
- I- 880 | 14
- I- 880 | 14
- I- 880 | 14
- I- 880 | 14
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 17
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 | 19
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 2
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 3
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 4
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 5
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 6
- I- 880 Ramp | 8
- I- 880 Ramp | 8
- I- 880 Ramp | 8
- I- 980 | 2
- I- 980 | 2
- I- 980 | 2
- I- 980 | 2
- I- 980 | 2
- I- 980 | 2
- I- 980 | 2
- I- 980 | 2
- I- 980 | 3
- I- 980 | 3
- I- 980 | 3
- I- 980 | 3
- I- 980 | 3
- I- 980 | 3
- I- 980 | 3
- I- 980 | 3
- I- 980 | 3
- I- 980 | 4
- I- 980 | 4
- I- 980 | 5
- I- 980 | 5
- I- 980 | 7
- I- 980 | 7
- I- 980 | 7
- I- 980 | 7
- I- 980 | 12
- I- 980 Ramp | 3
- I- 980 Ramp | 3
- I- 980 Ramp | 3
- I- 980 Ramp | 7
-(896 rows)
SELECT * FROM toyemp WHERE name = 'sharon';
name | age | location | annualsal
DROP USER regress_seq_user;
DROP SEQUENCE seq;
+create table test_seqtab (unique1 int, unique2 int);
+insert into test_seqtab select i, i from generate_series(1,1000) s(i);
+create temp sequence testseq;
+select distinct(nextval('testseq'))
+ from test_seqtab order by 1 limit 10;
+ nextval
+---------
+ 1
+ 2
+ 3
+ 4
+ 5
+ 6
+ 7
+ 8
+ 9
+ 10
+(10 rows)
+
+drop table test_seqtab;
+ -- cache tests
+ CREATE SEQUENCE test_seq1 CACHE 10;
+ SELECT nextval('test_seq1');
+ nextval
+ ---------
+ 1
+ (1 row)
+
+ SELECT nextval('test_seq1');
+ nextval
+ ---------
+ 2
+ (1 row)
+
+ SELECT nextval('test_seq1');
+ nextval
+ ---------
+ 3
+ (1 row)
+
+ DROP SEQUENCE test_seq1;
11
(1 row)
+SELECT setseed(0);
+ setseed
+---------
+
+(1 row)
+
+-- DROP TABLE IF EXISTS asd ;
+CREATE TABLE IF NOT EXISTS asd AS
+SELECT clientid::numeric(20),
+ (clientid / 20 )::integer::numeric(20) as userid,
+ cts + ((random()* 3600 *24 )||'sec')::interval as cts,
+ (ARRAY['A','B','C','D','E','F'])[(random()*5+1)::integer] as state,
+ 0 as dim,
+ ((ARRAY['Cat','Dog','Duck'])[(clientid / 10 )% 3 +1 ]) ::text as app_name,
+ ((ARRAY['A','B'])[(clientid / 10 )% 2 +1 ]) ::text as platform
+ FROM generate_series('2016-01-01'::timestamp,'2016-10-01'::timestamp,interval '15 day') cts , generate_series( 1000,2000,10) clientid , generate_series(1,6) t
+;
+SELECT dates::timestamp as dates ,B.platform,B.app_name, B.clientid, B.userid,
+ B.state as state
+FROM ( VALUES
+('2016.08.30. 08:52:43') ,('2016.08.29. 04:57:12') ,('2016.08.26. 08:15:05') ,
+('2016.08.24. 11:49:51') ,('2016.08.22. 08:45:29') ,('2016.08.21. 04:53:47') ,('2016.08.20. 08:44:03')
+) AS D (dates)
+JOIN
+( SELECT DISTINCT clientid FROM asd
+ WHERE userid=74 ) C ON True
+INNER JOIN LATERAL (
+ SELECT DISTINCT ON (clientid,app_name,platform,state,dim) x.*
+ FROM asd x
+ INNER JOIN (SELECT p.clientid,p.app_name,p.platform , p.state, p.dim ,
+ MAX(p.cts) AS selected_cts
+ FROM asd p
+ where cts<D.dates::timestamp and state in
+ ('A','B')
+ GROUP BY p.clientid,p.app_name,p.platform,p.state,p.dim) y
+ ON y.clientid = x.clientid
+ AND y.selected_cts = x.cts
+ AND y.platform = x.platform
+ AND y.app_name=x.app_name
+ AND y.state=x.state
+ AND y.dim = x.dim
+ and x.clientid = C.clientid
+) B ON True
+ORDER BY dates desc, state;
+ dates | platform | app_name | clientid | userid | state
+--------------------------+----------+----------+----------+--------+-------
+ Tue Aug 30 08:52:43 2016 | A | Dog | 1480 | 74 | A
+ Tue Aug 30 08:52:43 2016 | B | Duck | 1490 | 74 | A
+ Tue Aug 30 08:52:43 2016 | A | Dog | 1480 | 74 | B
+ Tue Aug 30 08:52:43 2016 | B | Duck | 1490 | 74 | B
+ Mon Aug 29 04:57:12 2016 | A | Dog | 1480 | 74 | A
+ Mon Aug 29 04:57:12 2016 | B | Duck | 1490 | 74 | A
+ Mon Aug 29 04:57:12 2016 | A | Dog | 1480 | 74 | B
+ Mon Aug 29 04:57:12 2016 | B | Duck | 1490 | 74 | B
+ Fri Aug 26 08:15:05 2016 | B | Duck | 1490 | 74 | A
+ Fri Aug 26 08:15:05 2016 | A | Dog | 1480 | 74 | A
+ Fri Aug 26 08:15:05 2016 | B | Duck | 1490 | 74 | B
+ Fri Aug 26 08:15:05 2016 | A | Dog | 1480 | 74 | B
+ Wed Aug 24 11:49:51 2016 | A | Dog | 1480 | 74 | A
+ Wed Aug 24 11:49:51 2016 | B | Duck | 1490 | 74 | A
+ Wed Aug 24 11:49:51 2016 | A | Dog | 1480 | 74 | B
+ Wed Aug 24 11:49:51 2016 | B | Duck | 1490 | 74 | B
+ Mon Aug 22 08:45:29 2016 | B | Duck | 1490 | 74 | A
+ Mon Aug 22 08:45:29 2016 | A | Dog | 1480 | 74 | A
+ Mon Aug 22 08:45:29 2016 | B | Duck | 1490 | 74 | B
+ Mon Aug 22 08:45:29 2016 | A | Dog | 1480 | 74 | B
+ Sun Aug 21 04:53:47 2016 | B | Duck | 1490 | 74 | A
+ Sun Aug 21 04:53:47 2016 | A | Dog | 1480 | 74 | A
+ Sun Aug 21 04:53:47 2016 | B | Duck | 1490 | 74 | B
+ Sun Aug 21 04:53:47 2016 | A | Dog | 1480 | 74 | B
+ Sat Aug 20 08:44:03 2016 | A | Dog | 1480 | 74 | A
+ Sat Aug 20 08:44:03 2016 | B | Duck | 1490 | 74 | A
+ Sat Aug 20 08:44:03 2016 | B | Duck | 1490 | 74 | B
+ Sat Aug 20 08:44:03 2016 | A | Dog | 1480 | 74 | B
+(28 rows)
+
+DROP TABLE asd;
+SELECT setseed(0);
+ setseed
+---------
+
+(1 row)
+
+ --
+ -- Check that volatile quals aren't pushed down past a set-returning function;
+ -- while a nonvolatile qual can be, if it doesn't reference the SRF.
+ --
+ create function tattle(x int, y int) returns bool
+ volatile language plpgsql as $$
+ begin
+ raise notice 'x = %, y = %', x, y;
+ return x > y;
+ end$$;
+ explain (verbose, costs off)
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+ QUERY PLAN
+ ----------------------------------------------------------
+ Subquery Scan on ss
+ Output: x, u
+ Filter: tattle(ss.x, 8)
+ -> ProjectSet
+ Output: 9, unnest('{1,2,3,11,12,13}'::integer[])
+ -> Result
+ (6 rows)
+
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+ NOTICE: x = 9, y = 8
+ NOTICE: x = 9, y = 8
+ NOTICE: x = 9, y = 8
+ NOTICE: x = 9, y = 8
+ NOTICE: x = 9, y = 8
+ NOTICE: x = 9, y = 8
+ x | u
+ ---+----
+ 9 | 1
+ 9 | 2
+ 9 | 3
+ 9 | 11
+ 9 | 12
+ 9 | 13
+ (6 rows)
+
+ -- if we pretend it's stable, we get different results:
+ alter function tattle(x int, y int) stable;
+ explain (verbose, costs off)
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+ QUERY PLAN
+ ----------------------------------------------------
+ ProjectSet
+ Output: 9, unnest('{1,2,3,11,12,13}'::integer[])
+ -> Result
+ One-Time Filter: tattle(9, 8)
+ (4 rows)
+
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+ NOTICE: x = 9, y = 8
+ x | u
+ ---+----
+ 9 | 1
+ 9 | 2
+ 9 | 3
+ 9 | 11
+ 9 | 12
+ 9 | 13
+ (6 rows)
+
+ -- although even a stable qual should not be pushed down if it references SRF
+ explain (verbose, costs off)
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, u);
+ QUERY PLAN
+ ----------------------------------------------------------
+ Subquery Scan on ss
+ Output: x, u
+ Filter: tattle(ss.x, ss.u)
+ -> ProjectSet
+ Output: 9, unnest('{1,2,3,11,12,13}'::integer[])
+ -> Result
+ (6 rows)
+
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, u);
+ NOTICE: x = 9, y = 1
+ NOTICE: x = 9, y = 2
+ NOTICE: x = 9, y = 3
+ NOTICE: x = 9, y = 11
+ NOTICE: x = 9, y = 12
+ NOTICE: x = 9, y = 13
+ x | u
+ ---+---
+ 9 | 1
+ 9 | 2
+ 9 | 3
+ (3 rows)
+
+ drop function tattle(x int, y int);
4567890123456789
(2 rows)
-(((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl ORDER BY 1))) UNION ALL SELECT q2 FROM int8_tbl;
+(((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl))) UNION ALL SELECT q2 FROM int8_tbl ORDER BY 1;
q1
-------------------
+ -4567890123456789
+ 123
123
+ 4567890123456789
456
4567890123456789
- 123
4567890123456789
- -4567890123456789
+ 4567890123456789
(7 rows)
SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
-- Should succeed
DROP TABLESPACE regress_tblspace_renamed;
DROP SCHEMA testschema CASCADE;
-NOTICE: drop cascades to 5 other objects
+NOTICE: drop cascades to 3 other objects
DETAIL: drop cascades to table testschema.foo
drop cascades to table testschema.asselect
-drop cascades to table testschema.asexecute
drop cascades to table testschema.atable
+ drop cascades to table testschema.tablespace_acl
DROP ROLE regress_tablespace_user1;
DROP ROLE regress_tablespace_user2;
# NB: temp.sql does a reconnect which transiently uses 2 connections,
# so keep this parallel group to at most 19 tests
# ----------
-test: plancache limit plpgsql copy2 temp domain rangefuncs prepare without_oid conversion truncate alter_table sequence polymorphism rowtypes returning largeobject with xml
+test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion truncate alter_table sequence polymorphism rowtypes returning largeobject with xml
+ # ----------
+ # Another group of parallel tests
+ # ----------
+ test: identity
+
# event triggers cannot run concurrently with any test that runs DDL
test: event_trigger
/* initdb */
header(_("initializing database system"));
+#ifdef PGXC
+ /* Initialize nodes and GTM */
+ initdb_node(PGXC_GTM);
+ initdb_node(PGXC_COORD_1);
+ initdb_node(PGXC_COORD_2);
+ initdb_node(PGXC_DATANODE_1);
+ initdb_node(PGXC_DATANODE_2);
+#else
snprintf(buf, sizeof(buf),
- "\"%s%sinitdb\" -D \"%s/data\" --noclean --nosync%s%s > \"%s/log/initdb.log\" 2>&1",
+ "\"%s%sinitdb\" -D \"%s/data\" --no-clean --no-sync%s%s > \"%s/log/initdb.log\" 2>&1",
bindir ? bindir : "",
bindir ? "/" : "",
temp_instance,
select null::int = all ('{1,2,3}');
select 33 = all ('{1,null,3}');
select 33 = all ('{33,null,33}');
+ -- nulls later in the bitmap
+ SELECT -1 != ALL(ARRAY(SELECT NULLIF(g.i, 900) FROM generate_series(1,1000) g(i)));
-- test indexes on arrays
-create temp table arr_tbl (f1 int[] unique);
+-- PGXCTODO: related to feature request 3520520, this distribution type is changed
+-- to replication. As integer arrays are not available distribution types, this table
+-- should use roundrobin distribution if nothing is specified but roundrobin
+-- distribution cannot be safely used to check constraints on remote nodes.
+-- When global constraints are supported, this replication distribution should be removed.
+create temp table arr_tbl (f1 int[] unique) distribute by replication;
insert into arr_tbl values ('{1,2,3}');
insert into arr_tbl values ('{1,2}');
-- failure expected:
COMMIT;
-SELECT ctid,cmin,* FROM combocidtest;
+SELECT ctid,cmin,* FROM combocidtest ORDER BY ctid;
+
+ -- test for bug reported in
+ -- CABRT9RC81YUf1=jsmWopcKJEro=VoeG2ou6sPwyOUTx_qteRsg@mail.gmail.com
+ CREATE TABLE IF NOT EXISTS testcase(
+ id int PRIMARY KEY,
+ balance numeric
+ );
+ INSERT INTO testcase VALUES (1, 0);
+ BEGIN;
+ SELECT * FROM testcase WHERE testcase.id = 1 FOR UPDATE;
+ UPDATE testcase SET balance = balance + 400 WHERE id=1;
+ SAVEPOINT subxact;
+ UPDATE testcase SET balance = balance - 100 WHERE id=1;
+ ROLLBACK TO SAVEPOINT subxact;
+ -- should return one tuple
+ SELECT * FROM testcase WHERE id = 1 FOR UPDATE;
+ ROLLBACK;
+ DROP TABLE testcase;
SELECT * FROM inhg; /* Two records with three columns in order x=x, xx=text, y=y */
DROP TABLE inhg;
-CREATE TABLE inhg (x text, LIKE inhx INCLUDING INDEXES, y text); /* copies indexes */
+ CREATE TABLE test_like_id_1 (a int GENERATED ALWAYS AS IDENTITY, b text);
+ \d test_like_id_1
+ INSERT INTO test_like_id_1 (b) VALUES ('b1');
+ SELECT * FROM test_like_id_1;
+ CREATE TABLE test_like_id_2 (LIKE test_like_id_1);
+ \d test_like_id_2
+ INSERT INTO test_like_id_2 (b) VALUES ('b2');
+ SELECT * FROM test_like_id_2; -- identity was not copied
+ CREATE TABLE test_like_id_3 (LIKE test_like_id_1 INCLUDING IDENTITY);
+ \d test_like_id_3
+ INSERT INTO test_like_id_3 (b) VALUES ('b3');
+ SELECT * FROM test_like_id_3; -- identity was copied and applied
+ DROP TABLE test_like_id_1, test_like_id_2, test_like_id_3;
+
+CREATE TABLE inhg (x text, LIKE inhx INCLUDING INDEXES, y text) DISTRIBUTE BY REPLICATION; /* copies indexes */
INSERT INTO inhg VALUES (5, 10);
INSERT INTO inhg VALUES (20, 10); -- should fail
DROP TABLE inhg;
explain (costs off) delete from t1 where a = 1;
delete from t1 where a = 1;
+drop rule r1 on t1;
+
+explain (costs off, nodes off) delete from t1 where a = 1;
+delete from t1 where a = 1;
+ --
+ -- Test deferred FK check on a tuple deleted by a rolled-back subtransaction
+ --
+ create table pktable2(f1 int primary key);
+ create table fktable2(f1 int references pktable2 deferrable initially deferred);
+ insert into pktable2 values(1);
+
+ begin;
+ insert into fktable2 values(1);
+ savepoint x;
+ delete from fktable2;
+ rollback to x;
+ commit;
+
+ begin;
+ insert into fktable2 values(2);
+ savepoint x;
+ delete from fktable2;
+ rollback to x;
+ commit; -- fail
+
+ --
+ -- Test that we prevent dropping FK constraint with pending trigger events
+ --
+ begin;
+ insert into fktable2 values(2);
+ alter table fktable2 drop constraint fktable2_f1_fkey;
+ commit;
+
+ begin;
+ delete from pktable2 where f1 = 1;
+ alter table fktable2 drop constraint fktable2_f1_fkey;
+ commit;
+
+ drop table pktable2, fktable2;
SET enable_seqscan TO on;
DROP INDEX inet_idx2;
+ -- check that spgist index works correctly
+ CREATE INDEX inet_idx3 ON inet_tbl using spgist (i);
+ SET enable_seqscan TO off;
+ SELECT * FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i <<= '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i && '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i >>= '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i >> '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i < '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i <= '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i = '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i >= '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i > '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT * FROM inet_tbl WHERE i <> '192.168.1.0/24'::cidr ORDER BY i;
+
+ -- test index-only scans
+ EXPLAIN (COSTS OFF)
+ SELECT i FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+ SELECT i FROM inet_tbl WHERE i << '192.168.1.0/24'::cidr ORDER BY i;
+
+ SET enable_seqscan TO on;
+ DROP INDEX inet_idx3;
+
-- simple tests of inet boolean and arithmetic operators
-SELECT i, ~i AS "~i" FROM inet_tbl;
-SELECT i, c, i & c AS "and" FROM inet_tbl;
-SELECT i, c, i | c AS "or" FROM inet_tbl;
-SELECT i, i + 500 AS "i+500" FROM inet_tbl;
-SELECT i, i - 500 AS "i-500" FROM inet_tbl;
-SELECT i, c, i - c AS "minus" FROM inet_tbl;
+SELECT i, ~i AS "~i" FROM inet_tbl ORDER BY i;
+SELECT i, c, i & c AS "and" FROM inet_tbl ORDER BY i, c;
+SELECT i, c, i | c AS "or" FROM inet_tbl ORDER BY i, c;
+SELECT i, i + 500 AS "i+500" FROM inet_tbl ORDER BY i;
+SELECT i, i - 500 AS "i-500" FROM inet_tbl ORDER BY i;
+SELECT i, c, i - c AS "minus" FROM inet_tbl ORDER BY i, c;
SELECT '127.0.0.1'::inet + 257;
SELECT ('127.0.0.1'::inet + 257) - 257;
SELECT '127::1'::inet + 257;
( select f1 from foo union all select f1+3 from foo ) ss
where bar.f1 = ss.f1;
-select tableoid::regclass::text as relname, bar.* from bar order by 1,2;
+--select tableoid::regclass::text as relname, bar.* from bar order by 1,2;
+ -- Check UPDATE with *partitioned* inherited target and an appendrel subquery
+ create table some_tab (a int);
+ insert into some_tab values (0);
+ create table some_tab_child () inherits (some_tab);
+ insert into some_tab_child values (1);
+ create table parted_tab (a int, b char) partition by list (a);
+ create table parted_tab_part1 partition of parted_tab for values in (1);
+ create table parted_tab_part2 partition of parted_tab for values in (2);
+ create table parted_tab_part3 partition of parted_tab for values in (3);
+ insert into parted_tab values (1, 'a'), (2, 'a'), (3, 'a');
+
+ update parted_tab set b = 'b'
+ from
+ (select a from some_tab union all select a+1 from some_tab) ss (a)
+ where parted_tab.a = ss.a;
+ select tableoid::regclass::text as relname, parted_tab.* from parted_tab order by 1,2;
+
+ truncate parted_tab;
+ insert into parted_tab values (1, 'a'), (2, 'a'), (3, 'a');
+ update parted_tab set b = 'b'
+ from
+ (select 0 from parted_tab union all select 1 from parted_tab) ss (a)
+ where parted_tab.a = ss.a;
+ select tableoid::regclass::text as relname, parted_tab.* from parted_tab order by 1,2;
+
+ drop table parted_tab;
+ drop table some_tab cascade;
+
/* Test multiple inheritance of column defaults */
CREATE TABLE firstparent (tomorrow date default now()::date + 1);
tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x;
-- lateral injecting a strange outer join condition
-explain (costs off)
+explain (num_nodes off, nodes off, costs off)
select * from int8_tbl a,
int8_tbl x left join lateral (select a.q1 from int4_tbl y) ss(z)
- on x.q2 = ss.z;
+ on x.q2 = ss.z
+ order by a.q1, a.q2, x.q1, x.q2, ss.z;
select * from int8_tbl a,
int8_tbl x left join lateral (select a.q1 from int4_tbl y) ss(z)
- on x.q2 = ss.z;
+ on x.q2 = ss.z
+ order by a.q1, a.q2, x.q1, x.q2, ss.z;
-- lateral reference to a join alias variable
select * from (select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1,
delete from xx1 using (select * from int4_tbl where f1 = xx1.x1) ss;
delete from xx1 using lateral (select * from int4_tbl where f1 = x1) ss;
+-- demonstrate problem with extremely slow join
+CREATE TABLE testr (a int, b int) DISTRIBUTE BY REPLICATION;
+INSERT INTO testr SELECT generate_series(1, 10000), generate_series(5001, 15000);
+CREATE TABLE testh (a int, b int);
+INSERT INTO testh SELECT generate_series(1, 10000), generate_series(8001, 18000);
+set enable_mergejoin TO false;
+set enable_hashjoin TO false;
+EXPLAIN (VERBOSE, COSTS OFF) SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b);
+SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b);
++
+ --
+ -- test planner's ability to mark joins as unique
+ --
+
+ create table j1 (id int primary key);
+ create table j2 (id int primary key);
+ create table j3 (id int);
+
+ insert into j1 values(1),(2),(3);
+ insert into j2 values(1),(2),(3);
+ insert into j3 values(1),(1);
+
+ analyze j1;
+ analyze j2;
+ analyze j3;
+
+ -- ensure join is properly marked as unique
+ explain (verbose, costs off)
+ select * from j1 inner join j2 on j1.id = j2.id;
+
+ -- ensure join is not unique when not an equi-join
+ explain (verbose, costs off)
+ select * from j1 inner join j2 on j1.id > j2.id;
+
+ -- ensure non-unique rel is not chosen as inner
+ explain (verbose, costs off)
+ select * from j1 inner join j3 on j1.id = j3.id;
+
+ -- ensure left join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 left join j2 on j1.id = j2.id;
+
+ -- ensure right join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 right join j2 on j1.id = j2.id;
+
+ -- ensure full join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 full join j2 on j1.id = j2.id;
+
+ -- a clauseless (cross) join can't be unique
+ explain (verbose, costs off)
+ select * from j1 cross join j2;
+
+ -- ensure a natural join is marked as unique
+ explain (verbose, costs off)
+ select * from j1 natural join j2;
+
+ -- ensure a distinct clause allows the inner to become unique
+ explain (verbose, costs off)
+ select * from j1
+ inner join (select distinct id from j3) j3 on j1.id = j3.id;
+
+ -- ensure group by clause allows the inner to become unique
+ explain (verbose, costs off)
+ select * from j1
+ inner join (select id from j3 group by id) j3 on j1.id = j3.id;
+
+ drop table j1;
+ drop table j2;
+ drop table j3;
+
+ -- test more complex permutations of unique joins
+
+ create table j1 (id1 int, id2 int, primary key(id1,id2));
+ create table j2 (id1 int, id2 int, primary key(id1,id2));
+ create table j3 (id1 int, id2 int, primary key(id1,id2));
+
+ insert into j1 values(1,1),(1,2);
+ insert into j2 values(1,1);
+ insert into j3 values(1,1);
+
+ analyze j1;
+ analyze j2;
+ analyze j3;
+
+ -- ensure there's no unique join when not all columns which are part of the
+ -- unique index are seen in the join clause
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1;
+
+ -- ensure proper unique detection with multiple join quals
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2;
+
+ -- ensure we don't detect the join to be unique when quals are not part of the
+ -- join condition
+ explain (verbose, costs off)
+ select * from j1
+ inner join j2 on j1.id1 = j2.id1 where j1.id2 = 1;
+
+ -- as above, but for left joins.
+ explain (verbose, costs off)
+ select * from j1
+ left join j2 on j1.id1 = j2.id1 where j1.id2 = 1;
+
+ -- validate logic in merge joins which skips mark and restore.
+ -- it should only do this if all quals which were used to detect the unique
+ -- are present as join quals, and not plain quals.
+ set enable_nestloop to 0;
+ set enable_hashjoin to 0;
+ set enable_sort to 0;
+
+ -- create an index that will be preferred over the PK to perform the join
+ create index j1_id1_idx on j1 (id1) where id1 % 1000 = 1;
+
+ explain (costs off) select * from j1 j1
+ inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
+ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
+
+ select * from j1 j1
+ inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
+ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
+
+ reset enable_nestloop;
+ reset enable_hashjoin;
+ reset enable_sort;
+
+ drop table j1;
+ drop table j2;
+ drop table j3;
+
+ -- check that semijoin inner is not seen as unique for a portion of the outerrel
+ explain (verbose, costs off)
+ select t1.unique1, t2.hundred
+ from onek t1, tenk1 t2
+ where exists (select 1 from tenk1 t3
+ where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred)
+ and t1.unique1 < 1;
+
+ -- ... unless it actually is unique
+ create table j3 as select unique1, tenthous from onek;
+ vacuum analyze j3;
+ create unique index on j3(unique1, tenthous);
+
+ explain (verbose, costs off)
+ select t1.unique1, t2.hundred
+ from onek t1, tenk1 t2
+ where exists (select 1 from j3
+ where j3.unique1 = t1.unique1 and j3.tenthous = t2.hundred)
+ and t1.unique1 < 1;
+
+ drop table j3;
SELECT (-12345)::money;
SELECT (-1234567890)::money;
SELECT (-12345678901234567)::money;
- SELECT (-123456789012345678)::money;
- SELECT (-9223372036854775808)::money;
SELECT 1234567890::int4::money;
SELECT 12345678901234567::int8::money;
+ SELECT 12345678901234567::numeric::money;
SELECT (-1234567890)::int4::money;
SELECT (-12345678901234567)::int8::money;
+
+INSERT INTO money_data VALUES ('$223.459');
+INSERT INTO money_data VALUES ('$323.459');
+INSERT INTO money_data VALUES ('$423.459');
+INSERT INTO money_data VALUES ('$523.459');
+SELECT sum(m) FROM money_data;
+
+CREATE TABLE money_data2 (a int, m money);
+INSERT INTO money_data2 VALUES (1, '$123.459');
+INSERT INTO money_data2 VALUES (2, '$223.459');
+INSERT INTO money_data2 VALUES (3, '$323.459');
+INSERT INTO money_data2 VALUES (4, '$423.459');
+INSERT INTO money_data2 VALUES (5, '$523.459');
+SELECT sum(m) FROM money_data2;
+DROP TABLE money_data2;
++
+ SELECT (-12345678901234567)::numeric::money;
+
+ -- Cast from money
+ SELECT '12345678901234567'::money::numeric;
+ SELECT '-12345678901234567'::money::numeric;
end;
$$;
+
+-- Check parameter handling
+BEGIN;
+DROP TABLE IF EXISTS testcase_13;
+CREATE TABLE testcase_13 (patient_id integer);
+INSERT INTO testcase_13 VALUES (1);
+DO $$
+DECLARE
+ r RECORD;
+BEGIN
+FOR r IN SELECT * FROM testcase_13 LOOP
+ RAISE INFO 'r.patient_id=%', r.patient_id;
+ IF (SELECT EXISTS (
+ SELECT FROM testcase_13 WHERE patient_id = r.patient_id
+ ))
+ THEN
+ RAISE INFO 'condition true';
+ END IF;
+ END LOOP;
+END $$;
+ROLLBACK;
++
+ -- Test use of plpgsql in a domain check constraint (cf. bug #14414)
+
+ create function plpgsql_domain_check(val int) returns boolean as $$
+ begin return val > 0; end
+ $$ language plpgsql immutable;
+
+ create domain plpgsql_domain as integer check(plpgsql_domain_check(value));
+
+ do $$
+ declare v_test plpgsql_domain;
+ begin
+ v_test := 1;
+ end;
+ $$;
+
+ do $$
+ declare v_test plpgsql_domain := 1;
+ begin
+ v_test := 0; -- fail
+ end;
+ $$;
+
+ -- Test handling of expanded array passed to a domain constraint (bug #14472)
+
+ create function plpgsql_arr_domain_check(val int[]) returns boolean as $$
+ begin return val[1] > 0; end
+ $$ language plpgsql immutable;
+
+ create domain plpgsql_arr_domain as int[] check(plpgsql_arr_domain_check(value));
+
+ do $$
+ declare v_test plpgsql_arr_domain;
+ begin
+ v_test := array[1];
+ v_test := v_test || 2;
+ end;
+ $$;
+
+ do $$
+ declare v_test plpgsql_arr_domain := array[1];
+ begin
+ v_test := 0 || v_test; -- fail
+ end;
+ $$;
+
+ --
+ -- test usage of transition tables in AFTER triggers
+ --
+
+ CREATE TABLE transition_table_base (id int PRIMARY KEY, val text);
+
+ CREATE FUNCTION transition_table_base_ins_func()
+ RETURNS trigger
+ LANGUAGE plpgsql
+ AS $$
+ DECLARE
+ t text;
+ l text;
+ BEGIN
+ t = '';
+ FOR l IN EXECUTE
+ $q$
+ EXPLAIN (TIMING off, COSTS off, VERBOSE on)
+ SELECT * FROM newtable
+ $q$ LOOP
+ t = t || l || E'\n';
+ END LOOP;
+
+ RAISE INFO '%', t;
+ RETURN new;
+ END;
+ $$;
+
+ CREATE TRIGGER transition_table_base_ins_trig
+ AFTER INSERT ON transition_table_base
+ REFERENCING OLD TABLE AS oldtable NEW TABLE AS newtable
+ FOR EACH STATEMENT
+ EXECUTE PROCEDURE transition_table_base_ins_func();
+
+ CREATE TRIGGER transition_table_base_ins_trig
+ AFTER INSERT ON transition_table_base
+ REFERENCING NEW TABLE AS newtable
+ FOR EACH STATEMENT
+ EXECUTE PROCEDURE transition_table_base_ins_func();
+
+ INSERT INTO transition_table_base VALUES (1, 'One'), (2, 'Two');
+ INSERT INTO transition_table_base VALUES (3, 'Three'), (4, 'Four');
+
+ CREATE OR REPLACE FUNCTION transition_table_base_upd_func()
+ RETURNS trigger
+ LANGUAGE plpgsql
+ AS $$
+ DECLARE
+ t text;
+ l text;
+ BEGIN
+ t = '';
+ FOR l IN EXECUTE
+ $q$
+ EXPLAIN (TIMING off, COSTS off, VERBOSE on)
+ SELECT * FROM oldtable ot FULL JOIN newtable nt USING (id)
+ $q$ LOOP
+ t = t || l || E'\n';
+ END LOOP;
+
+ RAISE INFO '%', t;
+ RETURN new;
+ END;
+ $$;
+
+ CREATE TRIGGER transition_table_base_upd_trig
+ AFTER UPDATE ON transition_table_base
+ REFERENCING OLD TABLE AS oldtable NEW TABLE AS newtable
+ FOR EACH STATEMENT
+ EXECUTE PROCEDURE transition_table_base_upd_func();
+
+ UPDATE transition_table_base
+ SET val = '*' || val || '*'
+ WHERE id BETWEEN 2 AND 3;
+
+ CREATE TABLE transition_table_level1
+ (
+ level1_no serial NOT NULL ,
+ level1_node_name varchar(255),
+ PRIMARY KEY (level1_no)
+ ) WITHOUT OIDS;
+
+ CREATE TABLE transition_table_level2
+ (
+ level2_no serial NOT NULL ,
+ parent_no int NOT NULL,
+ level1_node_name varchar(255),
+ PRIMARY KEY (level2_no)
+ ) WITHOUT OIDS;
+
+ CREATE TABLE transition_table_status
+ (
+ level int NOT NULL,
+ node_no int NOT NULL,
+ status int,
+ PRIMARY KEY (level, node_no)
+ ) WITHOUT OIDS;
+
+ CREATE FUNCTION transition_table_level1_ri_parent_del_func()
+ RETURNS TRIGGER
+ LANGUAGE plpgsql
+ AS $$
+ DECLARE n bigint;
+ BEGIN
+ PERFORM FROM p JOIN transition_table_level2 c ON c.parent_no = p.level1_no;
+ IF FOUND THEN
+ RAISE EXCEPTION 'RI error';
+ END IF;
+ RETURN NULL;
+ END;
+ $$;
+
+ CREATE TRIGGER transition_table_level1_ri_parent_del_trigger
+ AFTER DELETE ON transition_table_level1
+ REFERENCING OLD TABLE AS p
+ FOR EACH STATEMENT EXECUTE PROCEDURE
+ transition_table_level1_ri_parent_del_func();
+
+ CREATE FUNCTION transition_table_level1_ri_parent_upd_func()
+ RETURNS TRIGGER
+ LANGUAGE plpgsql
+ AS $$
+ DECLARE
+ x int;
+ BEGIN
+ WITH p AS (SELECT level1_no, sum(delta) cnt
+ FROM (SELECT level1_no, 1 AS delta FROM i
+ UNION ALL
+ SELECT level1_no, -1 AS delta FROM d) w
+ GROUP BY level1_no
+ HAVING sum(delta) < 0)
+ SELECT level1_no
+ FROM p JOIN transition_table_level2 c ON c.parent_no = p.level1_no
+ INTO x;
+ IF FOUND THEN
+ RAISE EXCEPTION 'RI error';
+ END IF;
+ RETURN NULL;
+ END;
+ $$;
+
+ CREATE TRIGGER transition_table_level1_ri_parent_upd_trigger
+ AFTER UPDATE ON transition_table_level1
+ REFERENCING OLD TABLE AS d NEW TABLE AS i
+ FOR EACH STATEMENT EXECUTE PROCEDURE
+ transition_table_level1_ri_parent_upd_func();
+
+ CREATE FUNCTION transition_table_level2_ri_child_insupd_func()
+ RETURNS TRIGGER
+ LANGUAGE plpgsql
+ AS $$
+ BEGIN
+ PERFORM FROM i
+ LEFT JOIN transition_table_level1 p
+ ON p.level1_no IS NOT NULL AND p.level1_no = i.parent_no
+ WHERE p.level1_no IS NULL;
+ IF FOUND THEN
+ RAISE EXCEPTION 'RI error';
+ END IF;
+ RETURN NULL;
+ END;
+ $$;
+
+ CREATE TRIGGER transition_table_level2_ri_child_insupd_trigger
+ AFTER INSERT OR UPDATE ON transition_table_level2
+ REFERENCING NEW TABLE AS i
+ FOR EACH STATEMENT EXECUTE PROCEDURE
+ transition_table_level2_ri_child_insupd_func();
+
+ -- create initial test data
+ INSERT INTO transition_table_level1 (level1_no)
+ SELECT generate_series(1,200);
+ ANALYZE transition_table_level1;
+
+ INSERT INTO transition_table_level2 (level2_no, parent_no)
+ SELECT level2_no, level2_no / 50 + 1 AS parent_no
+ FROM generate_series(1,9999) level2_no;
+ ANALYZE transition_table_level2;
+
+ INSERT INTO transition_table_status (level, node_no, status)
+ SELECT 1, level1_no, 0 FROM transition_table_level1;
+
+ INSERT INTO transition_table_status (level, node_no, status)
+ SELECT 2, level2_no, 0 FROM transition_table_level2;
+ ANALYZE transition_table_status;
+
+ INSERT INTO transition_table_level1(level1_no)
+ SELECT generate_series(201,1000);
+ ANALYZE transition_table_level1;
+
+ -- behave reasonably if someone tries to modify a transition table
+ CREATE FUNCTION transition_table_level2_bad_usage_func()
+ RETURNS TRIGGER
+ LANGUAGE plpgsql
+ AS $$
+ BEGIN
+ INSERT INTO d VALUES (1000000, 1000000, 'x');
+ RETURN NULL;
+ END;
+ $$;
+
+ CREATE TRIGGER transition_table_level2_bad_usage_trigger
+ AFTER DELETE ON transition_table_level2
+ REFERENCING OLD TABLE AS d
+ FOR EACH STATEMENT EXECUTE PROCEDURE
+ transition_table_level2_bad_usage_func();
+
+ DELETE FROM transition_table_level2
+ WHERE level2_no BETWEEN 301 AND 305;
+
+ DROP TRIGGER transition_table_level2_bad_usage_trigger
+ ON transition_table_level2;
+
+ -- attempt modifications which would break RI (should all fail)
+ DELETE FROM transition_table_level1
+ WHERE level1_no = 25;
+
+ UPDATE transition_table_level1 SET level1_no = -1
+ WHERE level1_no = 30;
+
+ INSERT INTO transition_table_level2 (level2_no, parent_no)
+ VALUES (10000, 10000);
+
+ UPDATE transition_table_level2 SET parent_no = 2000
+ WHERE level2_no = 40;
+
+
+ -- attempt modifications which would not break RI (should all succeed)
+ DELETE FROM transition_table_level1
+ WHERE level1_no BETWEEN 201 AND 1000;
+
+ DELETE FROM transition_table_level1
+ WHERE level1_no BETWEEN 100000000 AND 100000010;
+
+ SELECT count(*) FROM transition_table_level1;
+
+ DELETE FROM transition_table_level2
+ WHERE level2_no BETWEEN 211 AND 220;
+
+ SELECT count(*) FROM transition_table_level2;
+
+ CREATE TABLE alter_table_under_transition_tables
+ (
+ id int PRIMARY KEY,
+ name text
+ );
+
+ CREATE FUNCTION alter_table_under_transition_tables_upd_func()
+ RETURNS TRIGGER
+ LANGUAGE plpgsql
+ AS $$
+ BEGIN
+ RAISE WARNING 'old table = %, new table = %',
+ (SELECT string_agg(id || '=' || name, ',') FROM d),
+ (SELECT string_agg(id || '=' || name, ',') FROM i);
+ RAISE NOTICE 'one = %', (SELECT 1 FROM alter_table_under_transition_tables LIMIT 1);
+ RETURN NULL;
+ END;
+ $$;
+
+ -- should fail, TRUNCATE is not compatible with transition tables
+ CREATE TRIGGER alter_table_under_transition_tables_upd_trigger
+ AFTER TRUNCATE OR UPDATE ON alter_table_under_transition_tables
+ REFERENCING OLD TABLE AS d NEW TABLE AS i
+ FOR EACH STATEMENT EXECUTE PROCEDURE
+ alter_table_under_transition_tables_upd_func();
+
+ -- should work
+ CREATE TRIGGER alter_table_under_transition_tables_upd_trigger
+ AFTER UPDATE ON alter_table_under_transition_tables
+ REFERENCING OLD TABLE AS d NEW TABLE AS i
+ FOR EACH STATEMENT EXECUTE PROCEDURE
+ alter_table_under_transition_tables_upd_func();
+
+ INSERT INTO alter_table_under_transition_tables
+ VALUES (1, '1'), (2, '2'), (3, '3');
+ UPDATE alter_table_under_transition_tables
+ SET name = name || name;
+
+ -- now change 'name' to an integer to see what happens...
+ ALTER TABLE alter_table_under_transition_tables
+ ALTER COLUMN name TYPE int USING name::integer;
+ UPDATE alter_table_under_transition_tables
+ SET name = (name::text || name::text)::integer;
+
+ -- now drop column 'name'
+ ALTER TABLE alter_table_under_transition_tables
+ DROP column name;
+ UPDATE alter_table_under_transition_tables
+ SET id = id;
COPY atest2 FROM stdin; -- ok
bar true
\.
-SELECT * FROM atest1; -- ok
+SELECT * FROM atest1 ORDER BY 1; -- ok
+ -- test leaky-function protections in selfuncs
+
+ -- regress_user1 will own a table and provide a view for it.
+ SET SESSION AUTHORIZATION regress_user1;
+
+ CREATE TABLE atest12 as
+ SELECT x AS a, 10001 - x AS b FROM generate_series(1,10000) x;
+ CREATE INDEX ON atest12 (a);
+ CREATE INDEX ON atest12 (abs(a));
+ VACUUM ANALYZE atest12;
+
+ CREATE FUNCTION leak(integer,integer) RETURNS boolean
+ AS $$begin return $1 < $2; end$$
+ LANGUAGE plpgsql immutable;
+ CREATE OPERATOR <<< (procedure = leak, leftarg = integer, rightarg = integer,
+ restrict = scalarltsel);
+
+ -- view with leaky operator
+ CREATE VIEW atest12v AS
+ SELECT * FROM atest12 WHERE b <<< 5;
+ GRANT SELECT ON atest12v TO PUBLIC;
+
+ -- This plan should use nestloop, knowing that few rows will be selected.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b;
+
+ -- And this one.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12 x, atest12 y
+ WHERE x.a = y.b and abs(y.a) <<< 5;
+
+ -- Check if regress_user2 can break security.
+ SET SESSION AUTHORIZATION regress_user2;
+
+ CREATE FUNCTION leak2(integer,integer) RETURNS boolean
+ AS $$begin raise notice 'leak % %', $1, $2; return $1 > $2; end$$
+ LANGUAGE plpgsql immutable;
+ CREATE OPERATOR >>> (procedure = leak2, leftarg = integer, rightarg = integer,
+ restrict = scalargtsel);
+
+ -- This should not show any "leak" notices before failing.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12 WHERE a >>> 0;
+
+ -- This plan should use hashjoin, as it will expect many rows to be selected.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b;
+
+ -- Now regress_user1 grants sufficient access to regress_user2.
+ SET SESSION AUTHORIZATION regress_user1;
+ GRANT SELECT (a, b) ON atest12 TO PUBLIC;
+ SET SESSION AUTHORIZATION regress_user2;
+
+ -- Now regress_user2 will also get a good row estimate.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b;
+
+ -- But not for this, due to lack of table-wide permissions needed
+ -- to make use of the expression index's statistics.
+ EXPLAIN (COSTS OFF) SELECT * FROM atest12 x, atest12 y
+ WHERE x.a = y.b and abs(y.a) <<< 5;
+
+ -- clean up (regress_user1's objects are all dropped later)
+ DROP FUNCTION leak2(integer, integer) CASCADE;
+
+
-- groups
SET SESSION AUTHORIZATION regress_user3;
shoename char(10), -- primary key
sh_avail integer, -- available # of pairs
slcolor char(10), -- preferred shoelace color
- slminlen float, -- miminum shoelace length
+ slminlen float, -- minimum shoelace length
slmaxlen float, -- maximum shoelace length
slunit char(8) -- length unit
-);
+) distribute by roundrobin;
CREATE TABLE shoelace_data (
sl_name char(10), -- primary key
-- test the views defined in CREATE_VIEWS
--
-SELECT * FROM street;
+SELECT * FROM street ORDER BY name,cname,thepath::text;
- SELECT name, #thepath FROM iexit ORDER BY 1, 2;
+ SELECT name, #thepath FROM iexit ORDER BY name COLLATE "C", 2;
SELECT * FROM toyemp WHERE name = 'sharon';
---
--- test creation of SERIAL column
---
-
+SET sequence_range = 1;
- CREATE TABLE serialTest (f1 text, f2 serial);
+ CREATE TABLE serialTest1 (f1 text, f2 serial);
- INSERT INTO serialTest VALUES ('foo');
- INSERT INTO serialTest VALUES ('bar');
- INSERT INTO serialTest VALUES ('force', 100);
- INSERT INTO serialTest VALUES ('wrong', NULL);
+ INSERT INTO serialTest1 VALUES ('foo');
+ INSERT INTO serialTest1 VALUES ('bar');
+ INSERT INTO serialTest1 VALUES ('force', 100);
+ INSERT INTO serialTest1 VALUES ('wrong', NULL);
- SELECT * FROM serialTest ORDER BY f1, f2;
-SELECT * FROM serialTest1;
++SELECT * FROM serialTest1 ORDER BY f1, f2;
-- test smallserial / bigserial
CREATE TABLE serialTest2 (f1 text, f2 serial, f3 smallserial, f4 serial2,
DROP SEQUENCE foo_seq_new;
-- renaming serial sequences
- ALTER TABLE serialtest_f2_seq RENAME TO serialtest_f2_foo;
- INSERT INTO serialTest VALUES ('more');
- SELECT * FROM serialTest ORDER BY f1, f2;
+ ALTER TABLE serialtest1_f2_seq RENAME TO serialtest1_f2_foo;
+ INSERT INTO serialTest1 VALUES ('more');
-SELECT * FROM serialTest1;
++SELECT * FROM serialTest1 ORDER BY f1, f2;
--
-- Check dependencies of serial and ordinary sequences
DROP USER regress_seq_user;
DROP SEQUENCE seq;
+create table test_seqtab (unique1 int, unique2 int);
+insert into test_seqtab select i, i from generate_series(1,1000) s(i);
+
+create temp sequence testseq;
+select distinct(nextval('testseq'))
+ from test_seqtab order by 1 limit 10;
+drop table test_seqtab;
++
+ -- cache tests
+ CREATE SEQUENCE test_seq1 CACHE 10;
+ SELECT nextval('test_seq1');
+ SELECT nextval('test_seq1');
+ SELECT nextval('test_seq1');
+
+ DROP SEQUENCE test_seq1;
select nextval('ts1');
+SELECT setseed(0);
+
+-- DROP TABLE IF EXISTS asd ;
+
+CREATE TABLE IF NOT EXISTS asd AS
+SELECT clientid::numeric(20),
+ (clientid / 20 )::integer::numeric(20) as userid,
+ cts + ((random()* 3600 *24 )||'sec')::interval as cts,
+ (ARRAY['A','B','C','D','E','F'])[(random()*5+1)::integer] as state,
+ 0 as dim,
+ ((ARRAY['Cat','Dog','Duck'])[(clientid / 10 )% 3 +1 ]) ::text as app_name,
+ ((ARRAY['A','B'])[(clientid / 10 )% 2 +1 ]) ::text as platform
+ FROM generate_series('2016-01-01'::timestamp,'2016-10-01'::timestamp,interval '15 day') cts , generate_series( 1000,2000,10) clientid , generate_series(1,6) t
+;
+
+SELECT dates::timestamp as dates ,B.platform,B.app_name, B.clientid, B.userid,
+ B.state as state
+FROM ( VALUES
+('2016.08.30. 08:52:43') ,('2016.08.29. 04:57:12') ,('2016.08.26. 08:15:05') ,
+('2016.08.24. 11:49:51') ,('2016.08.22. 08:45:29') ,('2016.08.21. 04:53:47') ,('2016.08.20. 08:44:03')
+) AS D (dates)
+JOIN
+( SELECT DISTINCT clientid FROM asd
+ WHERE userid=74 ) C ON True
+INNER JOIN LATERAL (
+ SELECT DISTINCT ON (clientid,app_name,platform,state,dim) x.*
+ FROM asd x
+ INNER JOIN (SELECT p.clientid,p.app_name,p.platform , p.state, p.dim ,
+ MAX(p.cts) AS selected_cts
+ FROM asd p
+ where cts<D.dates::timestamp and state in
+ ('A','B')
+ GROUP BY p.clientid,p.app_name,p.platform,p.state,p.dim) y
+ ON y.clientid = x.clientid
+ AND y.selected_cts = x.cts
+ AND y.platform = x.platform
+ AND y.app_name=x.app_name
+ AND y.state=x.state
+ AND y.dim = x.dim
+ and x.clientid = C.clientid
+) B ON True
+ORDER BY dates desc, state;
+
+DROP TABLE asd;
+SELECT setseed(0);
+ --
+ -- Check that volatile quals aren't pushed down past a set-returning function;
+ -- while a nonvolatile qual can be, if it doesn't reference the SRF.
+ --
+ create function tattle(x int, y int) returns bool
+ volatile language plpgsql as $$
+ begin
+ raise notice 'x = %, y = %', x, y;
+ return x > y;
+ end$$;
+
+ explain (verbose, costs off)
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+
+ -- if we pretend it's stable, we get different results:
+ alter function tattle(x int, y int) stable;
+
+ explain (verbose, costs off)
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, 8);
+
+ -- although even a stable qual should not be pushed down if it references SRF
+ explain (verbose, costs off)
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, u);
+
+ select * from
+ (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss
+ where tattle(x, u);
+
+ drop function tattle(x int, y int);
UPDATE update_test SET a=v.i FROM (VALUES(100, 20)) AS v(i, j)
WHERE update_test.b = v.j;
-SELECT * FROM update_test;
+SELECT * FROM update_test ORDER BY a, b, c;
+ -- fail, wrong data type:
+ UPDATE update_test SET a = v.* FROM (VALUES(100, 20)) AS v(i, j)
+ WHERE update_test.b = v.j;
+
--
-- Test multiple-set-clause syntax
--
UNION ALL
SELECT n || ' bar' FROM t WHERE length(n) < 20
)
-SELECT n, n IS OF (text) AS is_text FROM t;
+SELECT n, n IS OF (text) as is_text FROM t ORDER BY n;
+ -- In a perfect world, this would work and resolve the literal as int ...
+ -- but for now, we have to be content with resolving to text too soon.
+ WITH RECURSIVE t(n) AS (
+ SELECT '7'
+ UNION ALL
+ SELECT n+1 FROM t WHERE n < 10
+ )
+ SELECT n, n IS OF (int) AS is_int FROM t;
+
--
-- Some examples with a tree
--