adminpack \
auth_delay \
auto_explain \
+ bdr \
btree_gin \
btree_gist \
chkpass \
--- /dev/null
+# contrib/bdr/Makefile
+
+subdir = contrib/bdr
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+
+%: all
+
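+# The real work happens in two sub-makefiles: output.mk and worker.mk
+# (presumably the output plugin and the worker library, going by the names).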
+all:
+ $(MAKE) -f $(top_srcdir)/contrib/bdr/output.mk $(MAKECMDGOALS)
+ $(MAKE) -f $(top_srcdir)/contrib/bdr/worker.mk $(MAKECMDGOALS)
+
+# phony target...
+check: all
+
+.PHONY: all
--- /dev/null
+--\echo Use "CREATE EXTENSION bdr" to load this file. \quit
+
+--CREATE ROLE bdr NOLOGIN SUPERUSER;
+--SET ROLE bdr;
+
+CREATE SCHEMA bdr;
+GRANT USAGE ON SCHEMA bdr TO public;
+
+SET LOCAL search_path = bdr;
+
+CREATE FUNCTION pg_stat_get_bdr(
+ OUT rep_node_id oid,
+ OUT rilocalid oid,
+ OUT riremoteid text,
+ OUT nr_commit int8,
+ OUT nr_rollback int8,
+ OUT nr_insert int8,
+ OUT nr_insert_conflict int8,
+ OUT nr_update int8,
+ OUT nr_update_conflict int8,
+ OUT nr_delete int8,
+ OUT nr_delete_conflict int8,
+ OUT nr_disconnect int8
+)
+RETURNS SETOF record
+LANGUAGE C
+AS 'MODULE_PATHNAME';
+
+REVOKE ALL ON FUNCTION pg_stat_get_bdr() FROM PUBLIC;
+
+CREATE VIEW pg_stat_bdr AS SELECT * FROM pg_stat_get_bdr();
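+
+-- Example usage (the backing C function only allows superusers):
+--   SELECT * FROM bdr.pg_stat_bdr;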
+
+CREATE TABLE bdr_sequence_values
+(
+ owning_sysid text NOT NULL,
+ owning_tlid oid NOT NULL,
+ owning_dboid oid NOT NULL,
+ owning_riname text NOT NULL,
+
+ seqschema text NOT NULL,
+ seqname text NOT NULL,
+ seqrange int8range NOT NULL,
+
+ -- could not acquire chunk
+ failed bool NOT NULL DEFAULT false,
+
+ -- voting successful
+ confirmed bool NOT NULL,
+
+ -- empty, not referenced
+ emptied bool NOT NULL CHECK(NOT emptied OR confirmed),
+
+ -- used in sequence
+ in_use bool NOT NULL CHECK(NOT in_use OR confirmed),
+
+ EXCLUDE USING gist(seqschema WITH =, seqname WITH =, seqrange WITH &&) WHERE (confirmed),
+ PRIMARY KEY(owning_sysid, owning_tlid, owning_dboid, owning_riname, seqschema, seqname, seqrange)
+);
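+-- Note: the EXCLUDE constraint above relies on btree_gist (listed in the
+-- extension's "requires") for GiST equality support on the text columns.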
+SELECT pg_catalog.pg_extension_config_dump('bdr_sequence_values', '');
+
+REVOKE ALL ON TABLE bdr_sequence_values FROM PUBLIC;
+
+CREATE INDEX bdr_sequence_values_chunks ON bdr_sequence_values(seqschema, seqname, seqrange);
+CREATE INDEX bdr_sequence_values_newchunk ON bdr_sequence_values(seqschema, seqname, upper(seqrange));
+
+CREATE TABLE bdr_sequence_elections
+(
+ owning_sysid text NOT NULL,
+ owning_tlid oid NOT NULL,
+ owning_dboid oid NOT NULL,
+ owning_riname text NOT NULL,
+ owning_election_id bigint NOT NULL,
+
+ seqschema text NOT NULL,
+ seqname text NOT NULL,
+ seqrange int8range NOT NULL,
+
+ /* XXX id */
+
+ vote_type text NOT NULL,
+
+ open bool NOT NULL,
+ success bool NOT NULL DEFAULT false,
+
+ PRIMARY KEY(owning_sysid, owning_tlid, owning_dboid, owning_riname, seqschema, seqname, seqrange)
+);
+SELECT pg_catalog.pg_extension_config_dump('bdr_sequence_elections', '');
+REVOKE ALL ON TABLE bdr_sequence_elections FROM PUBLIC;
+
+
+CREATE TABLE bdr_votes
+(
+ vote_sysid text NOT NULL,
+ vote_tlid oid NOT NULL,
+ vote_dboid oid NOT NULL,
+ vote_riname text NOT NULL,
+ vote_election_id bigint NOT NULL,
+
+ voter_sysid text NOT NULL,
+ voter_tlid oid NOT NULL,
+ voter_dboid bigint NOT NULL,
+ voter_riname text NOT NULL,
+
+ vote bool NOT NULL,
+ reason text CHECK (reason IS NULL OR vote = false),
+ UNIQUE(vote_sysid, vote_tlid, vote_dboid, vote_riname, vote_election_id, voter_sysid, voter_tlid, voter_dboid, voter_riname)
+);
+SELECT pg_catalog.pg_extension_config_dump('bdr_votes', '');
+REVOKE ALL ON TABLE bdr_votes FROM PUBLIC;
+
+CREATE OR REPLACE FUNCTION bdr_sequence_alloc(INTERNAL)
+ RETURNS INTERNAL
+ LANGUAGE C
+ STABLE STRICT
+AS 'MODULE_PATHNAME'
+;
+
+CREATE OR REPLACE FUNCTION bdr_sequence_setval(INTERNAL)
+ RETURNS INTERNAL
+ LANGUAGE C
+ STABLE STRICT
+AS 'MODULE_PATHNAME'
+;
+
+CREATE OR REPLACE FUNCTION bdr_sequence_options(INTERNAL)
+ RETURNS INTERNAL
+ LANGUAGE C
+ STABLE STRICT
+AS 'MODULE_PATHNAME'
+;
+
+-- not tracked yet, can we trick pg_depend instead?
+DELETE FROM pg_seqam WHERE seqamname = 'bdr';
+
+INSERT INTO pg_seqam(
+ seqamname,
+ seqamalloc,
+ seqamsetval,
+ seqamoptions
+)
+VALUES (
+ 'bdr',
+ 'bdr_sequence_alloc',
+ 'bdr_sequence_setval',
+ 'bdr_sequence_options'
+);
+
+RESET search_path;
--- /dev/null
+/* -------------------------------------------------------------------------
+ *
+ * bdr.c
+ * Bi-directional replication: GUC and background worker registration,
+ * plus the main loops of the apply and sequencer background workers.
+ *
+ * Copyright (C) 2012-2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/bdr/bdr.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "bdr.h"
+
+/* These are always necessary for a bgworker */
+#include "miscadmin.h"
+#include "postmaster/bgworker.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+
+/* these headers are used by this particular worker's code */
+#include "pgstat.h"
+
+#include "access/committs.h"
+#include "access/xact.h"
+#include "catalog/pg_index.h"
+#include "lib/stringinfo.h"
+#include "replication/replication_identifier.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/timestamp.h"
+
+/* sequencer */
+#include "commands/extension.h"
+
+/* apply */
+#include "libpq-fe.h"
+
+#define MAXCONNINFO 1024
+
+static bool got_sigterm = false;
+ResourceOwner bdr_saved_resowner;
+static char *connections = NULL;
+static char *bdr_synchronous_commit = NULL;
+
+BDRWorkerCon *bdr_apply_con = NULL;
+BDRSequencerCon *bdr_sequencer_con = NULL;
+
+PG_MODULE_MAGIC;
+
+void _PG_init(void);
+
+/*
+ * Converts an int64 to network byte order.
+ */
+static void
+sendint64(int64 i, char *buf)
+{
+ uint32 n32;
+
+ /* High order half first, since we're doing MSB-first */
+ n32 = (uint32) (i >> 32);
+ n32 = htonl(n32);
+ memcpy(&buf[0], &n32, 4);
+
+ /* Now the low order half */
+ n32 = (uint32) i;
+ n32 = htonl(n32);
+ memcpy(&buf[4], &n32, 4);
+}
+
+/*
+ * Converts an int64 from network byte order to native format.
+ *
+ * FIXME: replace with pq_getmsgint64
+ */
+static int64
+recvint64(char *buf)
+{
+ int64 result;
+ uint32 h32;
+ uint32 l32;
+
+ memcpy(&h32, buf, 4);
+ memcpy(&l32, buf + 4, 4);
+ h32 = ntohl(h32);
+ l32 = ntohl(l32);
+
+ result = h32;
+ result <<= 32;
+ result |= l32;
+
+ return result;
+}
+
+/*
+ * Send a Standby Status Update message to server.
+ */
+static bool
+sendFeedback(PGconn *conn, XLogRecPtr blockpos, int64 now, bool replyRequested,
+ bool force)
+{
+ char replybuf[1 + 8 + 8 + 8 + 8 + 1];
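+ /* replybuf layout: 'r', then write/flush/apply LSNs, sendTime, replyRequested flag */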
+ int len = 0;
+ static XLogRecPtr lastpos = InvalidXLogRecPtr;
+
+ Assert(blockpos != InvalidXLogRecPtr);
+
+ if (!force && (blockpos <= lastpos))
+ return true;
+
+ if (blockpos < lastpos)
+ blockpos = lastpos;
+
+ replybuf[len] = 'r';
+ len += 1;
+ sendint64(blockpos, &replybuf[len]); /* write */
+ len += 8;
+ sendint64(blockpos, &replybuf[len]); /* flush */
+ len += 8;
+ sendint64(blockpos, &replybuf[len]); /* apply */
+ len += 8;
+ sendint64(now, &replybuf[len]); /* sendTime */
+ len += 8;
+ replybuf[len] = replyRequested ? 1 : 0; /* replyRequested */
+ len += 1;
+
+ elog(LOG, "sending feedback (force %d, reply requested %d) to %X/%X",
+ force, replyRequested,
+ (uint32) (blockpos >> 32), (uint32) blockpos);
+
+ lastpos = blockpos;
+
+ if (PQputCopyData(conn, replybuf, len) <= 0 || PQflush(conn))
+ {
+ elog(ERROR, "could not send feedback packet: %s",
+ PQerrorMessage(conn));
+ return false;
+ }
+
+ return true;
+}
+
+static void
+bdr_sigterm(SIGNAL_ARGS)
+{
+ int save_errno = errno;
+
+ got_sigterm = true;
+ if (MyProc)
+ SetLatch(&MyProc->procLatch);
+
+ errno = save_errno;
+}
+
+static void
+bdr_sighup(SIGNAL_ARGS)
+{
+ elog(LOG, "got sighup!");
+ if (MyProc)
+ SetLatch(&MyProc->procLatch);
+}
+
+static void
+process_remote_action(char *data, size_t r)
+{
+ char action;
+
+ action = data[0];
+ data += 1;
+ r--;
+
+ switch (action)
+ {
+ /* BEGIN */
+ case 'B':
+ process_remote_begin(data, r);
+ break;
+ /* COMMIT */
+ case 'C':
+ process_remote_commit(data, r);
+ break;
+ /* INSERT */
+ case 'I':
+ process_remote_insert(data, r);
+ break;
+ /* UPDATE */
+ case 'U':
+ process_remote_update(data, r);
+ break;
+ /* DELETE */
+ case 'D':
+ process_remote_delete(data, r);
+ break;
+ default:
+ elog(ERROR, "unknown action of type %c", action);
+ }
+}
+
+static void
+bdr_apply_main(Datum main_arg)
+{
+ PGconn *streamConn;
+ PGresult *res;
+ int fd;
+ char *remote_sysid;
+ uint64 remote_sysid_i;
+ char *remote_tlid;
+ TimeLineID remote_tlid_i;
+
+#ifdef NOT_USED
+ char *remote_dbname;
+#endif
+ char *remote_dboid;
+ Oid remote_dboid_i;
+ char local_sysid[32];
+ char remote_ident[256];
+ char query[256];
+ char conninfo_repl[MAXCONNINFO + 75];
+ XLogRecPtr last_received = InvalidXLogRecPtr;
+ char *sqlstate;
+ NameData replication_name;
+ RepNodeId replication_identifier;
+ XLogRecPtr start_from;
+ NameData slot_name;
+
+ bdr_apply_con = (BDRWorkerCon *) DatumGetPointer(main_arg);
+
+ NameStr(replication_name)[0] = '\0';
+
+ /* Establish signal handlers before unblocking signals. */
+ pqsignal(SIGHUP, bdr_sighup);
+ pqsignal(SIGTERM, bdr_sigterm);
+
+ /* We're now ready to receive signals */
+ BackgroundWorkerUnblockSignals();
+
+ /* Connect to our database */
+ BackgroundWorkerInitializeConnection(bdr_apply_con->dbname, NULL);
+
+ /* always work in our own schema */
+ SetConfigOption("search_path", "bdr, pg_catalog",
+ PGC_BACKEND, PGC_S_OVERRIDE);
+
+ /* setup synchronous commit according to the user's wishes */
+ if (bdr_synchronous_commit != NULL)
+ SetConfigOption("synchronous_commit", bdr_synchronous_commit,
+ PGC_BACKEND, PGC_S_OVERRIDE); /* other context? */
+
+ CurrentResourceOwner = ResourceOwnerCreate(NULL, "bdr apply top-level resource owner");
+ bdr_saved_resowner = CurrentResourceOwner;
+
+ snprintf(conninfo_repl, sizeof(conninfo_repl),
+ "%s replication=database fallback_application_name=bdr",
+ bdr_apply_con->dsn);
+
+ elog(LOG, "%s initialized on %s, remote %s",
+ MyBgworkerEntry->bgw_name, bdr_apply_con->dbname, conninfo_repl);
+
+ streamConn = PQconnectdb(conninfo_repl);
+ if (PQstatus(streamConn) != CONNECTION_OK)
+ {
+ ereport(FATAL,
+ (errmsg("could not connect to the primary server: %s",
+ PQerrorMessage(streamConn))));
+ }
+
+
+ res = PQexec(streamConn, "IDENTIFY_SYSTEM");
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ {
+ elog(FATAL, "could not send replication command \"%s\": %s",
+ "IDENTIFY_SYSTEM", PQerrorMessage(streamConn));
+ }
+ if (PQntuples(res) != 1 || PQnfields(res) != 5)
+ {
+ elog(FATAL, "could not identify system: got %d rows and %d fields, expected %d rows and %d fields\n",
+ PQntuples(res), PQnfields(res), 1, 5);
+ }
+
+ remote_sysid = PQgetvalue(res, 0, 0);
+ remote_tlid = PQgetvalue(res, 0, 1);
+#ifdef NOT_USED
+ remote_dbname = PQgetvalue(res, 0, 3);
+#endif
+ remote_dboid = PQgetvalue(res, 0, 4);
+
+ if (sscanf(remote_sysid, UINT64_FORMAT, &remote_sysid_i) != 1)
+ elog(ERROR, "could not parse remote sysid %s", remote_sysid);
+
+ if (sscanf(remote_tlid, "%u", &remote_tlid_i) != 1)
+ elog(ERROR, "could not parse remote tlid %s", remote_tlid);
+
+ if (sscanf(remote_dboid, "%u", &remote_dboid_i) != 1)
+ elog(ERROR, "could not parse remote database OID %s", remote_dboid);
+
+ snprintf(local_sysid, sizeof(local_sysid), UINT64_FORMAT,
+ GetSystemIdentifier());
+
+ if (strcmp(remote_sysid, local_sysid) == 0)
+ {
+ ereport(FATAL,
+ (errmsg("system identifiers must differ between the nodes"),
+ errdetail("Both system identifiers are %s.", remote_sysid)));
+ }
+ else
+ elog(LOG, "local sysid %s, remote: %s",
+ local_sysid, remote_sysid);
+
+ /*
+ * build slot name.
+ *
+ * FIXME: This might truncate the identifier if replication_name is
+ * somewhat longer...
+ */
+ snprintf(NameStr(slot_name), NAMEDATALEN, "bdr_%u_%s_%u_%u__%s",
+ remote_dboid_i, local_sysid, ThisTimeLineID,
+ MyDatabaseId, NameStr(replication_name));
+ NameStr(slot_name)[NAMEDATALEN - 1] = '\0';
+
+ /*
+ * Build replication identifier.
+ */
+ snprintf(remote_ident, sizeof(remote_ident), "bdr_"UINT64_FORMAT"_%u_%u_%u_%s",
+ remote_sysid_i, remote_tlid_i, remote_dboid_i, MyDatabaseId, NameStr(replication_name));
+
+ StartTransactionCommand();
+
+ replication_identifier = GetReplicationIdentifier(remote_ident, true);
+
+ CommitTransactionCommand();
+
+ /* no parts of IDENTIFY_SYSTEM's response needed anymore */
+ PQclear(res);
+
+ if (OidIsValid(replication_identifier))
+ elog(LOG, "found valid replication identifier %u", replication_identifier);
+ /* create local replication identifier and a remote slot */
+ else
+ {
+ elog(LOG, "lookup failed, create new identifier");
+ /* doing this really safely would require 2pc... */
+ StartTransactionCommand();
+
+ /* we want the new identifier on stable storage immediately */
+ ForceSyncCommit();
+
+ /* acquire remote decoding slot */
+ snprintf(query, sizeof(query), "CREATE_REPLICATION_SLOT \"%s\" LOGICAL %s",
+ NameStr(slot_name), "bdr_output");
+ res = PQexec(streamConn, query);
+
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ {
+ elog(FATAL, "could not send replication command \"%s\": status %s: %s\n",
+ query, PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+ }
+ PQclear(res);
+
+ /* acquire new local identifier, but don't commit */
+ replication_identifier = CreateReplicationIdentifier(remote_ident);
+
+ /* now commit local identifier */
+ CommitTransactionCommand();
+ CurrentResourceOwner = bdr_saved_resowner;
+ elog(LOG, "created replication identifier %u", replication_identifier);
+
+ /*
+ * FIXME: set current replication progress from upstream IFF it has not
+ * been set yet. Another worker might have cloned from upstream and set
+ * the progress for all nodes.
+ */
+ }
+
+ bdr_apply_con->origin_id = replication_identifier;
+ bdr_apply_con->sysid = remote_sysid_i;
+ bdr_apply_con->timeline = remote_tlid_i;
+
+ /* initialize stat subsystem, our id won't change further */
+ bdr_count_set_current_node(replication_identifier);
+
+ /*
+ * tell replication_identifier.c about our identifier so it can cache the
+ * search in shared memory.
+ */
+ SetupCachedReplicationIdentifier(replication_identifier);
+
+ /*
+ * Check whether we already replayed something so we don't replay it
+ * multiple times.
+ */
+ start_from = RemoteCommitFromCachedReplicationIdentifier();
+
+ elog(LOG, "starting up replication at %u from %X/%X",
+ replication_identifier,
+ (uint32) (start_from >> 32), (uint32) start_from);
+
+ snprintf(query, sizeof(query), "START_REPLICATION SLOT \"%s\" LOGICAL %X/%X",
+ NameStr(slot_name), (uint32) (start_from >> 32), (uint32) start_from);
+ res = PQexec(streamConn, query);
+
+ sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
+
+ if (PQresultStatus(res) != PGRES_COPY_BOTH)
+ {
+ elog(FATAL, "could not send replication command \"%s\": %s\n, sqlstate: %s",
+ query, PQresultErrorMessage(res), sqlstate);
+ }
+ PQclear(res);
+
+ fd = PQsocket(streamConn);
+
+ replication_origin_id = replication_identifier;
+
+ while (!got_sigterm)
+ {
+ /* int ret; */
+ int rc;
+ int r;
+ char *copybuf = NULL;
+
+ /*
+ * Background workers mustn't call usleep() or any direct equivalent:
+ * instead, they may wait on their process latch, which sleeps as
+ * necessary, but is awakened if postmaster dies. That way the
+ * background process goes away immediately in an emergency.
+ */
+ rc = WaitLatchOrSocket(&MyProc->procLatch,
+ WL_SOCKET_READABLE | WL_LATCH_SET |
+ WL_TIMEOUT | WL_POSTMASTER_DEATH,
+ fd, 1000L);
+
+ ResetLatch(&MyProc->procLatch);
+
+ /* emergency bailout if postmaster has died */
+ if (rc & WL_POSTMASTER_DEATH)
+ proc_exit(1);
+
+ if (PQstatus(streamConn) == CONNECTION_BAD)
+ {
+ bdr_count_disconnect();
+ elog(ERROR, "connection to other side has died");
+ }
+
+ if (rc & WL_SOCKET_READABLE)
+ PQconsumeInput(streamConn);
+
+ for (;;)
+ {
+ if (got_sigterm)
+ break;
+
+ r = PQgetCopyData(streamConn, &copybuf, 1);
+
+ if (r == -1)
+ {
+ elog(LOG, "data stream ended");
+ return;
+ }
+ else if (r == -2)
+ {
+ elog(ERROR, "could not read COPY data: %s",
+ PQerrorMessage(streamConn));
+ }
+ else if (r < 0)
+ elog(ERROR, "invalid COPY status %d", r);
+ else if (r == 0)
+ {
+ /* need to wait for new data */
+ break;
+ }
+ else
+ {
+ if (copybuf[0] == 'w')
+ {
+ int hdr_len = 0;
+ char *data;
+ XLogRecPtr start_lsn;
+ XLogRecPtr end_lsn;
+
+ hdr_len = 1; /* msgtype 'w' */
+
+ start_lsn = recvint64(&copybuf[hdr_len]);
+
+ hdr_len += 8; /* dataStart */
+
+ end_lsn = recvint64(&copybuf[hdr_len]);
+
+ hdr_len += 8; /* walEnd */
+ hdr_len += 8; /* sendTime */
+
+ if (last_received < start_lsn)
+ last_received = start_lsn;
+
+ if (last_received < end_lsn)
+ last_received = end_lsn;
+
+ data = copybuf + hdr_len;
+
+ process_remote_action(data, r);
+ }
+ else if (copybuf[0] == 'k')
+ {
+ XLogRecPtr temp;
+
+ temp = recvint64(&copybuf[1]);
+
+ sendFeedback(streamConn, temp,
+ GetCurrentTimestamp(), false, true);
+ }
+ /* other message types are purposefully ignored */
+ }
+ }
+
+ /* confirm all writes at once */
+ /*
+ * FIXME: we should only do that after an xlog flush... Yuck.
+ */
+ if (last_received != InvalidXLogRecPtr)
+ sendFeedback(streamConn, last_received,
+ GetCurrentTimestamp(), false, false);
+ }
+
+ proc_exit(0);
+}
+
+static void
+bdr_sequencer_main(Datum main_arg)
+{
+ int rc;
+
+ bdr_sequencer_con = (BDRSequencerCon *) DatumGetPointer(main_arg);
+
+ /* Establish signal handlers before unblocking signals. */
+ pqsignal(SIGHUP, bdr_sighup);
+ pqsignal(SIGTERM, bdr_sigterm);
+
+ /* We're now ready to receive signals */
+ BackgroundWorkerUnblockSignals();
+
+ /* Connect to our database */
+ BackgroundWorkerInitializeConnection(bdr_sequencer_con->dbname, NULL);
+
+ /* always work in our own schema */
+ SetConfigOption("search_path", "bdr, pg_catalog",
+ PGC_BACKEND, PGC_S_OVERRIDE);
+
+ CurrentResourceOwner = ResourceOwnerCreate(NULL, "bdr seq top-level resource owner");
+ bdr_saved_resowner = CurrentResourceOwner;
+
+ elog(WARNING, "starting sequencer on db \"%s\"", bdr_sequencer_con->dbname);
+
+ /* make sure BDR extension exists */
+ bdr_sequencer_init();
+
+ while (!got_sigterm)
+ {
+ /*
+ * Background workers mustn't call usleep() or any direct equivalent:
+ * instead, they may wait on their process latch, which sleeps as
+ * necessary, but is awakened if postmaster dies. That way the
+ * background process goes away immediately in an emergency.
+ */
+ rc = WaitLatch(&MyProc->procLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+ 10000L);
+
+ ResetLatch(&MyProc->procLatch);
+
+ /* emergency bailout if postmaster has died */
+ if (rc & WL_POSTMASTER_DEATH)
+ proc_exit(1);
+
+ /* check whether we need to vote */
+ bdr_sequencer_vote();
+
+ /* check whether any of our elections needs to be tallied */
+ bdr_sequencer_tally();
+
+ /* check all bdr sequences for used up chunks */
+ bdr_sequencer_fill_sequences();
+
+ /* check whether we need to start new elections */
+ bdr_sequencer_start_elections();
+ pgstat_report_activity(STATE_IDLE, NULL);
+ }
+
+ proc_exit(0);
+}
+
+/*
+ * Entrypoint of this module.
+ */
+void
+_PG_init(void)
+{
+ BackgroundWorker apply_worker;
+ BackgroundWorker sequencer_worker;
+ List *cons;
+ ListCell *c;
+ MemoryContext old_context;
+ Size nregistered = 0;
+
+ char **used_databases;
+ Size num_used_databases = 0;
+
+ size_t off;
+ bool found;
+
+ if (!process_shared_preload_libraries_in_progress)
+ elog(ERROR, "bdr can only be loaded via shared_preload_libraries");
+
+ if (!commit_ts_enabled)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("bdr requires \"track_commit_timestamp\" to be enabled")));
+
+ /* guc's et al need to survive this */
+ old_context = MemoryContextSwitchTo(TopMemoryContext);
+
+ DefineCustomStringVariable("bdr.connections",
+ "List of connections",
+ NULL,
+ &connections,
+ NULL, PGC_POSTMASTER,
+ GUC_LIST_INPUT | GUC_LIST_QUOTE,
+ NULL, NULL, NULL);
+
+ /* XXX: make it changeable at SIGHUP? */
+ DefineCustomStringVariable("bdr.synchronous_commit",
+ "bdr specific synchronous commit value",
+ NULL,
+ &bdr_synchronous_commit,
+ NULL, PGC_POSTMASTER,
+ 0,
+ NULL, NULL, NULL);
+
+ /* if nothing is configured, we're done */
+ if (connections == NULL)
+ goto out;
+
+ if (!SplitIdentifierString(connections, ',', &cons))
+ {
+ /* syntax error in list */
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid list syntax for \"bdr.connections\"")));
+ }
+
+ used_databases = malloc(sizeof(char *) * list_length(cons));
+
+ /* Common apply worker values */
+ apply_worker.bgw_flags = BGWORKER_SHMEM_ACCESS |
+ BGWORKER_BACKEND_DATABASE_CONNECTION;
+ apply_worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
+ apply_worker.bgw_main = bdr_apply_main;
+ apply_worker.bgw_restart_time = 5;
+
+ /* Common sequence worker values */
+ sequencer_worker.bgw_flags = BGWORKER_SHMEM_ACCESS |
+ BGWORKER_BACKEND_DATABASE_CONNECTION;
+ sequencer_worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
+ sequencer_worker.bgw_main = bdr_sequencer_main;
+ sequencer_worker.bgw_restart_time = 5;
+
+ foreach(c, cons)
+ {
+ const char *name = (char *) lfirst(c);
+ char *errmsg = NULL;
+ PQconninfoOption *options;
+ PQconninfoOption *cur_option;
+
+ /* don't free, referenced by the guc machinery! */
+ char *optname_dsn = palloc(strlen(name) + 30);
+ char *optname_delay = palloc(strlen(name) + 30);
+ BDRWorkerCon *con;
+
+ found = false;
+
+ con = palloc(sizeof(BDRWorkerCon));
+ con->dsn = (char *) lfirst(c);
+ con->name = pstrdup(name);
+ con->apply_delay = 0;
+
+ sprintf(optname_dsn, "bdr.%s.dsn", name);
+ DefineCustomStringVariable(optname_dsn,
+ optname_dsn,
+ NULL,
+ &con->dsn,
+ NULL, PGC_POSTMASTER,
+ GUC_NOT_IN_SAMPLE,
+ NULL, NULL, NULL);
+
+ sprintf(optname_delay, "bdr.%s.apply_delay", name);
+ DefineCustomIntVariable(optname_delay,
+ optname_delay,
+ NULL,
+ &con->apply_delay,
+ 0, 0, INT_MAX,
+ PGC_SIGHUP,
+ GUC_UNIT_MS,
+ NULL, NULL, NULL);
+
+ if (!con->dsn)
+ {
+ elog(WARNING, "no connection information for %s", name);
+ continue;
+ }
+
+ elog(LOG, "bgworkers, connection: %s", con->dsn);
+
+ options = PQconninfoParse(con->dsn, &errmsg);
+ if (errmsg != NULL)
+ {
+ char *str = pstrdup(errmsg);
+
+ PQfreemem(errmsg);
+ elog(ERROR, "msg: %s", str);
+ }
+
+ cur_option = options;
+ while (cur_option->keyword != NULL)
+ {
+ if (strcmp(cur_option->keyword, "dbname") == 0)
+ {
+ if (cur_option->val == NULL)
+ elog(ERROR, "no dbname set");
+
+ con->dbname = pstrdup(cur_option->val);
+ }
+
+ if (cur_option->val != NULL)
+ {
+ elog(LOG, "option: %s, val: %s",
+ cur_option->keyword, cur_option->val);
+ }
+ cur_option++;
+ }
+
+
+ snprintf(apply_worker.bgw_name, BGW_MAXLEN,
+ "bdr apply: %s", name);
+ apply_worker.bgw_main_arg = PointerGetDatum(con);
+
+ RegisterBackgroundWorker(&apply_worker);
+ nregistered++;
+
+ /* keep track of the databases used */
+ /* check whether we already have a connection in this db */
+ for (off = 0; off < num_used_databases; off++)
+ {
+ if (strcmp(con->dbname , used_databases[off]) == 0)
+ {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ {
+ used_databases[num_used_databases++] = pstrdup(con->dbname);
+ }
+
+ /* cleanup */
+ PQconninfoFree(options);
+ }
+
+ Assert(num_used_databases <= nregistered);
+
+ /*
+ * start sequence coordination process if necessary. One process per
+ * database, *not* one per configured connection.
+ */
+ for (off = 0; off < num_used_databases; off++)
+ {
+ const char *name = used_databases[off];
+ BDRSequencerCon *con;
+
+ con = palloc(sizeof(BDRSequencerCon));
+ con->dbname = pstrdup(name);
+ con->num_nodes = nregistered;
+ con->slot = off;
+
+ elog(LOG, "starting seq on %s", name);
+
+ snprintf(sequencer_worker.bgw_name, BGW_MAXLEN,
+ "bdr sequencer: %s", name);
+ sequencer_worker.bgw_main_arg = PointerGetDatum(con);
+
+ RegisterBackgroundWorker(&sequencer_worker);
+
+ }
+
+ EmitWarningsOnPlaceholders("bdr");
+out:
+
+ /*
+ * initialize other modules that need shared memory
+ *
+ * Do so even if we haven't any remote nodes setup, the shared memory
+ * might still be needed for some sql callable functions or such.
+ */
+
+ /* register a slot for every remote node */
+ bdr_count_shmem_init(nregistered);
+ bdr_sequencer_shmem_init(nregistered, num_used_databases);
+
+ MemoryContextSwitchTo(old_context);
+}
--- /dev/null
+shared_preload_libraries = 'bdr'
+wal_level = logical
+max_logical_slots = 4
+track_commit_timestamp = on
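+
+# Illustrative only -- the connection name and DSN below are made-up examples.
+# bdr.connections lists connection names; each gets bdr.<name>.dsn and an
+# optional bdr.<name>.apply_delay (ms), as registered by _PG_init() in bdr.c.
+#bdr.connections = 'node2'
+#bdr.node2.dsn = 'host=node2 dbname=postgres'
+#bdr.node2.apply_delay = 0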
--- /dev/null
+# bdr extension
+comment = 'bdr support functions'
+default_version = '0.5'
+module_pathname = '$libdir/bdr'
+relocatable = false
+requires = btree_gist
+schema = pg_catalog
--- /dev/null
+/*
+ * bdr.h
+ *
+ * BiDirectionalReplication
+ *
+ * Copyright (c) 2012-2013, PostgreSQL Global Development Group
+ *
+ * contrib/bdr/bdr.h
+ */
+#ifndef BDR_H
+#define BDR_H
+
+#include "replication/logical.h"
+#include "access/xlogdefs.h"
+#include "utils/resowner.h"
+
+
+typedef struct BDRWorkerCon
+{
+ /* name specified in configuration */
+ char *name;
+
+ /* local & remote database name */
+ char *dbname;
+ /* dsn to connect to the remote database */
+ char *dsn;
+ /* how much do we want to delay apply, in ms */
+ int apply_delay;
+
+ RepNodeId origin_id;
+ uint64 sysid;
+ TimeLineID timeline;
+} BDRWorkerCon;
+
+typedef struct BDRSequencerCon
+{
+ /* local database name */
+ char *dbname;
+
+ size_t slot;
+
+ /* yuck */
+ size_t num_nodes;
+} BDRSequencerCon;
+
+extern ResourceOwner bdr_saved_resowner;
+extern BDRWorkerCon *bdr_apply_con;
+extern BDRSequencerCon *bdr_sequencer_con;
+
+/* apply support */
+extern void process_remote_begin(char *data, size_t r);
+extern void process_remote_commit(char *data, size_t r);
+extern void process_remote_insert(char *data, size_t r);
+extern void process_remote_update(char *data, size_t r);
+extern void process_remote_delete(char *data, size_t r);
+
+/* sequence support */
+extern void bdr_sequencer_shmem_init(int nnodes, int sequencers);
+extern void bdr_sequencer_init(void);
+extern void bdr_sequencer_vote(void);
+extern void bdr_sequencer_tally(void);
+extern void bdr_sequencer_start_elections(void);
+extern void bdr_sequencer_fill_sequences(void);
+
+extern void bdr_sequencer_wakeup(void);
+
+extern void bdr_sequence_alloc(PG_FUNCTION_ARGS);
+extern void bdr_sequence_setval(PG_FUNCTION_ARGS);
+extern Datum bdr_sequence_options(PG_FUNCTION_ARGS);
+
+/* statistic functions */
+extern void bdr_count_shmem_init(size_t nnodes);
+extern void bdr_count_set_current_node(RepNodeId node_id);
+extern void bdr_count_commit(void);
+extern void bdr_count_rollback(void);
+extern void bdr_count_insert(void);
+extern void bdr_count_insert_conflict(void);
+extern void bdr_count_update(void);
+extern void bdr_count_update_conflict(void);
+extern void bdr_count_delete(void);
+extern void bdr_count_delete_conflict(void);
+extern void bdr_count_disconnect(void);
+
+
+#endif /* BDR_H */
--- /dev/null
+/* -------------------------------------------------------------------------
+ *
+ * bdr_apply.c
+ * Apply-side routines for bi-directional replication: receive decoded
+ * changes from a remote node and apply them, resolving conflicts.
+ *
+ * Copyright (C) 2012-2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/bdr/bdr_apply.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "bdr.h"
+
+#include "pgstat.h"
+
+#include "access/committs.h"
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "access/sysattr.h"
+#include "access/xact.h"
+
+#include "catalog/namespace.h"
+#include "catalog/pg_type.h"
+
+#include "executor/spi.h"
+#include "executor/executor.h"
+
+#include "parser/parse_relation.h"
+
+#include "replication/logical.h"
+#include "replication/replication_identifier.h"
+
+#include "storage/bufmgr.h"
+
+#include "utils/builtins.h"
+#include "utils/datetime.h"
+#include "utils/lsyscache.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+#include "utils/tqual.h"
+
+static void build_scan_key(ScanKey skey, Relation rel, Relation idx_rel, HeapTuple key);
+static bool find_pkey_tuple(ScanKey skey, Relation rel, Relation idx_rel, ItemPointer tid, bool lock);
+static void UserTableUpdateIndexes(Relation rel, HeapTuple tuple);
+static char *read_tuple(char *data, size_t len, HeapTuple tuple, Oid *reloid);
+static void tuple_to_stringinfo(StringInfo s, TupleDesc tupdesc, HeapTuple tuple);
+
+static void check_sequencer_wakeup(Relation rel);
+
+bool request_sequencer_wakeup = false;
+
+void
+process_remote_begin(char *data, size_t r)
+{
+ XLogRecPtr *origlsn;
+ TimestampTz *committime;
+ TimestampTz current;
+ char statbuf[100];
+ Assert(bdr_apply_con != NULL);
+
+ origlsn = (XLogRecPtr *) data;
+ data += sizeof(XLogRecPtr);
+
+ committime = (TimestampTz *) data;
+ data += sizeof(TimestampTz);
+
+ /* setup state for commit and conflict detection */
+ replication_origin_lsn = *origlsn;
+ replication_origin_timestamp = *committime;
+
+ snprintf(statbuf, sizeof(statbuf),
+ "bdr_apply: BEGIN origin(source, orig_lsn, timestamp): %s, %X/%X, %s",
+ bdr_apply_con->name,
+ (uint32) (*origlsn >> 32), (uint32) *origlsn,
+ timestamptz_to_str(*committime));
+
+ elog(LOG, "%s", statbuf);
+
+ pgstat_report_activity(STATE_RUNNING, statbuf);
+
+ /* don't want the overhead otherwise */
+ if (bdr_apply_con->apply_delay > 0)
+ {
+ current = GetCurrentTimestamp();
+#ifndef HAVE_INT64_TIMESTAMP
+#error "we require integer timestamps"
+#endif
+ /* ensure no weirdness due to clock drift */
+ if (current > replication_origin_timestamp)
+ {
+ long sec;
+ int usec;
+
+ current = TimestampTzPlusMilliseconds(current, -bdr_apply_con->apply_delay);
+
+ TimestampDifference(current, replication_origin_timestamp,
+ &sec, &usec);
+ /* FIXME: deal with overflow? */
+ pg_usleep(usec + (sec * USECS_PER_SEC));
+ }
+ }
+
+ request_sequencer_wakeup = false;
+
+ StartTransactionCommand();
+ PushActiveSnapshot(GetTransactionSnapshot());
+}
+
+void
+process_remote_commit(char *data, size_t r)
+{
+ XLogRecPtr *origlsn;
+ XLogRecPtr *end_lsn;
+ TimestampTz *committime;
+
+ origlsn = (XLogRecPtr *) data;
+ data += sizeof(XLogRecPtr);
+
+ end_lsn = (XLogRecPtr *) data;
+ data += sizeof(XLogRecPtr);
+
+ committime = (TimestampTz *) data;
+ data += sizeof(TimestampTz);
+
+ elog(LOG, "COMMIT origin(lsn, end, timestamp): %X/%X, %X/%X, %s",
+ (uint32) (*origlsn >> 32), (uint32) *origlsn,
+ (uint32) (*end_lsn >> 32), (uint32) *end_lsn,
+ timestamptz_to_str(*committime));
+
+ Assert(*origlsn == replication_origin_lsn);
+ Assert(*committime == replication_origin_timestamp);
+
+ PopActiveSnapshot();
+ CommitTransactionCommand();
+
+ pgstat_report_activity(STATE_IDLE, NULL);
+
+ AdvanceCachedReplicationIdentifier(*end_lsn, XactLastCommitEnd);
+
+ CurrentResourceOwner = bdr_saved_resowner;
+
+ bdr_count_commit();
+
+ if (request_sequencer_wakeup)
+ {
+ request_sequencer_wakeup = false;
+ bdr_sequencer_wakeup();
+ }
+}
+
+
+void
+process_remote_insert(char *data, size_t r)
+{
+#ifdef VERBOSE_INSERT
+ StringInfoData s;
+#endif
+ char action;
+ HeapTupleData tup;
+ Oid reloid;
+ Relation rel;
+
+ action = data[0];
+ data++;
+
+ if (action != 'N')
+ elog(ERROR, "expected new tuple but got %c",
+ action);
+
+ data = read_tuple(data, r, &tup, &reloid);
+
+ rel = heap_open(reloid, RowExclusiveLock);
+
+ if (rel->rd_rel->relkind != RELKIND_RELATION)
+ elog(ERROR, "unexpected relkind '%c' rel \"%s\"",
+ rel->rd_rel->relkind, RelationGetRelationName(rel));
+
+ simple_heap_insert(rel, &tup);
+ UserTableUpdateIndexes(rel, &tup);
+ bdr_count_insert();
+
+ check_sequencer_wakeup(rel);
+
+ /* debug output */
+#if VERBOSE_INSERT
+ initStringInfo(&s);
+ tuple_to_stringinfo(&s, RelationGetDescr(rel), &tup);
+ elog(LOG, "INSERT: %s", s.data);
+ resetStringInfo(&s);
+#endif
+
+ heap_close(rel, NoLock);
+}
+
+static void
+fetch_sysid_via_node_id(RepNodeId node_id, uint64 *sysid, TimeLineID *tli)
+{
+ if (node_id == InvalidRepNodeId)
+ {
+ *sysid = GetSystemIdentifier();
+ *tli = ThisTimeLineID;
+ }
+ else
+ {
+ HeapTuple node;
+ Form_pg_replication_identifier node_class;
+ char *ident;
+
+ uint64 remote_sysid;
+ Oid remote_dboid;
+ TimeLineID remote_tli;
+ Oid local_dboid;
+ NameData replication_name;
+
+ node = GetReplicationInfoByIdentifier(node_id, false);
+
+ node_class = (Form_pg_replication_identifier) GETSTRUCT(node);
+
+ ident = text_to_cstring(&node_class->riname);
+
+ if (sscanf(ident, "bdr: "UINT64_FORMAT"-%u-%u-%u:%s",
+ &remote_sysid, &remote_tli, &remote_dboid, &local_dboid, NameStr(replication_name)) != 4)
+ elog(ERROR, "could not parse sysid: %s", ident);
+ ReleaseSysCache(node);
+ pfree(ident);
+ }
+}
+
+void
+process_remote_update(char *data, size_t r)
+{
+ StringInfoData s_key;
+ char action;
+ HeapTupleData old_key;
+ HeapTupleData new_tuple;
+ Oid reloid;
+ Oid idxoid;
+ Oid keyoid = InvalidOid;
+ HeapTuple generated_key = NULL;
+ ItemPointerData oldtid;
+ Relation rel;
+ Relation idxrel;
+ bool found_old;
+ ScanKeyData skey[INDEX_MAX_KEYS];
+ bool primary_key_changed = false;
+
+ action = data[0];
+ data++;
+
+ /* old key present, identifying key changed */
+ if (action == 'K')
+ {
+ data = read_tuple(data, r, &old_key, &keyoid);
+ action = data[0];
+ data++;
+ primary_key_changed = true;
+ }
+ else if (action != 'N')
+ elog(ERROR, "expected action 'N' or 'K', got %c",
+ action);
+
+ /* check for new tuple */
+ if (action != 'N')
+ elog(ERROR, "expected action 'N', got %c",
+ action);
+
+ /* read new tuple */
+ data = read_tuple(data, r, &new_tuple, &reloid);
+
+ /* collected all data, lookup table definition */
+ rel = heap_open(reloid, RowExclusiveLock);
+
+ if (rel->rd_rel->relkind != RELKIND_RELATION)
+ elog(ERROR, "unexpected relkind '%c' rel \"%s\"",
+ rel->rd_rel->relkind, RelationGetRelationName(rel));
+
+ /*
+ * Check which tuple we want to use for the pkey lookup.
+ */
+ if (primary_key_changed)
+ {
+ if (reloid != keyoid)
+ elog(ERROR, "mismatching key, tuple oids: %u %u", reloid, keyoid);
+ }
+ else
+ {
+ /* key hasn't changed, just use columns from the new tuple */
+ old_key = new_tuple;
+ }
+
+ /* lookup index to build scankey */
+ if (rel->rd_indexvalid == 0)
+ RelationGetIndexList(rel);
+ idxoid = rel->rd_replidindex;
+ if (!OidIsValid(idxoid))
+ {
+ elog(ERROR, "could not find primary key for table with oid %u",
+ RelationGetRelid(rel));
+ return;
+ }
+
+ /* open index, so we can build scan key for row */
+ idxrel = index_open(idxoid, RowExclusiveLock);
+
+ Assert(idxrel->rd_index->indisunique);
+
+ build_scan_key(skey, rel, idxrel, &old_key);
+
+ /* look for tuple identified by the (old) primary key */
+ found_old = find_pkey_tuple(skey, rel, idxrel, &oldtid, true);
+
+ if (found_old)
+ {
+ HeapTupleData oldtuple;
+ Buffer buf;
+ bool found;
+ TransactionId xmin;
+ TimestampTz ts;
+ RepNodeId local_node_id;
+ bool apply_update;
+ bool log_update;
+
+ uint64 local_sysid,
+ remote_sysid;
+ TimeLineID local_tli,
+ remote_tli;
+ CommitExtraData local_node_id_raw;
+
+ ItemPointerCopy(&oldtid, &oldtuple.t_self);
+
+ /* refetch tuple, check for old commit ts & origin */
+ found = heap_fetch(rel, SnapshotAny, &oldtuple, &buf, false, NULL);
+ if (!found)
+ elog(ERROR, "could not refetch tuple %u/%u, relation %u",
+ ItemPointerGetBlockNumber(&oldtid),
+ ItemPointerGetOffsetNumber(&oldtid),
+ RelationGetRelid(rel));
+ xmin = HeapTupleHeaderGetXmin(oldtuple.t_data);
+ ReleaseBuffer(buf);
+
+ /*
+ * We now need to determine whether to keep the original version of the
+ * row, or apply the update we received. We use the last-update-wins
+ * strategy for this, except when the new update comes from the same
+ * node that originated the previous version of the tuple.
+ */
+ TransactionIdGetCommitTsData(xmin, &ts, &local_node_id_raw);
+ local_node_id = local_node_id_raw;
+
+ if (local_node_id == bdr_apply_con->origin_id)
+ {
+ /*
+ * If the row got updated twice within a single node, just apply
+ * the update with no conflict. Don't warn/log either, regardless
+ * of the timing; that's just too common and valid since normal row
+ * level locking guarantees are met.
+ */
+ apply_update = true;
+ log_update = false;
+ }
+ else
+ {
+ int cmp;
+
+ /*
+ * Decide what update wins based on transaction timestamp difference.
+ * The later transaction wins. If the timestamps compare equal,
+ * use sysid + TLI to discern.
+ */
+
+ cmp = timestamptz_cmp_internal(replication_origin_timestamp, ts);
+
+ if (cmp > 0)
+ {
+ apply_update = true;
+ log_update = false;
+ }
+ else if (cmp == 0)
+ {
+ log_update = true;
+
+ fetch_sysid_via_node_id(local_node_id,
+ &local_sysid, &local_tli);
+ fetch_sysid_via_node_id(bdr_apply_con->origin_id,
+ &remote_sysid, &remote_tli);
+
+ if (local_sysid < remote_sysid)
+ apply_update = true;
+ else if (local_sysid > remote_sysid)
+ apply_update = false;
+ else if (local_tli < remote_tli)
+ apply_update = true;
+ else if (local_tli > remote_tli)
+ apply_update = false;
+ else
+ /* shouldn't happen */
+ elog(ERROR, "unsuccessful node comparison");
+ }
+ else
+ {
+ apply_update = false;
+ log_update = true;
+ }
+ }
+
+ if (log_update)
+ {
+ char remote_ts[MAXDATELEN + 1];
+ char local_ts[MAXDATELEN + 1];
+
+ fetch_sysid_via_node_id(local_node_id,
+ &local_sysid, &local_tli);
+ fetch_sysid_via_node_id(bdr_apply_con->origin_id,
+ &remote_sysid, &remote_tli);
+ Assert(remote_sysid == bdr_apply_con->sysid);
+ Assert(remote_tli == bdr_apply_con->timeline);
+
+ memcpy(remote_ts, timestamptz_to_str(replication_origin_timestamp),
+ MAXDATELEN);
+ memcpy(local_ts, timestamptz_to_str(ts),
+ MAXDATELEN);
+
+ initStringInfo(&s_key);
+ tuple_to_stringinfo(&s_key, RelationGetDescr(idxrel), &old_key);
+
+ ereport(LOG,
+ (errcode(ERRCODE_INTEGRITY_CONSTRAINT_VIOLATION),
+ errmsg("CONFLICT: %s remote update originating at node " UINT64_FORMAT ":%u at ts %s; row was previously updated at %s node " UINT64_FORMAT ":%u at ts %s. PKEY:%s",
+ apply_update ? "applying" : "skipping",
+ remote_sysid, remote_tli, remote_ts,
+ local_node_id == InvalidRepNodeId ? "local" : "remote",
+ local_sysid, local_tli, local_ts, s_key.data)));
+ resetStringInfo(&s_key);
+ }
+
+ if (apply_update)
+ {
+ simple_heap_update(rel, &oldtid, &new_tuple);
+ /* FIXME: HOT support */
+ UserTableUpdateIndexes(rel, &new_tuple);
+ bdr_count_update();
+ }
+ else
+ bdr_count_update_conflict();
+ }
+ else
+ {
+ initStringInfo(&s_key);
+ tuple_to_stringinfo(&s_key, RelationGetDescr(idxrel), &old_key);
+ bdr_count_update_conflict();
+
+ ereport(ERROR,
+ (errcode(ERRCODE_INTEGRITY_CONSTRAINT_VIOLATION),
+ errmsg("CONFLICT: could not find existing tuple for pkey %s", s_key.data)));
+ /* XXX dead code */
+ resetStringInfo(&s_key);
+ goto err;
+ }
+
+err:
+ if (!primary_key_changed && generated_key != NULL)
+ heap_freetuple(generated_key);
+
+ check_sequencer_wakeup(rel);
+
+ /* release locks upon commit */
+ index_close(idxrel, NoLock);
+ heap_close(rel, NoLock);
+}
+
+void
+process_remote_delete(char *data, size_t r)
+{
+#ifdef VERBOSE_DELETE
+ StringInfoData s;
+#endif
+ char action;
+ Oid reloid;
+ Oid idxoid;
+ HeapTupleData old_key;
+ Relation rel;
+ Relation idxrel;
+ ScanKeyData skey[INDEX_MAX_KEYS];
+ bool found_old;
+ ItemPointerData oldtid;
+
+ action = data[0];
+ data++;
+
+ if (action == 'E')
+ {
+ elog(WARNING, "got delete without pkey");
+ return;
+ }
+ else if (action != 'K')
+ elog(ERROR, "expected action K got %c", action);
+
+ data = read_tuple(data, r, &old_key, &reloid);
+
+ /* collected all data, lookup table definition */
+ rel = heap_open(reloid, RowExclusiveLock);
+
+ /* lookup index to build scankey */
+ if (rel->rd_indexvalid == 0)
+ RelationGetIndexList(rel);
+ idxoid = rel->rd_replidindex;
+ if (!OidIsValid(idxoid))
+ {
+ elog(ERROR, "could not find primary key for table with oid %u",
+ RelationGetRelid(rel));
+ return;
+ }
+
+ /* Now open the primary key index */
+ idxrel = index_open(idxoid, RowExclusiveLock);
+
+ if (rel->rd_rel->relkind != RELKIND_RELATION)
+ elog(ERROR, "unexpected relkind '%c' rel \"%s\"",
+ rel->rd_rel->relkind, RelationGetRelationName(rel));
+
+ build_scan_key(skey, rel, idxrel, &old_key);
+
+ /* try to find tuple via a (candidate|primary) key */
+ found_old = find_pkey_tuple(skey, rel, idxrel, &oldtid, true);
+
+ if (found_old)
+ {
+ simple_heap_delete(rel, &oldtid);
+ bdr_count_delete();
+
+ }
+ else
+ {
+ StringInfoData s_key;
+
+ bdr_count_delete_conflict();
+
+ initStringInfo(&s_key);
+ tuple_to_stringinfo(&s_key, RelationGetDescr(idxrel), &old_key);
+
+ ereport(ERROR,
+ (errcode(ERRCODE_INTEGRITY_CONSTRAINT_VIOLATION),
+ errmsg("CONFLICT: DELETE could not find existing tuple for pkey %s", s_key.data)));
+ resetStringInfo(&s_key);
+ }
+
+#if VERBOSE_DELETE
+ initStringInfo(&s);
+ tuple_to_stringinfo(&s, RelationGetDescr(idxrel), &old_key);
+ elog(LOG, "DELETE old-key: %s", s.data);
+ resetStringInfo(&s);
+#endif
+
+ check_sequencer_wakeup(rel);
+
+ index_close(idxrel, NoLock);
+ heap_close(rel, NoLock);
+}
+
+static void
+check_sequencer_wakeup(Relation rel)
+{
+ if (strcmp(RelationGetRelationName(rel), "bdr_sequence_values") == 0 ||
+ strcmp(RelationGetRelationName(rel), "bdr_sequence_elections") == 0 ||
+ strcmp(RelationGetRelationName(rel), "bdr_votes") == 0)
+ request_sequencer_wakeup = true;
+}
+
+/*
+ * Converts an int64 from network byte order to native format.
+ *
+ * FIXME: replace with pq_getmsgint64
+ */
+static int64
+recvint64(char *buf)
+{
+ int64 result;
+ uint32 h32;
+ uint32 l32;
+
+ memcpy(&h32, buf, 4);
+ memcpy(&l32, buf + 4, 4);
+ h32 = ntohl(h32);
+ l32 = ntohl(l32);
+
+ result = h32;
+ result <<= 32;
+ result |= l32;
+
+ return result;
+}
+
+/*
+ * Read a tuple specification from the given data of the given len, filling
+ * the HeapTuple with it. Also, reloid is set to the OID of the relation
+ * that this tuple is related to. (The passed data contains schema and
+ * relation names; they are resolved to the corresponding local OID.)
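+ *
+ * Wire layout, as parsed below: a 'T' flag byte, an int64 namespace-name
+ * length followed by the name bytes, an int64 relation-name length followed
+ * by the name bytes, and an int64 tuple length followed by the raw tuple.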
+ */
+static char *
+read_tuple(char *data, size_t len, HeapTuple tuple, Oid *reloid)
+{
+ int64 relnamelen;
+ char *relname;
+ Oid relid;
+ int64 nspnamelen;
+ char *nspname;
+ int64 tuplelen;
+ Oid nspoid;
+ char t;
+
+ *reloid = InvalidOid;
+
+ /* FIXME: unaligned data accesses */
+ t = data[0];
+ data += 1;
+ if (t != 'T')
+ elog(ERROR, "expected TUPLE, got %c", t);
+
+ nspnamelen = recvint64(&data[0]);
+ data += 8;
+ nspname = data;
+ data += nspnamelen;
+
+ relnamelen = recvint64(&data[0]);
+ data += 8;
+ relname = data;
+ data += relnamelen;
+
+ tuplelen = recvint64(&data[0]);
+ data += 8;
+
+ tuple->t_data = (HeapTupleHeader) data;
+ tuple->t_len = tuplelen;
+ data += tuplelen;
+
+ /* resolve the names into a relation OID */
+ nspoid = get_namespace_oid(nspname, false);
+ relid = get_relname_relid(relname, nspoid);
+ if (relid == InvalidOid)
+ elog(ERROR, "could not resolve relation name %s.%s", nspname, relname);
+
+ *reloid = relid;
+
+ return data;
+}
+
+/* print the tuple 'tuple' into the StringInfo s */
+static void
+tuple_to_stringinfo(StringInfo s, TupleDesc tupdesc, HeapTuple tuple)
+{
+ int natt;
+ Oid oid;
+
+ /* print oid of tuple, it's not included in the TupleDesc */
+ if ((oid = HeapTupleHeaderGetOid(tuple->t_data)) != InvalidOid)
+ {
+ appendStringInfo(s, " oid[oid]:%u", oid);
+ }
+
+ /* print all columns individually */
+ for (natt = 0; natt < tupdesc->natts; natt++)
+ {
+ Form_pg_attribute attr; /* the attribute itself */
+ Oid typid; /* type of current attribute */
+ HeapTuple type_tuple; /* information about a type */
+ Form_pg_type type_form;
+ Oid typoutput; /* output function */
+ bool typisvarlena;
+ Datum origval; /* possibly toasted Datum */
+ Datum val; /* definitely detoasted Datum */
+ char *outputstr = NULL;
+ bool isnull; /* column is null? */
+
+ attr = tupdesc->attrs[natt];
+
+ /*
+ * don't print dropped columns, we can't be sure everything is
+ * available for them
+ */
+ if (attr->attisdropped)
+ continue;
+
+ /*
+ * Don't print system columns
+ */
+ if (attr->attnum < 0)
+ continue;
+
+ typid = attr->atttypid;
+
+ /* gather type name */
+ type_tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typid));
+ if (!HeapTupleIsValid(type_tuple))
+ elog(ERROR, "cache lookup failed for type %u", typid);
+ type_form = (Form_pg_type) GETSTRUCT(type_tuple);
+
+ /* print attribute name */
+ appendStringInfoChar(s, ' ');
+ appendStringInfoString(s, NameStr(attr->attname));
+
+ /* print attribute type */
+ appendStringInfoChar(s, '[');
+ appendStringInfoString(s, NameStr(type_form->typname));
+ appendStringInfoChar(s, ']');
+
+ /* query output function */
+ getTypeOutputInfo(typid,
+ &typoutput, &typisvarlena);
+
+ ReleaseSysCache(type_tuple);
+
+ /* get Datum from tuple */
+ origval = fastgetattr(tuple, natt + 1, tupdesc, &isnull);
+
+ if (isnull)
+ outputstr = "(null)";
+ else if (typisvarlena && VARATT_IS_EXTERNAL_ONDISK(origval))
+ outputstr = "(unchanged-toast-datum)";
+ else if (typisvarlena)
+ val = PointerGetDatum(PG_DETOAST_DATUM(origval));
+ else
+ val = origval;
+
+ /* print data */
+ if (outputstr == NULL)
+ outputstr = OidOutputFunctionCall(typoutput, val);
+
+ appendStringInfoChar(s, ':');
+ appendStringInfoString(s, outputstr);
+ }
+}
+
+
+/*
+ * The state object used by CatalogOpenIndexes and friends is actually the
+ * same as the executor's ResultRelInfo, but we give it another type name
+ * to decouple callers from that fact.
+ */
+typedef struct ResultRelInfo *UserTableIndexState;
+
+static void
+UserTableUpdateIndexes(Relation rel, HeapTuple tuple)
+{
+ /* this is largely copied together from copy.c's CopyFrom */
+ EState *estate = CreateExecutorState();
+ ResultRelInfo *resultRelInfo;
+ List *recheckIndexes = NIL;
+ TupleDesc tupleDesc = RelationGetDescr(rel);
+
+ resultRelInfo = makeNode(ResultRelInfo);
+ resultRelInfo->ri_RangeTableIndex = 1; /* dummy */
+ resultRelInfo->ri_RelationDesc = rel;
+ resultRelInfo->ri_TrigInstrument = NULL;
+
+ ExecOpenIndices(resultRelInfo);
+
+ estate->es_result_relations = resultRelInfo;
+ estate->es_num_result_relations = 1;
+ estate->es_result_relation_info = resultRelInfo;
+
+ if (resultRelInfo->ri_NumIndices > 0)
+ {
+ TupleTableSlot *slot = ExecInitExtraTupleSlot(estate);
+
+ ExecSetSlotDescriptor(slot, tupleDesc);
+ ExecStoreTuple(tuple, slot, InvalidBuffer, false);
+
+ recheckIndexes = ExecInsertIndexTuples(slot, &tuple->t_self,
+ estate);
+ }
+
+ ExecResetTupleTable(estate->es_tupleTable, false);
+
+ ExecCloseIndices(resultRelInfo);
+
+ FreeExecutorState(estate);
+ /* FIXME: recheck the indexes */
+ list_free(recheckIndexes);
+}
+
+/*
+ * Setup a ScanKey for a search in the relation 'rel' for a tuple 'key' that
+ * is setup to match 'rel' (*NOT* idxrel!).
+ */
+static void
+build_scan_key(ScanKey skey, Relation rel, Relation idxrel, HeapTuple key)
+{
+ int attoff;
+ Datum indclassDatum;
+ Datum indkeyDatum;
+ bool isnull;
+ oidvector *opclass;
+ int2vector *indkey;
+
+ indclassDatum = SysCacheGetAttr(INDEXRELID, idxrel->rd_indextuple,
+ Anum_pg_index_indclass, &isnull);
+ Assert(!isnull);
+ opclass = (oidvector *) DatumGetPointer(indclassDatum);
+
+ indkeyDatum = SysCacheGetAttr(INDEXRELID, idxrel->rd_indextuple,
+ Anum_pg_index_indkey, &isnull);
+ Assert(!isnull);
+ indkey = (int2vector *) DatumGetPointer(indkeyDatum);
+
+
+ for (attoff = 0; attoff < RelationGetNumberOfAttributes(idxrel); attoff++)
+ {
+ Oid operator;
+ Oid opfamily;
+ RegProcedure regop;
+ int pkattno = attoff + 1;
+ int mainattno = indkey->values[attoff];
+ Oid atttype = attnumTypeId(rel, mainattno);
+ Oid optype = get_opclass_input_type(opclass->values[attoff]);
+
+ opfamily = get_opclass_family(opclass->values[attoff]);
+
+ operator = get_opfamily_member(opfamily, optype,
+ optype,
+ BTEqualStrategyNumber);
+
+ if (!OidIsValid(operator))
+ elog(ERROR, "could not lookup equality operator for type %u, optype %u in opfamily %u",
+ atttype, optype, opfamily);
+
+ regop = get_opcode(operator);
+
+ /* FIXME: convert type? */
+ ScanKeyInit(&skey[attoff],
+ pkattno,
+ BTEqualStrategyNumber,
+ regop,
+ fastgetattr(key, mainattno,
+ RelationGetDescr(idxrel), &isnull));
+ if (isnull)
+ elog(ERROR, "index tuple with a null column");
+ }
+}
+
+/*
+ * Search the index 'idxrel' for a tuple identified by 'skey' in 'rel'.
+ *
+ * If a matching tuple is found setup 'tid' to point to it and return true,
+ * false is returned otherwise.
+ */
+static bool
+find_pkey_tuple(ScanKey skey, Relation rel, Relation idxrel,
+ ItemPointer tid, bool lock)
+{
+ HeapTuple tuple;
+ bool found = false;
+ IndexScanDesc scan;
+ Snapshot snap = GetActiveSnapshot();
+
+ /*
+ * XXX: should we use a different snapshot here to be able to get more
+ * information about concurrent activity? For now we use a snapshot
+ * isolation snapshot...
+ */
+
+ scan = index_beginscan(rel, idxrel,
+ snap,
+ RelationGetNumberOfAttributes(idxrel),
+ 0);
+ index_rescan(scan, skey, RelationGetNumberOfAttributes(idxrel), NULL, 0);
+
+ while ((tuple = index_getnext(scan, ForwardScanDirection)) != NULL)
+ {
+ if (found)
+ elog(ERROR, "WTF, more than one tuple found via pk???");
+ found = true;
+ ItemPointerCopy(&tuple->t_self, tid);
+ }
+
+ index_endscan(scan);
+
+ if (lock && found)
+ {
+ Buffer buf;
+ HeapUpdateFailureData hufd;
+ HTSU_Result res;
+ HeapTupleData locktup;
+ ItemPointerCopy(tid, &locktup.t_self);
+
+ res = heap_lock_tuple(rel, &locktup, snap->curcid, LockTupleExclusive,
+ false /* wait */,
+ false /* don't follow updates */,
+ &buf, &hufd);
+ switch (res)
+ {
+ case HeapTupleMayBeUpdated:
+ break;
+ case HeapTupleUpdated:
+ /* XXX: Improve handling here */
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("concurrent update, retrying")));
+ default:
+ elog(ERROR, "unexpected HTSU_Result after locking: %u", res);
+ break;
+ }
+ ReleaseBuffer(buf);
+ }
+ return found;
+}
--- /dev/null
+/* -------------------------------------------------------------------------
+ *
+ * bdr_count.c
+ * Logical replication statistics
+ *
+ * Copyright (C) 2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/bdr/bdr_count.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "bdr.h"
+
+#include "fmgr.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+
+#include "nodes/execnodes.h"
+
+#include "replication/replication_identifier.h"
+
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/spin.h"
+#include "storage/lwlock.h"
+
+#include "utils/builtins.h"
+#include "utils/syscache.h"
+
+/*
+ * Statistics about logical replication
+ *
+ * whenever this struct is changed, bdr_count_version needs to be increased so
+ * on-disk values aren't reused
+ */
+typedef struct BdrCountSlot
+{
+ RepNodeId node_id;
+
+ /* we use int64 so we can export the values to SQL, which has no uint64 */
+ int64 nr_commit;
+ int64 nr_rollback;
+
+ int64 nr_insert;
+ int64 nr_insert_conflict;
+ int64 nr_update;
+ int64 nr_update_conflict;
+ int64 nr_delete;
+ int64 nr_delete_conflict;
+
+ int64 nr_disconnect;
+} BdrCountSlot;
+
+/*
+ * Shared memory header for the stats module.
+ */
+typedef struct BdrCountControl
+{
+ LWLockId lock;
+ BdrCountSlot slots[FLEXIBLE_ARRAY_MEMBER];
+} BdrCountControl;
+
+/*
+ * Header of a stats disk serialization, used to detect old files, changed
+ * parameters and such.
+ */
+typedef struct BdrCountSerialize
+{
+ uint32 magic;
+ uint32 version;
+ uint32 nr_slots;
+} BdrCountSerialize;
+
+/* magic number of the stats file, don't change */
+static const uint32 bdr_count_magic = 0x5e51A7;
+
+/* every time the stored data format changes, increase this */
+static const uint32 bdr_count_version = 2;
+
+/* shortcut for finding the BdrCountControl struct in shared memory */
+static BdrCountControl *BdrCountCtl = NULL;
+
+/* how many nodes have we built shmem for */
+static size_t bdr_count_nnodes = 0;
+
+/* offset in the BdrCountControl->slots "our" backend is in */
+static int MyCountOffsetIdx = -1;
+
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+
+static void bdr_count_shmem_startup(void);
+static void bdr_count_shmem_shutdown(int code, Datum arg);
+static Size bdr_count_shmem_size(void);
+
+static void bdr_count_serialize(void);
+static void bdr_count_unserialize(void);
+
+#define BDR_COUNT_STAT_COLS 12
+
+extern Datum pg_stat_get_bdr(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(pg_stat_get_bdr);
+
+static Size
+bdr_count_shmem_size(void)
+{
+ Size size = 0;
+
+ size = add_size(size, sizeof(BdrCountControl));
+ size = add_size(size, mul_size(bdr_count_nnodes, sizeof(BdrCountSlot)));
+
+ return size;
+}
+
+void
+bdr_count_shmem_init(size_t nnodes)
+{
+ Assert(process_shared_preload_libraries_in_progress);
+
+ bdr_count_nnodes = nnodes;
+
+ RequestAddinShmemSpace(bdr_count_shmem_size());
+ /* lock for slot acquisition */
+ RequestAddinLWLocks(1);
+
+ prev_shmem_startup_hook = shmem_startup_hook;
+ shmem_startup_hook = bdr_count_shmem_startup;
+}
+
+static void
+bdr_count_shmem_startup(void)
+{
+ bool found;
+
+ if (prev_shmem_startup_hook)
+ prev_shmem_startup_hook();
+
+ LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+ BdrCountCtl = ShmemInitStruct("bdr_count",
+ bdr_count_shmem_size(),
+ &found);
+ if (!found)
+ {
+ /* initialize */
+
+ memset(BdrCountCtl, 0, bdr_count_shmem_size());
+ BdrCountCtl->lock = LWLockAssign();
+ bdr_count_unserialize();
+ }
+ LWLockRelease(AddinShmemInitLock);
+
+ /*
+ * If we're in the postmaster (or a standalone backend...), set up a shmem
+ * exit hook to dump the statistics to disk.
+ */
+ if (!IsUnderPostmaster)
+ on_shmem_exit(bdr_count_shmem_shutdown, (Datum) 0);
+}
+
+static void
+bdr_count_shmem_shutdown(int code, Datum arg)
+{
+ /*
+ * To avoid doing the same everywhere, we only write in postmaster itself
+ * (or in a single node postgres)
+ */
+ if (IsUnderPostmaster)
+ return;
+
+ /* persist the file */
+ bdr_count_serialize();
+}
+
+/*
+ * Find a statistics slot for a given RepNodeId and setup a local variable
+ * pointing to it so we can quickly find it for the actual statistics
+ * manipulation.
+ */
+void
+bdr_count_set_current_node(RepNodeId node_id)
+{
+ size_t i;
+
+ MyCountOffsetIdx = -1;
+
+ LWLockAcquire(BdrCountCtl->lock, LW_EXCLUSIVE);
+
+ /* check whether stats already are counted for this node */
+ for (i = 0; i < bdr_count_nnodes; i++)
+ {
+ if (BdrCountCtl->slots[i].node_id == node_id)
+ {
+ MyCountOffsetIdx = i;
+ break;
+ }
+ }
+
+ if (MyCountOffsetIdx != -1)
+ goto out;
+
+ /* ok, get a new slot */
+ for (i = 0; i < bdr_count_nnodes; i++)
+ {
+ if (BdrCountCtl->slots[i].node_id == InvalidRepNodeId)
+ {
+ MyCountOffsetIdx = i;
+ BdrCountCtl->slots[i].node_id = node_id;
+ break;
+ }
+ }
+
+ if (MyCountOffsetIdx == -1)
+ elog(PANIC, "could not find a bdr count slot for %u", node_id);
+out:
+ LWLockRelease(BdrCountCtl->lock);
+}
+
+/*
+ * Statistic manipulation functions.
+ *
+ * We assume we don't have to do any locking for *our* slot since only one
+ * backend will do writing there.
+ */
+void
+bdr_count_commit(void)
+{
+ Assert(MyCountOffsetIdx != -1);
+ BdrCountCtl->slots[MyCountOffsetIdx].nr_commit++;
+}
+
+void
+bdr_count_rollback(void)
+{
+ Assert(MyCountOffsetIdx != -1);
+ BdrCountCtl->slots[MyCountOffsetIdx].nr_rollback++;
+}
+
+void
+bdr_count_insert(void)
+{
+ Assert(MyCountOffsetIdx != -1);
+ BdrCountCtl->slots[MyCountOffsetIdx].nr_insert++;
+}
+
+void
+bdr_count_insert_conflict(void)
+{
+ Assert(MyCountOffsetIdx != -1);
+ BdrCountCtl->slots[MyCountOffsetIdx].nr_insert_conflict++;
+}
+
+void
+bdr_count_update(void)
+{
+ Assert(MyCountOffsetIdx != -1);
+ BdrCountCtl->slots[MyCountOffsetIdx].nr_update++;
+}
+
+void
+bdr_count_update_conflict(void)
+{
+ Assert(MyCountOffsetIdx != -1);
+ BdrCountCtl->slots[MyCountOffsetIdx].nr_update_conflict++;
+}
+
+void
+bdr_count_delete(void)
+{
+ Assert(MyCountOffsetIdx != -1);
+ BdrCountCtl->slots[MyCountOffsetIdx].nr_delete++;
+}
+
+void
+bdr_count_delete_conflict(void)
+{
+ Assert(MyCountOffsetIdx != -1);
+ BdrCountCtl->slots[MyCountOffsetIdx].nr_delete_conflict++;
+}
+
+void
+bdr_count_disconnect(void)
+{
+ Assert(MyCountOffsetIdx != -1);
+ BdrCountCtl->slots[MyCountOffsetIdx].nr_disconnect++;
+}
+
+Datum
+pg_stat_get_bdr(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ TupleDesc tupdesc;
+ Tuplestorestate *tupstore;
+ MemoryContext per_query_ctx;
+ MemoryContext oldcontext;
+ int current_offset;
+
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be superuser to query bdr statistics")));
+
+ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("set-valued function called in context that cannot accept a set")));
+ if (!(rsinfo->allowedModes & SFRM_Materialize))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("materialize mode required, but it is not allowed in this context")));
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ if (tupdesc->natts != BDR_COUNT_STAT_COLS)
+ elog(ERROR, "wrong function definition");
+
+ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+ oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+ tupstore = tuplestore_begin_heap(true, false, work_mem);
+ rsinfo->returnMode = SFRM_Materialize;
+ rsinfo->setResult = tupstore;
+ rsinfo->setDesc = tupdesc;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ /* don't let a node get created/vanish below us */
+ LWLockAcquire(BdrCountCtl->lock, LW_SHARED);
+
+ for (current_offset = 0; current_offset < bdr_count_nnodes;
+ current_offset++)
+ {
+ HeapTuple repTup;
+ Form_pg_replication_identifier repClass;
+ BdrCountSlot *slot;
+ Datum values[BDR_COUNT_STAT_COLS];
+ bool nulls[BDR_COUNT_STAT_COLS];
+
+ slot = &BdrCountCtl->slots[current_offset];
+
+ /* no stats here */
+ if (slot->node_id == InvalidRepNodeId)
+ continue;
+
+ memset(values, 0, sizeof(values));
+ memset(nulls, 0, sizeof(nulls));
+
+ repTup = GetReplicationInfoByIdentifier(slot->node_id, false);
+
+ repClass = (Form_pg_replication_identifier) GETSTRUCT(repTup);
+
+ values[ 0] = ObjectIdGetDatum(slot->node_id);
+ values[ 1] = ObjectIdGetDatum(repClass->riident);
+ values[ 2] = PointerGetDatum(&repClass->riname);
+ values[ 3] = Int64GetDatumFast(slot->nr_commit);
+ values[ 4] = Int64GetDatumFast(slot->nr_rollback);
+ values[ 5] = Int64GetDatumFast(slot->nr_insert);
+ values[ 6] = Int64GetDatumFast(slot->nr_insert_conflict);
+ values[ 7] = Int64GetDatumFast(slot->nr_update);
+ values[ 8] = Int64GetDatumFast(slot->nr_update_conflict);
+ values[ 9] = Int64GetDatumFast(slot->nr_delete);
+ values[10] = Int64GetDatumFast(slot->nr_delete_conflict);
+ values[11] = Int64GetDatumFast(slot->nr_disconnect);
+
+ tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+ ReleaseSysCache(repTup);
+ }
+ LWLockRelease(BdrCountCtl->lock);
+
+ tuplestore_donestoring(tupstore);
+
+ return (Datum) 0;
+}
+
+/*
+ * Write the BDR stats from shared memory to a file
+ */
+static void
+bdr_count_serialize(void)
+{
+ int fd;
+ const char *tpath = "global/bdr.stat.tmp";
+ const char *path = "global/bdr.stat";
+ BdrCountSerialize serial;
+ Size write_size;
+
+ LWLockAcquire(BdrCountCtl->lock, LW_EXCLUSIVE);
+
+ if (unlink(tpath) < 0 && errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not unlink \"%s\": %m", tpath)));
+
+ fd = OpenTransientFile((char *) tpath,
+ O_WRONLY | O_CREAT | O_EXCL | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+ if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open \"%s\": %m", tpath)));
+
+ serial.magic = bdr_count_magic;
+ serial.version = bdr_count_version;
+ serial.nr_slots = bdr_count_nnodes;
+
+ /* write header */
+ write_size = sizeof(serial);
+ if ((write(fd, &serial, write_size)) != write_size)
+ {
+ int save_errno = errno;
+
+ CloseTransientFile(fd);
+ errno = save_errno;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write bdr stat file data \"%s\": %m",
+ tpath)));
+ }
+
+ /* write data */
+ write_size = sizeof(BdrCountSlot) * bdr_count_nnodes;
+ if ((write(fd, &BdrCountCtl->slots, write_size)) != write_size)
+ {
+ int save_errno = errno;
+
+ CloseTransientFile(fd);
+ errno = save_errno;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write bdr stat file data \"%s\": %m",
+ tpath)));
+ }
+
+ CloseTransientFile(fd);
+
+ /* rename into place */
+ if (rename(tpath, path) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rename bdr stat file \"%s\" to \"%s\": %m",
+ tpath, path)));
+ LWLockRelease(BdrCountCtl->lock);
+}
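Editorial note, not part of the patch: bdr_count_serialize() writes a fixed-size header followed by one BdrCountSlot per node into a temporary file and then rename()s it into place, so readers never see a half-written global/bdr.stat. A minimal offline dump of just the header could look like the sketch below; the mirrored field order and widths are guesses for illustration only (the real BdrCountSerialize definition lives earlier in bdr_count.c), and the tool is assumed to run from the data directory.

#include <stdint.h>
#include <stdio.h>

/*
 * Assumed mirror of the on-disk header written by bdr_count_serialize();
 * field order and widths are guesses for illustration, not the patch's
 * actual struct definition.
 */
typedef struct BdrCountFileHeader
{
	uint32_t	magic;
	uint32_t	version;
	uint64_t	nr_slots;		/* Size on a 64-bit server */
} BdrCountFileHeader;

int
main(void)
{
	BdrCountFileHeader hdr;
	FILE	   *fp = fopen("global/bdr.stat", "rb");

	if (fp == NULL || fread(&hdr, sizeof(hdr), 1, fp) != 1)
	{
		perror("global/bdr.stat");
		return 1;
	}
	printf("magic %u version %u slots %llu\n",
		   hdr.magic, hdr.version, (unsigned long long) hdr.nr_slots);
	fclose(fp);
	return 0;
}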
+
+/*
+ * Load BDR stats from file into shared memory
+ */
+static void
+bdr_count_unserialize(void)
+{
+ int fd;
+ const char *path = "global/bdr.stat";
+ BdrCountSerialize serial;
+ Size read_size;
+
+ if (BdrCountCtl == NULL)
+ elog(ERROR, "cannot use bdr statistics function without loading bdr");
+
+ LWLockAcquire(BdrCountCtl->lock, LW_EXCLUSIVE);
+
+ fd = OpenTransientFile((char *) path,
+ O_RDONLY | PG_BINARY, 0);
+ if (fd < 0 && errno == ENOENT)
+ goto out;
+
+ if (fd < 0)
+ {
+ LWLockRelease(BdrCountCtl->lock);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open bdr stat file \"%s\": %m", path)));
+ }
+
+ read_size = sizeof(serial);
+ if (read(fd, &serial, read_size) != read_size)
+ {
+ int saved_errno = errno;
+ LWLockRelease(BdrCountCtl->lock);
+ CloseTransientFile(fd);
+ errno = saved_errno;
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not read bdr stat file data \"%s\": %m",
+ path)));
+ }
+
+ if (serial.magic != bdr_count_magic)
+ {
+ LWLockRelease(BdrCountCtl->lock);
+ CloseTransientFile(fd);
+ elog(ERROR, "expected magic %u doesn't match read magic %u",
+ bdr_count_magic, serial.magic);
+ }
+
+ if (serial.version != bdr_count_version)
+ {
+ elog(WARNING, "version of stat file changed (file %u, current %u), zeroing",
+ serial.version, bdr_count_version);
+ goto zero_file;
+ }
+
+ if (serial.nr_slots > bdr_count_nnodes)
+ {
+ elog(WARNING, "stat file has more stats than we need, zeroing");
+ goto zero_file;
+ }
+
+ /* read actual data, directly into shmem */
+ read_size = sizeof(BdrCountSlot) * serial.nr_slots;
+ if (read(fd, &BdrCountCtl->slots, read_size) != read_size)
+ {
+ int saved_errno = errno;
+ CloseTransientFile(fd);
+ errno = saved_errno;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read bdr stat file data \"%s\": %m",
+ path)));
+ }
+
+out:
+ if (fd >= 0)
+ CloseTransientFile(fd);
+ LWLockRelease(BdrCountCtl->lock);
+ return;
+
+zero_file:
+ CloseTransientFile(fd);
+ LWLockRelease(BdrCountCtl->lock);
+
+ /*
+ * Overwrite the existing file. Note our struct was zeroed in
+ * bdr_count_shmem_startup, so we're writing empty data.
+ */
+ bdr_count_serialize();
+}
--- /dev/null
+CREATE FUNCTION pg_stat_bdr(
+ OUT rep_node_id oid,
+ OUT riremotesysid name,
+ OUT riremotedb oid,
+ OUT rilocaldb oid,
+ OUT nr_commit int8,
+ OUT nr_rollback int8,
+ OUT nr_insert int8,
+ OUT nr_insert_conflict int8,
+ OUT nr_update int8,
+ OUT nr_update_conflict int8,
+ OUT nr_delete int8,
+ OUT nr_delete_conflict int8,
+ OUT nr_disconnect int8
+)
+RETURNS SETOF record
+LANGUAGE C
+AS 'bdr.so';
+
+CREATE VIEW pg_stat_bdr AS SELECT * FROM pg_stat_bdr();
+REVOKE ALL ON FUNCTION pg_stat_bdr() FROM PUBLIC;
+
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * bdr_output.c
+ * BDR output plugin
+ *
+ * Copyright (c) 2012-2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/bdr/bdr_output.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/sysattr.h"
+
+#include "catalog/pg_class.h"
+#include "catalog/pg_namespace.h"
+#include "catalog/pg_type.h"
+#include "catalog/index.h"
+
+#include "libpq/pqformat.h"
+
+#include "nodes/parsenodes.h"
+
+#include "replication/output_plugin.h"
+#include "replication/logical.h"
+
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/relcache.h"
+#include "utils/syscache.h"
+#include "utils/timestamp.h"
+#include "utils/typcache.h"
+
+PG_MODULE_MAGIC;
+
+extern void _PG_init(void);
+extern void _PG_output_plugin_init(OutputPluginCallbacks *cb);
+
+typedef struct
+{
+ MemoryContext context;
+ bool include_xids;
+} TestDecodingData;
+
+/* These must be available to pg_dlsym() */
+static void pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt,
+ bool is_init);
+static void pg_decode_begin_txn(LogicalDecodingContext *ctx,
+ ReorderBufferTXN *txn);
+static void pg_decode_commit_txn(LogicalDecodingContext *ctx,
+ ReorderBufferTXN *txn, XLogRecPtr commit_lsn);
+static void pg_decode_change(LogicalDecodingContext *ctx,
+ ReorderBufferTXN *txn, Relation rel,
+ ReorderBufferChange *change);
+
+/* private prototypes */
+static void write_tuple(StringInfo out, Relation rel, HeapTuple tuple);
+
+void
+_PG_init(void)
+{
+}
+
+/* specify output plugin callbacks */
+void
+_PG_output_plugin_init(OutputPluginCallbacks *cb)
+{
+ AssertVariableIsOfType(&_PG_output_plugin_init, LogicalOutputPluginInit);
+
+ cb->startup_cb = pg_decode_startup;
+ cb->begin_cb = pg_decode_begin_txn;
+ cb->change_cb = pg_decode_change;
+ cb->commit_cb = pg_decode_commit_txn;
+ cb->shutdown_cb = NULL;
+}
+
+
+/* initialize this plugin */
+static void
+pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt, bool is_init)
+{
+ TestDecodingData *data;
+
+ data = palloc(sizeof(TestDecodingData));
+ data->context = AllocSetContextCreate(TopMemoryContext,
+ "bdr conversion context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+
+ ctx->output_plugin_private = data;
+
+ opt->output_type = OUTPUT_PLUGIN_BINARY_OUTPUT;
+}
+
+/* BEGIN callback */
+void
+pg_decode_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
+{
+#ifdef NOT_YET
+ TestDecodingData *data = ctx->output_plugin_private;
+#endif
+ AssertVariableIsOfType(&pg_decode_begin_txn, LogicalDecodeBeginCB);
+
+ if (txn->origin_id != InvalidRepNodeId)
+ return;
+
+ OutputPluginPrepareWrite(ctx, true);
+ appendStringInfoChar(ctx->out, 'B'); /* BEGIN */
+ appendBinaryStringInfo(ctx->out, (char *) &txn->final_lsn, sizeof(XLogRecPtr));
+ appendBinaryStringInfo(ctx->out, (char *) &txn->commit_time, sizeof(TimestampTz));
+ OutputPluginWrite(ctx, true);
+ return;
+}
+
+/* COMMIT callback */
+void
+pg_decode_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
+ XLogRecPtr commit_lsn)
+{
+#ifdef NOT_YET
+ TestDecodingData *data = ctx->output_plugin_private;
+#endif
+
+ if (txn->origin_id != InvalidRepNodeId)
+ return;
+
+ OutputPluginPrepareWrite(ctx, true);
+ appendStringInfoChar(ctx->out, 'C'); /* sending COMMIT */
+ appendBinaryStringInfo(ctx->out, (char *) &commit_lsn, sizeof(XLogRecPtr));
+ appendBinaryStringInfo(ctx->out, (char *) &txn->end_lsn, sizeof(XLogRecPtr));
+ appendBinaryStringInfo(ctx->out, (char *) &txn->commit_time, sizeof(TimestampTz));
+ OutputPluginWrite(ctx, true);
+}
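Editorial note, not part of the patch: the BEGIN and COMMIT messages above copy the LSNs and the commit timestamp into the stream verbatim with appendBinaryStringInfo(), i.e. in the sender's host byte order, unlike the length fields written with pq_sendint64() further down. A hedged sketch of a receiver for just these two messages, assuming sender and receiver share byte order and that XLogRecPtr and TimestampTz are 64-bit integers, could look like this (the function is illustrative, not the patch's apply code):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/*
 * Illustrative reader for the 'B' and 'C' framing messages emitted above.
 * No byte swapping: the sender appends the raw 8-byte values, so this only
 * works when sender and receiver share byte order (an assumption).
 */
size_t
parse_txn_message(const char *msg)
{
	const char *cursor = msg;
	uint64_t	lsn;
	int64_t		commit_time;

	switch (*cursor++)
	{
		case 'B':				/* final_lsn, then commit_time */
			memcpy(&lsn, cursor, 8);
			memcpy(&commit_time, cursor + 8, 8);
			cursor += 16;
			printf("BEGIN, txn ends at %X/%X, committed at %lld\n",
				   (unsigned int) (lsn >> 32), (unsigned int) lsn,
				   (long long) commit_time);
			break;
		case 'C':				/* commit_lsn, end_lsn, commit_time */
			memcpy(&lsn, cursor, 8);
			cursor += 24;
			printf("COMMIT at %X/%X\n",
				   (unsigned int) (lsn >> 32), (unsigned int) lsn);
			break;
		default:
			return 0;			/* 'I'/'U'/'D' messages handled elsewhere */
	}
	return (size_t) (cursor - msg);
}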
+
+void
+pg_decode_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
+ Relation relation, ReorderBufferChange *change)
+{
+ TestDecodingData *data;
+ MemoryContext old;
+
+ data = ctx->output_plugin_private;
+
+	/* only log changes originating locally */
+	if (txn->origin_id != InvalidRepNodeId)
+		return;
+
+	/*
+	 * Avoid leaking memory by using and resetting our own context; switch
+	 * only after the early return above so callers stay in their context.
+	 */
+	old = MemoryContextSwitchTo(data->context);
+
+ OutputPluginPrepareWrite(ctx, true);
+
+ switch (change->action)
+ {
+ case REORDER_BUFFER_CHANGE_INSERT:
+ appendStringInfoChar(ctx->out, 'I'); /* action INSERT */
+ appendStringInfoChar(ctx->out, 'N'); /* new tuple follows */
+ write_tuple(ctx->out, relation, &change->tp.newtuple->tuple);
+ break;
+ case REORDER_BUFFER_CHANGE_UPDATE:
+ appendStringInfoChar(ctx->out, 'U'); /* action UPDATE */
+ if (change->tp.oldtuple != NULL)
+ {
+ appendStringInfoChar(ctx->out, 'K'); /* old key follows */
+ write_tuple(ctx->out, relation, &change->tp.oldtuple->tuple);
+ }
+ appendStringInfoChar(ctx->out, 'N'); /* new tuple follows */
+ write_tuple(ctx->out, relation, &change->tp.newtuple->tuple);
+ break;
+ case REORDER_BUFFER_CHANGE_DELETE:
+ appendStringInfoChar(ctx->out, 'D'); /* action DELETE */
+ if (change->tp.oldtuple != NULL)
+ {
+ appendStringInfoChar(ctx->out, 'K'); /* old key follows */
+ write_tuple(ctx->out, relation, &change->tp.oldtuple->tuple);
+ }
+ else
+ appendStringInfoChar(ctx->out, 'E'); /* empty */
+ break;
+ }
+ OutputPluginWrite(ctx, true);
+
+ MemoryContextSwitchTo(old);
+ MemoryContextReset(data->context);
+}
+
+static void
+write_tuple(StringInfo out, Relation rel, HeapTuple tuple)
+{
+ HeapTuple cache;
+ Form_pg_namespace classNsp;
+ const char *nspname;
+ int64 nspnamelen;
+ const char *relname;
+ int64 relnamelen;
+
+ cache = SearchSysCache1(NAMESPACEOID,
+ ObjectIdGetDatum(rel->rd_rel->relnamespace));
+ if (!HeapTupleIsValid(cache))
+ elog(ERROR, "cache lookup failed for namespace %u",
+ rel->rd_rel->relnamespace);
+ classNsp = (Form_pg_namespace) GETSTRUCT(cache);
+ nspname = pstrdup(NameStr(classNsp->nspname));
+ nspnamelen = strlen(nspname) + 1;
+ ReleaseSysCache(cache);
+
+ relname = NameStr(rel->rd_rel->relname);
+ relnamelen = strlen(relname) + 1;
+
+ appendStringInfoChar(out, 'T'); /* tuple follows */
+
+ pq_sendint64(out, nspnamelen); /* schema name length */
+ appendBinaryStringInfo(out, nspname, nspnamelen);
+
+ pq_sendint64(out, relnamelen); /* table name length */
+ appendBinaryStringInfo(out, relname, relnamelen);
+
+ pq_sendint64(out, tuple->t_len); /* tuple length */
+ appendBinaryStringInfo(out, (char *) tuple->t_data, tuple->t_len);
+}
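Editorial note, not part of the patch: write_tuple() frames each tuple as a 'T' byte, the NUL-terminated schema and table names each preceded by their length (sent with pq_sendint64(), i.e. big-endian), and finally the raw HeapTuple data preceded by its length. A hedged sketch of the matching reader, purely illustrative and not the patch's apply code:

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <arpa/inet.h>			/* ntohl() for the big-endian lengths */

/* decode one 8-byte big-endian length, advancing the cursor */
static uint64_t
read_be64(const char **cursor)
{
	uint32_t	hi,
				lo;

	memcpy(&hi, *cursor, 4);
	memcpy(&lo, *cursor + 4, 4);
	*cursor += 8;
	return ((uint64_t) ntohl(hi) << 32) | ntohl(lo);
}

/* Parse one 'T' message; returns the number of bytes consumed, 0 on error. */
size_t
parse_tuple_message(const char *msg)
{
	const char *cursor = msg;
	uint64_t	nspnamelen,
				relnamelen,
				tuplelen;

	if (*cursor++ != 'T')
		return 0;

	nspnamelen = read_be64(&cursor);	/* length includes the trailing NUL */
	printf("schema: %s\n", cursor);
	cursor += nspnamelen;

	relnamelen = read_be64(&cursor);
	printf("table: %s\n", cursor);
	cursor += relnamelen;

	tuplelen = read_be64(&cursor);		/* raw HeapTuple data follows */
	cursor += tuplelen;

	printf("tuple: %llu bytes\n", (unsigned long long) tuplelen);
	return (size_t) (cursor - msg);
}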
--- /dev/null
+/* -------------------------------------------------------------------------
+ *
+ * bdr.c
+ *		Bi-directional replication support; the code in this file drives
+ *		the distributed sequencer (chunk elections, voting, allocation).
+ *
+ * Copyright (C) 2012-2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/bdr/bdr.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "bdr.h"
+
+/* sequencer */
+#include "miscadmin.h"
+#include "pgstat.h"
+
+#include "access/reloptions.h"
+#include "access/transam.h"
+#include "access/seqam.h"
+#include "access/xact.h"
+#include "catalog/pg_type.h"
+#include "catalog/namespace.h"
+#include "commands/extension.h"
+#include "commands/sequence.h"
+#include "executor/spi.h"
+#include "utils/builtins.h"
+#include "utils/snapmgr.h"
+#include "utils/lsyscache.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+
+typedef struct BdrSequencerSlot
+{
+ Oid database_oid;
+ Latch *proclatch;
+} BdrSequencerSlot;
+
+typedef struct BdrSequencerControl
+{
+ size_t nnodes;
+ size_t slot;
+ BdrSequencerSlot slots[FLEXIBLE_ARRAY_MEMBER];
+} BdrSequencerControl;
+
+typedef struct BdrSequenceValues {
+ int64 start_value;
+ int64 next_value;
+ int64 end_value;
+} BdrSequenceValues;
+
+static BdrSequencerControl *BdrSequencerCtl = NULL;
+
+/* how many nodes have we built shmem for */
+static size_t bdr_seq_nnodes = 0;
+
+/* how many sequencers have we built shmem for */
+static size_t bdr_seq_nsequencers = 0;
+
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+
+/* vote */
+const char* vote_sql =
+"INSERT INTO bdr_votes (\n"
+" vote_sysid,\n"
+" vote_tlid,\n"
+" vote_dboid,\n"
+" vote_riname,\n"
+" vote_election_id,\n"
+"\n"
+" voter_sysid,\n"
+" voter_tlid,\n"
+" voter_dboid,\n"
+" voter_riname,\n"
+"\n"
+" vote\n"
+")\n"
+"SELECT\n"
+" owning_sysid, owning_tlid, owning_dboid, owning_riname, owning_election_id,\n"
+" $1, $2, $3, $4,\n"
+" -- haven't allocated the value ourselves\n"
+" NOT EXISTS(\n"
+" SELECT *\n"
+" FROM bdr_sequence_values val\n"
+" WHERE true\n"
+" AND val.seqschema = election.seqschema\n"
+" AND val.seqname = election.seqname\n"
+" AND val.seqrange && election.seqrange\n"
+" --AND NOT val.confirmed\n"
+" AND val.owning_sysid = $1\n"
+" AND val.owning_tlid = $2\n"
+" AND val.owning_dboid = $3\n"
+" AND val.owning_riname = $4\n"
+" )\n"
+" -- and we haven't allowed anybody else to use it\n"
+" AND NOT EXISTS(\n"
+" SELECT *\n"
+" FROM bdr_votes vote\n"
+" JOIN bdr_sequence_elections other_election ON (\n"
+" other_election.owning_sysid = vote.vote_sysid\n"
+" AND other_election.owning_tlid = vote.vote_tlid\n"
+" AND other_election.owning_dboid = vote.vote_dboid\n"
+" AND other_election.owning_riname = vote.vote_riname\n"
+" AND other_election.owning_election_id = vote.vote_election_id\n"
+" )\n"
+" WHERE true\n"
+" AND vote.voter_sysid = $1\n"
+" AND vote.voter_tlid = $2\n"
+" AND vote.voter_dboid = $3\n"
+" AND vote.voter_riname = $4\n"
+" AND other_election.seqname = election.seqname\n"
+" AND other_election.seqschema = election.seqschema\n"
+" AND other_election.seqrange && election.seqrange \n"
+" )\n"
+"\n"
+"FROM bdr_sequence_elections election\n"
+"WHERE\n"
+" election.open\n"
+" -- not our election\n"
+" AND NOT (\n"
+" owning_sysid = $1\n"
+" AND owning_tlid = $2\n"
+" AND owning_dboid = $3\n"
+" AND owning_riname = $4\n"
+" )\n"
+" -- we haven't voted about this yet\n"
+" AND NOT EXISTS (\n"
+" SELECT *\n"
+" FROM bdr_votes\n"
+" WHERE true\n"
+" AND owning_sysid = vote_sysid\n"
+" AND owning_tlid = vote_tlid\n"
+" AND owning_dboid = vote_dboid\n"
+" AND owning_riname = vote_riname\n"
+" AND owning_election_id = vote_election_id\n"
+"\n"
+" AND voter_sysid = $1\n"
+" AND voter_tlid = $2\n"
+" AND voter_dboid = $3\n"
+" AND voter_riname = $4\n"
+" )\n"
+"LIMIT 1\n"
+";";
+
+const char *start_elections_sql =
+"WITH to_be_updated_sequences AS (\n"
+" SELECT\n"
+" pg_namespace.nspname AS seqschema,\n"
+" pg_class.relname AS seqname,\n"
+" COUNT(bdr_sequence_values) AS open_seq_chunks,\n"
+" COALESCE((\n"
+" SELECT max(upper(seqrange))\n"
+" FROM bdr_sequence_values max_val\n"
+" WHERE\n"
+" max_val.seqschema = pg_namespace.nspname\n"
+" AND max_val.seqname = pg_class.relname\n"
+" ), 0) AS current_max\n"
+" FROM\n"
+" pg_class\n"
+" JOIN pg_namespace ON (pg_class.relnamespace = pg_namespace.oid)\n"
+" LEFT JOIN bdr_sequence_values ON (\n"
+" bdr_sequence_values.seqschema = pg_namespace.nspname\n"
+" AND bdr_sequence_values.seqname = pg_class.relname\n"
+" AND bdr_sequence_values.emptied = false\n"
+" AND bdr_sequence_values.in_use = false\n"
+" AND bdr_sequence_values.failed = false\n"
+" AND bdr_sequence_values.owning_sysid = $1\n"
+" AND bdr_sequence_values.owning_tlid = $2\n"
+" AND bdr_sequence_values.owning_dboid = $3\n"
+" AND bdr_sequence_values.owning_riname = $4\n"
+" )\n"
+" WHERE\n"
+" pg_class.relkind = 'S'\n"
+" AND pg_class.relam = (SELECT oid FROM pg_seqam WHERE seqamname = 'bdr')\n"
+" GROUP BY\n"
+" pg_class.relname,\n"
+" pg_namespace.nspname\n"
+" HAVING\n"
+" count(bdr_sequence_values) <= 5\n"
+"),\n"
+"to_be_inserted_chunks AS (\n"
+" SELECT\n"
+" seqschema,\n"
+" seqname,\n"
+" current_max,\n"
+" generate_series(\n"
+" current_max,\n"
+" -- 1000 is the chunk size, -1 is to get < instead <= out of generate_series\n"
+" current_max + 1000 * (5 - open_seq_chunks) - 1,\n"
+" 1000) chunk_start\n"
+" FROM to_be_updated_sequences\n"
+"),\n"
+"inserted_chunks AS (\n"
+" INSERT INTO bdr_sequence_elections(\n"
+" owning_sysid,\n"
+" owning_tlid,\n"
+" owning_dboid,\n"
+" owning_riname,\n"
+" owning_election_id,\n"
+" vote_type,\n"
+" open,\n"
+" seqschema,\n"
+" seqname,\n"
+" seqrange\n"
+" )\n"
+" SELECT\n"
+" $1,\n"
+" $2,\n"
+" $3,\n"
+" $4,\n"
+" (\n"
+" SELECT COALESCE(max(owning_election_id), 0)\n"
+" FROM bdr_sequence_elections biggest\n"
+" WHERE\n"
+" biggest.owning_sysid = $1\n"
+" AND biggest.owning_tlid = $2\n"
+" AND biggest.owning_dboid = $3\n"
+" AND biggest.owning_riname = $4\n"
+" ) + row_number() OVER (),\n"
+" 'sequence',\n"
+" true AS open,\n"
+" seqschema,\n"
+" seqname,\n"
+" int8range(chunk_start, chunk_start + 1000) AS seqrange\n"
+" FROM to_be_inserted_chunks\n"
+" RETURNING\n"
+" seqschema,\n"
+" seqname,\n"
+" seqrange\n"
+")\n"
+"\n"
+"INSERT INTO bdr_sequence_values(\n"
+" owning_sysid,\n"
+" owning_tlid,\n"
+" owning_dboid,\n"
+" owning_riname,\n"
+" seqschema,\n"
+" seqname,\n"
+" confirmed,\n"
+" in_use,\n"
+" emptied,\n"
+" seqrange\n"
+")\n"
+"SELECT\n"
+" $1,\n"
+" $2,\n"
+" $3,\n"
+" $4,\n"
+" seqschema,\n"
+" seqname,\n"
+" false AS confirmed,\n"
+" false AS in_use,\n"
+" false AS emptied,\n"
+" int8range(chunk_start, chunk_start + 1000)\n"
+"FROM to_be_inserted_chunks\n"
+"-- force evaluation \n"
+"WHERE (SELECT count(*) FROM inserted_chunks) >= 0\n"
+"RETURNING\n"
+" owning_sysid,\n"
+" owning_tlid,\n"
+" owning_dboid,\n"
+" owning_riname,\n"
+" seqschema,\n"
+" seqname,\n"
+" confirmed,\n"
+" emptied,\n"
+" seqrange\n"
+;
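Editorial worked example (illustrative values, not from the patch): if this node currently holds two confirmed-but-unused, unfailed, not-yet-emptied chunks for a bdr sequence (open_seq_chunks = 2) and the highest chunk allocated for that sequence so far ends at 7000 (current_max = 7000), the query above proposes three new 1000-value chunks [7000,8000), [8000,9000) and [9000,10000), inserts one open election plus one unconfirmed bdr_sequence_values row per chunk, and the other nodes then accept or reject them via the vote query shown earlier.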
+
+const char *tally_elections_sql =
+"WITH tallied_votes AS (\n"
+"SELECT\n"
+" election.owning_sysid,\n"
+" election.owning_tlid,\n"
+" election.owning_dboid,\n"
+" election.owning_riname,\n"
+" election.owning_election_id,\n"
+" election.seqschema,\n"
+" election.seqname,\n"
+" election.seqrange,\n"
+" SUM(COALESCE((vote.vote = true)::int, 0)) AS yays,\n"
+" SUM(COALESCE((vote.vote = false)::int, 0)) AS nays,\n"
+" COUNT(vote.vote) AS nr_votes,\n"
+" /* majority of others */\n"
+" COUNT(vote.vote) >= ceil($5/ 2.0) AS sufficient\n"
+"FROM\n"
+" bdr_sequence_elections election\n"
+" LEFT JOIN bdr_votes vote ON (\n"
+" election.owning_sysid = vote.vote_sysid\n"
+" AND election.owning_tlid = vote.vote_tlid\n"
+" AND election.owning_dboid = vote.vote_dboid\n"
+" AND election.owning_riname = vote.vote_riname\n"
+" AND election.owning_election_id = vote.vote_election_id\n"
+" )\n"
+"WHERE\n"
+" election.open\n"
+" AND election.owning_sysid = $1\n"
+" AND election.owning_tlid = $2\n"
+" AND election.owning_dboid = $3\n"
+" AND election.owning_riname = $4\n"
+"GROUP BY\n"
+" election.owning_sysid,\n"
+" election.owning_tlid,\n"
+" election.owning_dboid,\n"
+" election.owning_riname,\n"
+" election.owning_election_id,\n"
+" election.seqschema,\n"
+" election.seqname,\n"
+" election.seqrange\n"
+"),\n"
+"cast_votes AS (\n"
+" UPDATE bdr_sequence_elections\n"
+" SET\n"
+" open = false,\n"
+" success = (nays = 0)\n"
+" FROM tallied_votes\n"
+" WHERE\n"
+" bdr_sequence_elections.owning_sysid = tallied_votes.owning_sysid\n"
+" AND bdr_sequence_elections.owning_tlid = tallied_votes.owning_tlid\n"
+" AND bdr_sequence_elections.owning_dboid = tallied_votes.owning_dboid\n"
+" AND bdr_sequence_elections.owning_riname = tallied_votes.owning_riname\n"
+" AND bdr_sequence_elections.owning_election_id = tallied_votes.owning_election_id\n"
+" AND tallied_votes.sufficient\n"
+" RETURNING bdr_sequence_elections.*\n"
+"),\n"
+"successfull_sequence_values AS (\n"
+" UPDATE bdr_sequence_values\n"
+" SET\n"
+" confirmed = true\n"
+" FROM cast_votes\n"
+" WHERE\n"
+" cast_votes.success = true\n"
+" AND bdr_sequence_values.seqschema = cast_votes.seqschema\n"
+" AND bdr_sequence_values.seqname = cast_votes.seqname\n"
+" AND bdr_sequence_values.seqrange = cast_votes.seqrange\n"
+" AND bdr_sequence_values.owning_sysid = $1\n"
+" AND bdr_sequence_values.owning_tlid = $2\n"
+" AND bdr_sequence_values.owning_dboid = $3\n"
+" AND bdr_sequence_values.owning_riname = $4\n"
+" RETURNING bdr_sequence_values.*\n"
+"),\n"
+"failed_sequence_values AS (\n"
+" UPDATE bdr_sequence_values\n"
+" SET\n"
+" failed = true\n"
+" FROM cast_votes\n"
+" WHERE\n"
+" cast_votes.success = false\n"
+" AND bdr_sequence_values.seqschema = cast_votes.seqschema\n"
+" AND bdr_sequence_values.seqname = cast_votes.seqname\n"
+" AND bdr_sequence_values.seqrange = cast_votes.seqrange\n"
+" AND bdr_sequence_values.owning_sysid = $1\n"
+" AND bdr_sequence_values.owning_tlid = $2\n"
+" AND bdr_sequence_values.owning_dboid = $3\n"
+" AND bdr_sequence_values.owning_riname = $4\n"
+" RETURNING bdr_sequence_values.*\n"
+")\n"
+"\n"
+"SELECT\n"
+" seqschema,\n"
+" seqname,\n"
+" seqrange,\n"
+" 'success'::text\n"
+"FROM successfull_sequence_values\n"
+"\n"
+"UNION ALL\n"
+"\n"
+"SELECT\n"
+" seqschema,\n"
+" seqname,\n"
+" seqrange,\n"
+" 'failed'::text\n"
+"FROM failed_sequence_values\n"
+"\n"
+"UNION ALL\n"
+"\n"
+"SELECT\n"
+" seqschema,\n"
+" seqname,\n"
+" seqrange,\n"
+" 'pending'::text\n"
+"FROM tallied_votes\n"
+"WHERE NOT sufficient\n"
+;
+
+const char *fill_sequences_sql =
+"SELECT\n"
+" pg_class.oid seqoid,\n"
+" pg_namespace.nspname seqschema,\n"
+" pg_class.relname seqname\n"
+"FROM pg_class\n"
+" JOIN pg_seqam ON (pg_seqam.oid = pg_class.relam)\n"
+" JOIN pg_namespace ON (pg_class.relnamespace = pg_namespace.oid)\n"
+"WHERE\n"
+" relkind = 'S'\n"
+" AND seqamname = 'bdr'\n"
+"ORDER BY pg_class.oid\n"
+;
+
+
+const char *get_chunk_sql =
+"UPDATE bdr_sequence_values\n"
+" SET in_use = true\n"
+"WHERE\n"
+" (\n"
+" owning_sysid,\n"
+" owning_tlid,\n"
+" owning_dboid,\n"
+" owning_riname,\n"
+" seqname,\n"
+" seqschema,\n"
+" seqrange\n"
+" )\n"
+" IN\n"
+" (\n"
+" SELECT\n"
+" newval.owning_sysid,\n"
+" newval.owning_tlid,\n"
+" newval.owning_dboid,\n"
+" newval.owning_riname,\n"
+" newval.seqname,\n"
+" newval.seqschema,\n"
+" newval.seqrange\n"
+" FROM bdr_sequence_values newval\n"
+" WHERE\n"
+" newval.confirmed\n"
+" AND NOT newval.emptied\n"
+" AND NOT newval.in_use\n"
+" AND newval.owning_sysid = $1\n"
+" AND newval.owning_tlid = $2\n"
+" AND newval.owning_dboid = $3\n"
+" AND newval.owning_riname = $4\n"
+" AND newval.seqschema = $5\n"
+" AND newval.seqname = $6\n"
+" ORDER BY newval.seqrange ASC\n"
+" LIMIT 1\n"
+" FOR UPDATE\n"
+" )\n"
+"RETURNING\n"
+" lower(seqrange),\n"
+" upper(seqrange)\n"
+;
+
+static Size
+bdr_sequencer_shmem_size(void)
+{
+ Size size = 0;
+
+ size = add_size(size, sizeof(BdrSequencerControl));
+ size = add_size(size, mul_size(bdr_seq_nsequencers, sizeof(BdrSequencerSlot)));
+
+ return size;
+}
+
+static void
+bdr_sequencer_shmem_shutdown(int code, Datum arg)
+{
+ BdrSequencerSlot *slot;
+ if (bdr_sequencer_con == NULL)
+ return;
+
+ slot = &BdrSequencerCtl->slots[bdr_sequencer_con->slot];
+
+ slot->database_oid = InvalidOid;
+ slot->proclatch = NULL;
+}
+
+static void
+bdr_sequencer_shmem_startup(void)
+{
+ bool found;
+
+ if (prev_shmem_startup_hook)
+ prev_shmem_startup_hook();
+
+ LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+ BdrSequencerCtl = ShmemInitStruct("bdr_sequencer",
+ bdr_sequencer_shmem_size(),
+ &found);
+ if (!found)
+ {
+ /* initialize */
+ memset(BdrSequencerCtl, 0, bdr_sequencer_shmem_size());
+ }
+ LWLockRelease(AddinShmemInitLock);
+
+ on_shmem_exit(bdr_sequencer_shmem_shutdown, (Datum) 0);
+}
+
+void
+bdr_sequencer_shmem_init(int nnodes, int sequencers)
+{
+ Assert(process_shared_preload_libraries_in_progress);
+
+ bdr_seq_nnodes = nnodes;
+ bdr_seq_nsequencers = sequencers;
+
+ RequestAddinShmemSpace(bdr_sequencer_shmem_size());
+
+ prev_shmem_startup_hook = shmem_startup_hook;
+ shmem_startup_hook = bdr_sequencer_shmem_startup;
+}
+
+void
+bdr_sequencer_wakeup(void)
+{
+ int off;
+ BdrSequencerSlot *slot;
+
+
+ for (off = 0; off < bdr_seq_nnodes; off++)
+ {
+ slot = &BdrSequencerCtl->slots[off];
+
+ /* FIXME: locking! */
+ if (slot->database_oid == InvalidOid)
+ continue;
+
+ if (slot->database_oid != MyDatabaseId)
+ continue;
+
+ SetLatch(slot->proclatch);
+ }
+}
+
+void
+bdr_sequencer_init(void)
+{
+ CreateExtensionStmt create_stmt;
+ AlterExtensionStmt alter_stmt;
+ BdrSequencerSlot *slot;
+
+ Assert(bdr_sequencer_con != NULL);
+
+ slot = &BdrSequencerCtl->slots[bdr_sequencer_con->slot];
+ slot->database_oid = MyDatabaseId;
+ slot->proclatch = &MyProc->procLatch;
+
+ create_stmt.if_not_exists = true;
+ create_stmt.options = NIL;
+
+ alter_stmt.options = NIL;
+
+ StartTransactionCommand();
+ PushActiveSnapshot(GetTransactionSnapshot());
+
+ create_stmt.extname = (char *)"btree_gist";
+ alter_stmt.extname = (char *)"btree_gist";
+
+ /* create extension if not exists */
+ CreateExtension(&create_stmt);
+ /* update extension otherwise */
+ ExecAlterExtensionStmt(&alter_stmt);
+
+ create_stmt.extname = (char *)"bdr";
+ alter_stmt.extname = (char *)"bdr";
+ /* create extension if not exists */
+ CreateExtension(&create_stmt);
+ /* update extension otherwise */
+ ExecAlterExtensionStmt(&alter_stmt);
+
+ PopActiveSnapshot();
+ CommitTransactionCommand();
+}
+
+static void
+bdr_sequencer_lock_rel(char *relname)
+{
+ Oid nspoid;
+ Oid relid;
+
+ nspoid = get_namespace_oid("bdr", false);
+ relid = get_relname_relid(relname, nspoid);
+ if (!relid)
+ elog(ERROR, "cache lookup failed for relation public.%s", relname);
+
+ SetCurrentStatementStartTimestamp();
+ pgstat_report_activity(STATE_RUNNING, relname);
+ LockRelationOid(relid, ExclusiveLock);
+}
+
+static void
+bdr_sequencer_lock(void)
+{
+
+ bdr_sequencer_lock_rel("bdr_sequence_elections");
+ bdr_sequencer_lock_rel("bdr_sequence_values");
+ bdr_sequencer_lock_rel("bdr_votes");
+}
+
+void
+bdr_sequencer_vote(void)
+{
+ Oid argtypes[4];
+ Datum values[4];
+ bool nulls[4];
+ char local_sysid[32];
+ int ret;
+ int my_processed;
+
+ snprintf(local_sysid, sizeof(local_sysid), UINT64_FORMAT,
+ GetSystemIdentifier());
+
+ argtypes[0] = TEXTOID;
+ nulls[0] = false;
+ values[0] = CStringGetTextDatum(local_sysid);
+
+ argtypes[1] = OIDOID;
+ nulls[1] = false;
+ values[1] = ObjectIdGetDatum(ThisTimeLineID);
+
+ argtypes[2] = OIDOID;
+ values[2] = ObjectIdGetDatum(MyDatabaseId);
+ nulls[2] = false;
+
+ argtypes[3] = TEXTOID;
+ values[3] = CStringGetTextDatum("");
+ nulls[3] = false;
+
+ StartTransactionCommand();
+ SPI_connect();
+
+ bdr_sequencer_lock();
+ PushActiveSnapshot(GetTransactionSnapshot());
+
+again:
+ SetCurrentStatementStartTimestamp();
+ pgstat_report_activity(STATE_RUNNING, "sequence voting");
+ ret = SPI_execute_with_args(vote_sql, 4, argtypes,
+ values, nulls, false, 0);
+
+	if (ret != SPI_OK_INSERT)
+		elog(ERROR, "expected SPI_OK_INSERT from voting query, got %d", ret);
+ my_processed = SPI_processed;
+ elog(LOG, "started %d votes", my_processed);
+
+ if (my_processed > 0)
+ goto again;
+
+ PopActiveSnapshot();
+ SPI_finish();
+ CommitTransactionCommand();
+
+}
+
+/*
+ * Check whether we need to initiate a voting procedure for getting new
+ * sequence chunks.
+ */
+void
+bdr_sequencer_start_elections(void)
+{
+ Oid argtypes[4];
+ Datum values[4];
+ bool nulls[4];
+ char local_sysid[32];
+ int ret;
+
+ snprintf(local_sysid, sizeof(local_sysid), UINT64_FORMAT,
+ GetSystemIdentifier());
+
+ StartTransactionCommand();
+ SPI_connect();
+
+ bdr_sequencer_lock();
+ PushActiveSnapshot(GetTransactionSnapshot());
+
+ argtypes[0] = TEXTOID;
+ nulls[0] = false;
+ values[0] = CStringGetTextDatum(local_sysid);
+
+ argtypes[1] = OIDOID;
+ nulls[1] = false;
+ values[1] = ObjectIdGetDatum(ThisTimeLineID);
+
+ argtypes[2] = OIDOID;
+ values[2] = ObjectIdGetDatum(MyDatabaseId);
+ nulls[2] = false;
+
+ argtypes[3] = TEXTOID;
+ values[3] = CStringGetTextDatum("");
+ nulls[3] = false;
+
+ SetCurrentStatementStartTimestamp();
+ pgstat_report_activity(STATE_RUNNING, "start_elections");
+ ret = SPI_execute_with_args(start_elections_sql, 4, argtypes,
+ values, nulls, false, 0);
+
+	if (ret != SPI_OK_INSERT_RETURNING)
+		elog(ERROR, "expected SPI_OK_INSERT_RETURNING from election query, got %d", ret);
+
+ elog(LOG, "started %d elections", SPI_processed);
+
+ PopActiveSnapshot();
+ SPI_finish();
+ CommitTransactionCommand();
+}
+
+/*
+ * Check whether enough votes have come in for any of *our* in progress
+ * elections.
+ */
+void
+bdr_sequencer_tally(void)
+{
+ Oid argtypes[5];
+ Datum values[5];
+ bool nulls[5];
+ char local_sysid[32];
+ int ret;
+
+ snprintf(local_sysid, sizeof(local_sysid), UINT64_FORMAT,
+ GetSystemIdentifier());
+
+ StartTransactionCommand();
+ SPI_connect();
+
+ bdr_sequencer_lock();
+ PushActiveSnapshot(GetTransactionSnapshot());
+
+ argtypes[0] = TEXTOID;
+ nulls[0] = false;
+ values[0] = CStringGetTextDatum(local_sysid);
+
+ argtypes[1] = OIDOID;
+ nulls[1] = false;
+ values[1] = ObjectIdGetDatum(ThisTimeLineID);
+
+ argtypes[2] = OIDOID;
+ values[2] = ObjectIdGetDatum(MyDatabaseId);
+ nulls[2] = false;
+
+ argtypes[3] = TEXTOID;
+ values[3] = CStringGetTextDatum("");
+ nulls[3] = false;
+
+ argtypes[4] = INT4OID;
+ values[4] = Int32GetDatum(bdr_sequencer_con->num_nodes);
+ nulls[4] = false;
+
+ SetCurrentStatementStartTimestamp();
+ pgstat_report_activity(STATE_RUNNING, "tally_elections");
+ ret = SPI_execute_with_args(tally_elections_sql, 5, argtypes,
+ values, nulls, false, 0);
+
+	if (ret != SPI_OK_SELECT)
+		elog(ERROR, "expected SPI_OK_SELECT from tally query, got %d", ret);
+
+ elog(LOG, "tallied %d elections", SPI_processed);
+
+ PopActiveSnapshot();
+ SPI_finish();
+ CommitTransactionCommand();
+}
+
+
+static int
+bdr_sequence_value_cmp(const void *a, const void *b)
+{
+ const BdrSequenceValues *left = a;
+ const BdrSequenceValues *right = b;
+
+ if (left->start_value < right->start_value)
+ return -1;
+ if (left->start_value == right->start_value)
+ return 0;
+ return 1;
+}
+
+/*
+ * Replace a single (uninitialized or used up) chunk by a free one. Mark the
+ * new chunk from bdr_sequence_values as in_use.
+ *
+ * Returns whether we could find a chunk or not.
+ */
+static bool
+bdr_sequencer_fill_chunk(Oid seqoid, char *seqschema, char *seqname,
+ BdrSequenceValues *curval)
+{
+ Oid argtypes[6];
+ Datum values[6];
+ bool nulls[6];
+ char local_sysid[32];
+ int ret;
+ int64 lower, upper;
+ bool success;
+
+ SPI_push();
+ SPI_connect();
+
+ snprintf(local_sysid, sizeof(local_sysid), UINT64_FORMAT,
+ GetSystemIdentifier());
+
+ argtypes[0] = TEXTOID;
+ nulls[0] = false;
+ values[0] = CStringGetTextDatum(local_sysid);
+
+ argtypes[1] = OIDOID;
+ nulls[1] = false;
+ values[1] = ObjectIdGetDatum(ThisTimeLineID);
+
+ argtypes[2] = OIDOID;
+ values[2] = ObjectIdGetDatum(MyDatabaseId);
+ nulls[2] = false;
+
+ argtypes[3] = TEXTOID;
+ values[3] = CStringGetTextDatum("");
+ nulls[3] = false;
+
+ argtypes[4] = TEXTOID;
+ values[4] = CStringGetTextDatum(seqschema);
+ nulls[4] = false;
+
+ argtypes[5] = TEXTOID;
+ values[5] = CStringGetTextDatum(seqname);
+ nulls[5] = false;
+
+ SetCurrentStatementStartTimestamp();
+ pgstat_report_activity(STATE_RUNNING, "get_chunk");
+
+ ret = SPI_execute_with_args(get_chunk_sql, 6, argtypes,
+ values, nulls, false, 0);
+	if (ret != SPI_OK_UPDATE_RETURNING)
+		elog(ERROR, "expected SPI_OK_UPDATE_RETURNING from chunk query, got %d", ret);
+
+ if (SPI_processed != 1)
+ {
+ elog(NOTICE, "no free chunks for sequence %s.%s",
+ seqschema, seqname);
+ success = false;
+ }
+ else
+ {
+ HeapTuple tup = SPI_tuptable->vals[0];
+ bool isnull;
+
+ lower = DatumGetInt64(SPI_getbinval(tup, SPI_tuptable->tupdesc, 1, &isnull));
+ Assert(!isnull);
+ upper = DatumGetInt64(SPI_getbinval(tup, SPI_tuptable->tupdesc, 2, &isnull));
+ Assert(!isnull);
+
+ elog(NOTICE, "got chunk [%zu, %zu) for sequence %s.%s",
+ lower, upper, seqschema, seqname);
+ curval->start_value = lower;
+ curval->next_value = lower;
+ curval->end_value = upper;
+
+ success = true;
+ }
+ SPI_finish();
+ SPI_pop();
+
+ return success;
+}
+
+/*
+ * Search for used up chunks in one bdr sequence.
+ */
+static void
+bdr_sequencer_fill_sequence(Oid seqoid, char *seqschema, char *seqname)
+{
+ Buffer buf;
+ SeqTable elm;
+ Relation rel;
+ HeapTupleData seqtuple;
+ Datum values[SEQ_COL_LASTCOL];
+ bool nulls[SEQ_COL_LASTCOL];
+ HeapTuple newtup;
+ Page page, temppage;
+ BdrSequenceValues *curval, *firstval;
+ int i;
+ bool acquired_new = false;
+
+ elog(LOG, "checking sequence %u: %s.%s",
+ seqoid, seqschema, seqname);
+
+ /* lock page, fill heaptup */
+ init_sequence(seqoid, &elm, &rel);
+ (void) read_seq_tuple(elm, rel, &buf, &seqtuple);
+
+ /* get values */
+ heap_deform_tuple(&seqtuple, RelationGetDescr(rel),
+ values, nulls);
+
+ /* now make sure we have space for our own data */
+ if (nulls[SEQ_COL_AMDATA - 1])
+ {
+ struct varlena *vl = palloc0(VARHDRSZ + sizeof(BdrSequenceValues) * 10);
+ SET_VARSIZE(vl, VARHDRSZ + sizeof(BdrSequenceValues) * 10);
+ nulls[SEQ_COL_AMDATA - 1] = false;
+ values[SEQ_COL_AMDATA - 1] = PointerGetDatum(vl);
+ }
+
+ firstval = (BdrSequenceValues *)
+ VARDATA_ANY(DatumGetByteaP(values[SEQ_COL_AMDATA - 1]));
+ curval = firstval;
+
+ START_CRIT_SECTION();
+
+ MarkBufferDirty(buf);
+
+ for (i = 0; i < 10; i ++)
+ {
+ if (curval->next_value == curval->end_value)
+ {
+ if (curval->end_value > 0)
+ elog(LOG, "used up old chunk");
+
+ elog(LOG, "need new batch %i", i);
+ if (bdr_sequencer_fill_chunk(seqoid, seqschema, seqname, curval))
+ acquired_new = true;
+ else
+ break;
+ }
+ curval++;
+ }
+
+ if (!acquired_new)
+ goto done_with_sequence;
+
+ /* sort chunks, so we always use the smallest one first */
+ qsort(firstval, 10, sizeof(BdrSequenceValues), bdr_sequence_value_cmp);
+
+ newtup = heap_form_tuple(RelationGetDescr(rel), values, nulls);
+
+ /* special requirements for sequence tuples */
+ HeapTupleHeaderSetXmin(newtup->t_data, FrozenTransactionId);
+ newtup->t_data->t_infomask |= HEAP_XMIN_COMMITTED;
+ newtup->t_data->t_infomask |= HEAP_XMAX_INVALID;
+
+ page = BufferGetPage(buf);
+ temppage = PageGetTempPage(page);
+
+ /* replace page contents, the direct way */
+ PageInit(temppage, BufferGetPageSize(buf), PageGetSpecialSize(page));
+ memcpy(PageGetSpecialPointer(temppage),
+ PageGetSpecialPointer(page),
+ PageGetSpecialSize(page));
+
+ if (PageAddItem(temppage, (Item) newtup->t_data, newtup->t_len,
+ FirstOffsetNumber, false, false) == InvalidOffsetNumber)
+ elog(PANIC, "fill_sequence: failed to add item to page");
+
+ PageSetLSN(temppage, PageGetLSN(page));
+
+ memcpy(page, temppage, BufferGetPageSize(buf));
+
+ seqtuple.t_len = newtup->t_len;
+
+ log_sequence_tuple(rel, &seqtuple, page);
+
+ END_CRIT_SECTION();
+
+done_with_sequence:
+ UnlockReleaseBuffer(buf);
+ heap_close(rel, NoLock);
+}
+
+/*
+ * Check whether all BDR sequences have enough values inline. If not, add
+ * some. This should be called after tallying (so we have a better chance to
+ * have enough chunks) but before starting new elections since we might use up
+ * existing chunks.
+ */
+void
+bdr_sequencer_fill_sequences(void)
+{
+ SPIPlanPtr plan;
+ Portal cursor;
+
+ StartTransactionCommand();
+ SPI_connect();
+
+ bdr_sequencer_lock();
+ PushActiveSnapshot(GetTransactionSnapshot());
+
+ plan = SPI_prepare(fill_sequences_sql, 0, NULL);
+ cursor = SPI_cursor_open("seq", plan, NULL, NULL, 0);
+
+ SetCurrentStatementStartTimestamp();
+ pgstat_report_activity(STATE_RUNNING, "fill_sequences");
+
+ SPI_cursor_fetch(cursor, true, 1);
+
+ while (SPI_processed > 0)
+ {
+ HeapTuple tup = SPI_tuptable->vals[0];
+ bool isnull;
+ Datum seqoid;
+ Datum seqschema;
+ Datum seqname;
+
+ seqoid = SPI_getbinval(tup, SPI_tuptable->tupdesc, 1, &isnull);
+ Assert(!isnull);
+ seqschema = SPI_getbinval(tup, SPI_tuptable->tupdesc, 2, &isnull);
+ Assert(!isnull);
+ seqname = SPI_getbinval(tup, SPI_tuptable->tupdesc, 3, &isnull);
+ Assert(!isnull);
+
+ bdr_sequencer_fill_sequence(DatumGetObjectId(seqoid),
+ NameStr(*DatumGetName(seqschema)),
+ NameStr(*DatumGetName(seqname)));
+
+ SPI_cursor_fetch(cursor, true, 1);
+ }
+
+ PopActiveSnapshot();
+ SPI_finish();
+ CommitTransactionCommand();
+}
+
+
+/* check sequence.c */
+#define SEQ_LOG_VALS 32
+
+PG_FUNCTION_INFO_V1(bdr_sequence_alloc);
+void
+bdr_sequence_alloc(PG_FUNCTION_ARGS)
+{
+ Relation seqrel = (Relation) PG_GETARG_POINTER(0);
+ SeqTable elm = (SeqTable) PG_GETARG_POINTER(1);
+ Buffer buf = (Buffer) PG_GETARG_INT32(2);
+ HeapTuple seqtuple = (HeapTuple) PG_GETARG_POINTER(3);
+ Page page;
+ Form_pg_sequence seq;
+ bool logit = false;
+ int64 cache,
+ log,
+ fetch,
+ last;
+ int64 result = 0;
+ int64 next;
+ Datum values;
+ bool isnull;
+ BdrSequenceValues *curval;
+ int i;
+ bool wakeup = false;
+
+ page = BufferGetPage(buf);
+ seq = (Form_pg_sequence) GETSTRUCT(seqtuple);
+
+ values = fastgetattr(seqtuple, 11, RelationGetDescr(seqrel), &isnull);
+ if (isnull)
+ elog(ERROR, "uninitialized sequence");
+
+ curval = (BdrSequenceValues *) VARDATA_ANY(DatumGetByteaP(values));
+
+ Assert(seq->increment_by == 1);
+ /* XXX: check min/max */
+
+ last = next = seq->last_value;
+
+ fetch = cache = seq->cache_value;
+ log = seq->log_cnt;
+
+ /* check whether value can be satisfied without logging again */
+ if (log < fetch || !seq->is_called || PageGetLSN(page) <= GetRedoRecPtr())
+ {
+ /* forced log to satisfy local demand for values */
+ fetch = log = fetch + SEQ_LOG_VALS;
+ logit = true;
+ }
+
+ /*
+ * try to fetch cache [+ log ] numbers, check all 10 possible chunks
+ */
+ for (i = 0; i < 10; i ++)
+ {
+		/* redo recovered after crash */
+ if (seq->last_value >= curval->next_value &&
+ seq->last_value < curval->end_value)
+ {
+ curval->next_value = seq->last_value + 1;
+ }
+
+ /* chunk empty */
+ if (curval->next_value >= curval->end_value)
+ {
+ curval++;
+ continue;
+ }
+
+
+ /* there's space in current chunk, use it */
+ result = curval->next_value;
+
+ /* but not enough for all ..log values */
+ if (result + log >= curval->end_value)
+ {
+ log = curval->end_value - curval->next_value;
+ wakeup = true;
+ logit = true;
+ }
+
+ /* but not enough for all ..cached values */
+ last = result + cache - 1;
+ if (last >= curval->end_value)
+ {
+ last = curval->end_value - 1;
+ wakeup = true;
+ logit = true;
+ }
+
+ curval->next_value = last;
+ break;
+ }
+
+ if (result == 0)
+ {
+ bdr_sequencer_wakeup();
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not find free sequence value")));
+ }
+
+ if (wakeup)
+ bdr_sequencer_wakeup();
+
+ elm->last = result;
+ elm->cached = result;
+ elm->last_valid = true;
+
+ /* ready to change the on-disk (or really, in-buffer) tuple */
+ START_CRIT_SECTION();
+
+ /*
+ * We must mark the buffer dirty before doing XLogInsert(); see notes in
+ * SyncOneBuffer(). However, we don't apply the desired changes just yet.
+ * This looks like a violation of the buffer update protocol, but it is
+ * in fact safe because we hold exclusive lock on the buffer. Any other
+ * process, including a checkpoint, that tries to examine the buffer
+ * contents will block until we release the lock, and then will see the
+ * final state that we install below.
+ */
+ MarkBufferDirty(buf);
+
+ if (logit)
+ {
+ /*
+ * We don't log the current state of the tuple, but rather the state
+ * as it would appear after "log" more fetches. This lets us skip
+ * that many future WAL records, at the cost that we lose those
+ * sequence values if we crash.
+ */
+ seq->last_value = next;
+ seq->is_called = true;
+ seq->log_cnt = 0;
+ log_sequence_tuple(seqrel, seqtuple, page);
+ }
+
+ /* Now update sequence tuple to the intended final state */
+ seq->last_value = elm->last; /* last fetched number */
+ seq->is_called = true;
+ seq->log_cnt = log; /* how much is logged */
+
+ result = elm->last;
+
+ END_CRIT_SECTION();
+}
+
+PG_FUNCTION_INFO_V1(bdr_sequence_setval);
+void
+bdr_sequence_setval(PG_FUNCTION_ARGS)
+{
+ Relation seqrel = (Relation) PG_GETARG_POINTER(0);
+ Buffer buf = (Buffer) PG_GETARG_INT32(2);
+ HeapTuple seqtuple = (HeapTuple) PG_GETARG_POINTER(3);
+ int64 next = PG_GETARG_INT64(4);
+ bool iscalled = PG_GETARG_BOOL(5);
+ Page page = BufferGetPage(buf);
+ Form_pg_sequence seq = (Form_pg_sequence) GETSTRUCT(seqtuple);
+
+ /* ready to change the on-disk (or really, in-buffer) tuple */
+ START_CRIT_SECTION();
+
+ /* set is_called, all AMs should need to do this */
+ seq->is_called = iscalled;
+ seq->last_value = next; /* last fetched number */
+ seq->log_cnt = 0;
+
+ MarkBufferDirty(buf);
+
+ log_sequence_tuple(seqrel, seqtuple, page);
+
+ END_CRIT_SECTION();
+}
+
+PG_FUNCTION_INFO_V1(bdr_sequence_options);
+Datum
+bdr_sequence_options(PG_FUNCTION_ARGS)
+{
+ Datum reloptions = PG_GETARG_DATUM(0);
+ bool validate = PG_GETARG_BOOL(1);
+ bytea *result;
+
+ result = default_reloptions(reloptions, validate, RELOPT_KIND_SEQUENCE);
+ if (result)
+ PG_RETURN_BYTEA_P(result);
+
+ PG_RETURN_NULL();
+}
--- /dev/null
+-- check extension creation works
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
+SELECT * FROM bdr.pg_stat_bdr;
+ rep_node_id | riremotesysid | riremotedb | rilocaldb | nr_commit | nr_rollback | nr_insert | nr_insert_conflict | nr_update | nr_update_conflict | nr_delete | nr_delete_conflict | nr_disconnect
+-------------+---------------+------------+-----------+-----------+-------------+-----------+--------------------+-----------+--------------------+-----------+--------------------+---------------
+(0 rows)
+
+-- check that only superusers can do this
+CREATE ROLE bdr_no_special_perms;
+SET ROLE bdr_no_special_perms;
+SELECT * FROM bdr.pg_stat_bdr;
+ERROR: permission denied for relation pg_stat_bdr
+SELECT * FROM bdr.pg_stat_get_bdr();
+ERROR: permission denied for function pg_stat_get_bdr
+-- reacquire permissions, drop extension, role
+RESET role;
+DROP EXTENSION bdr;
+DROP EXTENSION btree_gist;
+DROP ROLE bdr_no_special_perms;
--- /dev/null
+# contrib/bdr/output.mk
+
+MODULE_big = bdr_output
+OBJS = bdr_output.o
+
+PG_CPPFLAGS = -I$(libpq_srcdir)
+SHLIB_LINK = $(libpq)
+SHLIB_PREREQS = submake-libpq
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/bdr
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
--- /dev/null
+-- check extension creation works
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
+SELECT * FROM bdr.pg_stat_bdr;
+
+-- check that only superusers can do this
+CREATE ROLE bdr_no_special_perms;
+SET ROLE bdr_no_special_perms;
+SELECT * FROM bdr.pg_stat_bdr;
+SELECT * FROM bdr.pg_stat_get_bdr();
+
+-- reacquire permissions, drop extension, role
+RESET role;
+DROP EXTENSION bdr;
+DROP EXTENSION btree_gist;
+DROP ROLE bdr_no_special_perms;
--- /dev/null
+# contrib/bdr/worker.mk
+
+MODULE_big = bdr
+OBJS = bdr.o bdr_apply.o bdr_count.o bdr_seq.o
+
+EXTENSION = bdr
+DATA = bdr--0.5.sql
+
+PG_CPPFLAGS = -I$(libpq_srcdir)
+SHLIB_LINK = $(libpq)
+SHLIB_PREREQS = submake-libpq
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/bdr
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+# Disabled because these tests require "wal_level=logical", which
+# typical installcheck users do not have (e.g. buildfarm clients).
+installcheck:;
+
+submake-regress:
+ $(MAKE) -C $(top_builddir)/src/test/regress
+
+submake-btree_gist:
+ $(MAKE) -C $(top_builddir)/contrib/btree_gist
+
+check: all | submake-regress submake-btree_gist
+ $(pg_regress_check) \
+ --temp-config $(top_srcdir)/contrib/bdr/bdr.conf \
+ --temp-install=./tmp_check \
+ --extra-install=contrib/btree_gist \
+ --extra-install=contrib/bdr \
+ extension
+
+.PHONY: submake-regress submake-btree_gist
my $contrib_extrasource = {
'cube' => [ 'cubescan.l', 'cubeparse.y' ],
'seg' => [ 'segscan.l', 'segparse.y' ], };
-my @contrib_excludes = ('pgcrypto', 'intagg', 'sepgsql');
+my @contrib_excludes = ('pgcrypto', 'intagg', 'sepgsql', 'bdr');
sub mkvcbuild
{
my $mf = Project::read_file('contrib/pgcrypto/Makefile');
GenerateContribSqlFiles('pgcrypto', $mf);
+	# bdr is excluded from the regular contrib loop above; add its projects manually
+ my $bdr_output = $solution->AddProject('bdr_output', 'dll', 'misc');
+ $bdr_output->AddFiles('contrib\bdr', 'bdr_output.c');
+ $bdr_output->AddReference($postgres);
+ $bdr_output->AddLibrary('wsock32.lib');
+
+ my $bdr_apply = $solution->AddProject('bdr_apply', 'dll', 'misc');
+ $bdr_apply->AddFiles('contrib\bdr', 'bdr.c', 'bdr_apply.c',
+ 'bdr_count.c', 'bdr_seq.c');
+ $bdr_apply->AddReference($postgres);
+ $bdr_apply->AddLibrary('wsock32.lib');
+ $bdr_apply->AddIncludeDir('src\interfaces\libpq');
+ $bdr_apply->AddReference($libpq);
+
my $D;
opendir($D, 'contrib') || croak "Could not opendir on contrib!\n";
while (my $d = readdir($D))