bdr: bdr plugin
authorAndres Freund <[email protected]>
Fri, 17 May 2013 16:15:37 +0000 (18:15 +0200)
committerAndres Freund <[email protected]>
Thu, 3 Jul 2014 15:55:18 +0000 (17:55 +0200)
Andres Freund, Alvaro Herrera, Abhijit Menon-Sen

17 files changed:
contrib/Makefile
contrib/bdr/Makefile [new file with mode: 0644]
contrib/bdr/bdr--0.5.sql [new file with mode: 0644]
contrib/bdr/bdr.c [new file with mode: 0644]
contrib/bdr/bdr.conf [new file with mode: 0644]
contrib/bdr/bdr.control [new file with mode: 0644]
contrib/bdr/bdr.h [new file with mode: 0644]
contrib/bdr/bdr_apply.c [new file with mode: 0644]
contrib/bdr/bdr_count.c [new file with mode: 0644]
contrib/bdr/bdr_count.sql [new file with mode: 0644]
contrib/bdr/bdr_output.c [new file with mode: 0644]
contrib/bdr/bdr_seq.c [new file with mode: 0644]
contrib/bdr/expected/extension.out [new file with mode: 0644]
contrib/bdr/output.mk [new file with mode: 0644]
contrib/bdr/sql/extension.sql [new file with mode: 0644]
contrib/bdr/worker.mk [new file with mode: 0644]
src/tools/msvc/Mkvcbuild.pm

index b37d0dd2c318f722628caef69c510b2ef99d9044..3db5b1ebe9a9353578099153b29986c3411f2d7b 100644 (file)
@@ -8,6 +8,7 @@ SUBDIRS = \
        adminpack   \
        auth_delay  \
        auto_explain    \
+       bdr     \
        btree_gin   \
        btree_gist  \
        chkpass     \
diff --git a/contrib/bdr/Makefile b/contrib/bdr/Makefile
new file mode 100644 (file)
index 0000000..87b79d0
--- /dev/null
@@ -0,0 +1,17 @@
+# contrib/worker_spi/Makefile
+
+subdir = contrib/bdr
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+
+%%: all
+
+all:
+   $(MAKE) -f $(top_srcdir)/contrib/bdr/output.mk $(MAKECMDGOALS)
+   $(MAKE) -f $(top_srcdir)/contrib/bdr/worker.mk $(MAKECMDGOALS)
+
+# phony target...
+check: all
+
+.PHONY: all
diff --git a/contrib/bdr/bdr--0.5.sql b/contrib/bdr/bdr--0.5.sql
new file mode 100644 (file)
index 0000000..0128eb5
--- /dev/null
@@ -0,0 +1,148 @@
+--\echo Use "CREATE EXTENSION bdr" to load this file. \quit
+
+--CREATE ROLE bdr NOLOGIN SUPERUSER;
+--SET ROLE bdr;
+
+CREATE SCHEMA bdr;
+GRANT USAGE ON SCHEMA bdr TO public;
+
+SET LOCAL search_path = bdr;
+
+CREATE FUNCTION pg_stat_get_bdr(
+    OUT rep_node_id oid,
+    OUT rilocalid oid,
+    OUT riremoteid text,
+    OUT nr_commit int8,
+    OUT nr_rollback int8,
+    OUT nr_insert int8,
+    OUT nr_insert_conflict int8,
+    OUT nr_update int8,
+    OUT nr_update_conflict int8,
+    OUT nr_delete int8,
+    OUT nr_delete_conflict int8,
+    OUT nr_disconnect int8
+)
+RETURNS SETOF record
+LANGUAGE C
+AS 'MODULE_PATHNAME';
+
+REVOKE ALL ON FUNCTION pg_stat_get_bdr() FROM PUBLIC;
+
+CREATE VIEW pg_stat_bdr AS SELECT * FROM pg_stat_get_bdr();
+
+CREATE TABLE bdr_sequence_values
+(
+    owning_sysid text NOT NULL,
+    owning_tlid oid NOT NULL,
+    owning_dboid oid NOT NULL,
+    owning_riname text NOT NULL,
+
+    seqschema text NOT NULL,
+    seqname text NOT NULL,
+    seqrange int8range NOT NULL,
+
+    -- could not acquire chunk
+    failed bool NOT NULL DEFAULT false,
+
+    -- voting successfull
+    confirmed bool NOT NULL,
+
+    -- empty, not referenced
+    emptied bool NOT NULL CHECK(NOT emptied OR confirmed),
+
+    -- used in sequence
+    in_use bool NOT NULL CHECK(NOT in_use OR confirmed),
+
+    EXCLUDE USING gist(seqschema WITH =, seqname WITH =, seqrange WITH &&) WHERE (confirmed),
+    PRIMARY KEY(owning_sysid, owning_tlid, owning_dboid, owning_riname, seqschema, seqname, seqrange)
+);
+SELECT pg_catalog.pg_extension_config_dump('bdr_sequence_values', '');
+
+REVOKE ALL ON TABLE bdr_sequence_values FROM PUBLIC;
+
+CREATE INDEX bdr_sequence_values_chunks ON bdr_sequence_values(seqschema, seqname, seqrange);
+CREATE INDEX bdr_sequence_values_newchunk ON bdr_sequence_values(seqschema, seqname, upper(seqrange));
+
+CREATE TABLE bdr_sequence_elections
+(
+    owning_sysid text NOT NULL,
+    owning_tlid oid NOT NULL,
+    owning_dboid oid NOT NULL,
+    owning_riname text NOT NULL,
+    owning_election_id bigint NOT NULL,
+
+    seqschema text NOT NULL,
+    seqname text NOT NULL,
+    seqrange int8range NOT NULL,
+
+    /* XXX id */
+
+    vote_type text NOT NULL,
+
+    open bool NOT NULL,
+    success bool NOT NULL DEFAULT false,
+
+    PRIMARY KEY(owning_sysid, owning_tlid, owning_dboid, owning_riname, seqschema, seqname, seqrange)
+);
+SELECT pg_catalog.pg_extension_config_dump('bdr_sequence_elections', '');
+REVOKE ALL ON TABLE bdr_sequence_values FROM PUBLIC;
+
+
+CREATE TABLE bdr_votes
+(
+    vote_sysid text NOT NULL,
+    vote_tlid oid NOT NULL,
+    vote_dboid oid NOT NULL,
+    vote_riname text NOT NULL,
+    vote_election_id bigint NOT NULL,
+
+    voter_sysid text NOT NULL,
+    voter_tlid oid NOT NULL,
+    voter_dboid bigint NOT NULL,
+    voter_riname text NOT NULL,
+
+    vote bool NOT NULL,
+    reason text CHECK (reason IS NULL OR vote = false),
+    UNIQUE(vote_sysid, vote_tlid, vote_dboid, vote_riname, vote_election_id, voter_sysid, voter_tlid, voter_dboid, voter_riname)
+);
+SELECT pg_catalog.pg_extension_config_dump('bdr_votes', '');
+REVOKE ALL ON TABLE bdr_votes FROM PUBLIC;
+
+CREATE OR REPLACE FUNCTION bdr_sequence_alloc(INTERNAL)
+ RETURNS INTERNAL
+ LANGUAGE C
+ STABLE STRICT
+AS 'MODULE_PATHNAME'
+;
+
+CREATE OR REPLACE FUNCTION bdr_sequence_setval(INTERNAL)
+ RETURNS INTERNAL
+ LANGUAGE C
+ STABLE STRICT
+AS 'MODULE_PATHNAME'
+;
+
+CREATE OR REPLACE FUNCTION bdr_sequence_options(INTERNAL)
+ RETURNS INTERNAL
+ LANGUAGE C
+ STABLE STRICT
+AS 'MODULE_PATHNAME'
+;
+
+-- not tracked yet, can we trick pg_depend instead?
+DELETE FROM pg_seqam WHERE seqamname = 'bdr';
+
+INSERT INTO pg_seqam(
+    seqamname,
+    seqamalloc,
+    seqamsetval,
+    seqamoptions
+)
+VALUES (
+    'bdr',
+    'bdr_sequence_alloc',
+    'bdr_sequence_setval',
+    'bdr_sequence_options'
+);
+
+RESET search_path;
diff --git a/contrib/bdr/bdr.c b/contrib/bdr/bdr.c
new file mode 100644 (file)
index 0000000..c990dab
--- /dev/null
@@ -0,0 +1,817 @@
+/* -------------------------------------------------------------------------
+ *
+ * bdr.c
+ *     Replication!!!
+ *
+ * Replication???
+ *
+ * Copyright (C) 2012-2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *     contrib/bdr/bdr.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "bdr.h"
+
+/* These are always necessary for a bgworker */
+#include "miscadmin.h"
+#include "postmaster/bgworker.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+
+/* these headers are used by this particular worker's code */
+#include "pgstat.h"
+
+#include "access/committs.h"
+#include "access/xact.h"
+#include "catalog/pg_index.h"
+#include "lib/stringinfo.h"
+#include "replication/replication_identifier.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/timestamp.h"
+
+/* sequencer */
+#include "commands/extension.h"
+
+/* apply */
+#include "libpq-fe.h"
+
+#define MAXCONNINFO        1024
+
+static bool got_sigterm = false;
+ResourceOwner bdr_saved_resowner;
+static char *connections = NULL;
+static char *bdr_synchronous_commit = NULL;
+
+BDRWorkerCon *bdr_apply_con = NULL;
+BDRSequencerCon *bdr_sequencer_con = NULL;
+
+PG_MODULE_MAGIC;
+
+void       _PG_init(void);
+
+/*
+ * Converts an int64 to network byte order.
+ */
+static void
+sendint64(int64 i, char *buf)
+{
+   uint32      n32;
+
+   /* High order half first, since we're doing MSB-first */
+   n32 = (uint32) (i >> 32);
+   n32 = htonl(n32);
+   memcpy(&buf[0], &n32, 4);
+
+   /* Now the low order half */
+   n32 = (uint32) i;
+   n32 = htonl(n32);
+   memcpy(&buf[4], &n32, 4);
+}
+
+/*
+ * Converts an int64 from network byte order to native format.
+ *
+ * FIXME: replace with pq_getmsgint64
+ */
+static int64
+recvint64(char *buf)
+{
+   int64       result;
+   uint32      h32;
+   uint32      l32;
+
+   memcpy(&h32, buf, 4);
+   memcpy(&l32, buf + 4, 4);
+   h32 = ntohl(h32);
+   l32 = ntohl(l32);
+
+   result = h32;
+   result <<= 32;
+   result |= l32;
+
+   return result;
+}
+
+/*
+ * Send a Standby Status Update message to server.
+ */
+static bool
+sendFeedback(PGconn *conn, XLogRecPtr blockpos, int64 now, bool replyRequested,
+            bool force)
+{
+   char        replybuf[1 + 8 + 8 + 8 + 8 + 1];
+   int         len = 0;
+   static XLogRecPtr lastpos = InvalidXLogRecPtr;
+
+   Assert(blockpos != InvalidXLogRecPtr);
+
+   if (!force && (blockpos <= lastpos))
+       return true;
+
+   if (blockpos < lastpos)
+       blockpos = lastpos;
+
+   replybuf[len] = 'r';
+   len += 1;
+   sendint64(blockpos, &replybuf[len]);        /* write */
+   len += 8;
+   sendint64(blockpos, &replybuf[len]);        /* flush */
+   len += 8;
+   sendint64(blockpos, &replybuf[len]);        /* apply */
+   len += 8;
+   sendint64(now, &replybuf[len]);     /* sendTime */
+   len += 8;
+   replybuf[len] = replyRequested ? 1 : 0;     /* replyRequested */
+   len += 1;
+
+   elog(LOG, "sending feedback (force %d, reply requested %d) to %X/%X",
+        force, replyRequested,
+        (uint32) (blockpos >> 32), (uint32) blockpos);
+
+   lastpos = blockpos;
+
+   if (PQputCopyData(conn, replybuf, len) <= 0 || PQflush(conn))
+   {
+       elog(ERROR, "could not send feedback packet: %s",
+            PQerrorMessage(conn));
+       return false;
+   }
+
+   return true;
+}
+
+static void
+bdr_sigterm(SIGNAL_ARGS)
+{
+   int         save_errno = errno;
+
+   got_sigterm = true;
+   if (MyProc)
+       SetLatch(&MyProc->procLatch);
+
+   errno = save_errno;
+}
+
+static void
+bdr_sighup(SIGNAL_ARGS)
+{
+   elog(LOG, "got sighup!");
+   if (MyProc)
+       SetLatch(&MyProc->procLatch);
+}
+
+static void
+process_remote_action(char *data, size_t r)
+{
+   char        action;
+
+   action = data[0];
+   data += 1;
+   r--;
+
+   switch (action)
+   {
+           /* BEGIN */
+       case 'B':
+           process_remote_begin(data, r);
+           break;
+           /* COMMIT */
+       case 'C':
+           process_remote_commit(data, r);
+           break;
+           /* INSERT */
+       case 'I':
+           process_remote_insert(data, r);
+           break;
+           /* UPDATE */
+       case 'U':
+           process_remote_update(data, r);
+           break;
+           /* DELETE */
+       case 'D':
+           process_remote_delete(data, r);
+           break;
+       default:
+           elog(ERROR, "unknown action of type %c", action);
+   }
+}
+
+static void
+bdr_apply_main(Datum main_arg)
+{
+   PGconn     *streamConn;
+   PGresult   *res;
+   int         fd;
+   char       *remote_sysid;
+   uint64      remote_sysid_i;
+   char       *remote_tlid;
+   TimeLineID  remote_tlid_i;
+
+#ifdef NOT_USED
+   char       *remote_dbname;
+#endif
+   char       *remote_dboid;
+   Oid         remote_dboid_i;
+   char        local_sysid[32];
+   char        remote_ident[256];
+   char        query[256];
+   char        conninfo_repl[MAXCONNINFO + 75];
+   XLogRecPtr  last_received = InvalidXLogRecPtr;
+   char       *sqlstate;
+   NameData    replication_name;
+   RepNodeId   replication_identifier;
+   XLogRecPtr  start_from;
+   NameData    slot_name;
+
+   bdr_apply_con = (BDRWorkerCon *) DatumGetPointer(main_arg);
+
+   NameStr(replication_name)[0] = '\0';
+
+   /* Establish signal handlers before unblocking signals. */
+   pqsignal(SIGHUP, bdr_sighup);
+   pqsignal(SIGTERM, bdr_sigterm);
+
+   /* We're now ready to receive signals */
+   BackgroundWorkerUnblockSignals();
+
+   /* Connect to our database */
+   BackgroundWorkerInitializeConnection(bdr_apply_con->dbname, NULL);
+
+   /* always work in our own schema */
+   SetConfigOption("search_path", "bdr, pg_catalog",
+                   PGC_BACKEND, PGC_S_OVERRIDE);
+
+   /* setup synchronous commit according to the user's wishes */
+   if (bdr_synchronous_commit != NULL)
+       SetConfigOption("synchronous_commit", bdr_synchronous_commit,
+                       PGC_BACKEND, PGC_S_OVERRIDE);   /* other context? */
+
+   CurrentResourceOwner = ResourceOwnerCreate(NULL, "bdr apply top-level resource owner");
+   bdr_saved_resowner = CurrentResourceOwner;
+
+   snprintf(conninfo_repl, sizeof(conninfo_repl),
+            "%s replication=database fallback_application_name=bdr",
+            bdr_apply_con->dsn);
+
+   elog(LOG, "%s initialized on %s, remote %s",
+        MyBgworkerEntry->bgw_name, bdr_apply_con->dbname, conninfo_repl);
+
+   streamConn = PQconnectdb(conninfo_repl);
+   if (PQstatus(streamConn) != CONNECTION_OK)
+   {
+       ereport(FATAL,
+               (errmsg("could not connect to the primary server: %s",
+                       PQerrorMessage(streamConn))));
+   }
+
+
+   res = PQexec(streamConn, "IDENTIFY_SYSTEM");
+   if (PQresultStatus(res) != PGRES_TUPLES_OK)
+   {
+       elog(FATAL, "could not send replication command \"%s\": %s",
+            "IDENTIFY_SYSTEM", PQerrorMessage(streamConn));
+   }
+   if (PQntuples(res) != 1 || PQnfields(res) != 5)
+   {
+       elog(FATAL, "could not identify system: got %d rows and %d fields, expected %d rows and %d fields\n",
+            PQntuples(res), PQnfields(res), 1, 5);
+   }
+
+   remote_sysid = PQgetvalue(res, 0, 0);
+   remote_tlid = PQgetvalue(res, 0, 1);
+#ifdef NOT_USED
+   remote_dbname = PQgetvalue(res, 0, 3);
+#endif
+   remote_dboid = PQgetvalue(res, 0, 4);
+
+   if (sscanf(remote_sysid, UINT64_FORMAT, &remote_sysid_i) != 1)
+       elog(ERROR, "could not parse remote sysid %s", remote_sysid);
+
+   if (sscanf(remote_tlid, "%u", &remote_tlid_i) != 1)
+       elog(ERROR, "could not parse remote tlid %s", remote_tlid);
+
+   if (sscanf(remote_dboid, "%u", &remote_dboid_i) != 1)
+       elog(ERROR, "could not parse remote database OID %s", remote_dboid);
+
+   snprintf(local_sysid, sizeof(local_sysid), UINT64_FORMAT,
+            GetSystemIdentifier());
+
+   if (strcmp(remote_sysid, local_sysid) == 0)
+   {
+       ereport(FATAL,
+               (errmsg("system identifiers must differ between the nodes"),
+                errdetail("Both system identifiers are %s.", remote_sysid)));
+   }
+   else
+       elog(LOG, "local sysid %s, remote: %s",
+            local_sysid, remote_sysid);
+
+   /*
+    * build slot name.
+    *
+    * FIXME: This might truncate the identifier if replication_name is
+    * somewhat longer...
+    */
+   snprintf(NameStr(slot_name), NAMEDATALEN, "bdr_%u_%s_%u_%u__%s",
+            remote_dboid_i, local_sysid, ThisTimeLineID,
+            MyDatabaseId, NameStr(replication_name));
+   NameStr(slot_name)[NAMEDATALEN - 1] = '\0';
+
+   /*
+    * Build replication identifier.
+    */
+   snprintf(remote_ident, sizeof(remote_ident), "bdr_"UINT64_FORMAT"_%u_%u_%u_%s",
+            remote_sysid_i, remote_tlid_i, remote_dboid_i, MyDatabaseId, NameStr(replication_name));
+
+   StartTransactionCommand();
+
+   replication_identifier = GetReplicationIdentifier(remote_ident, true);
+
+   CommitTransactionCommand();
+
+   /* no parts of IDENTIFY_SYSTEM's response needed anymore */
+   PQclear(res);
+
+   if (OidIsValid(replication_identifier))
+       elog(LOG, "found valid replication identifier %u", replication_identifier);
+   /* create local replication identifier and a remote slot */
+   else
+   {
+       elog(LOG, "lookup failed, create new identifier");
+       /* doing this really safely would require 2pc... */
+       StartTransactionCommand();
+
+       /* we want the new identifier on stable storage immediately */
+       ForceSyncCommit();
+
+       /* acquire remote decoding slot */
+       snprintf(query, sizeof(query), "CREATE_REPLICATION_SLOT \"%s\" LOGICAL %s",
+                NameStr(slot_name), "bdr_output");
+       res = PQexec(streamConn, query);
+
+       if (PQresultStatus(res) != PGRES_TUPLES_OK)
+       {
+           elog(FATAL, "could not send replication command \"%s\": status %s: %s\n",
+                query, PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+       }
+       PQclear(res);
+
+       /* acquire new local identifier, but don't commit */
+       replication_identifier = CreateReplicationIdentifier(remote_ident);
+
+       /* now commit local identifier */
+       CommitTransactionCommand();
+       CurrentResourceOwner = bdr_saved_resowner;
+       elog(LOG, "created replication identifier %u", replication_identifier);
+
+       /*
+        * FIXME: set current replication progress from upstream IFF it has not
+        * been set yet. Another worker might have cloned from upstream and set
+        * the progress for all nodes.
+        */
+   }
+
+   bdr_apply_con->origin_id = replication_identifier;
+   bdr_apply_con->sysid = remote_sysid_i;
+   bdr_apply_con->timeline = remote_tlid_i;
+
+   /* initialize stat subsystem, our id won't change further */
+   bdr_count_set_current_node(replication_identifier);
+
+   /*
+    * tell replication_identifier.c about our identifier so it can cache the
+    * search in shared memory.
+    */
+   SetupCachedReplicationIdentifier(replication_identifier);
+
+   /*
+    * Check whether we already replayed something so we don't replay it
+    * multiple times.
+    */
+   start_from = RemoteCommitFromCachedReplicationIdentifier();
+
+   elog(LOG, "starting up replication at %u from %X/%X",
+        replication_identifier,
+        (uint32) (start_from >> 32), (uint32) start_from);
+
+   snprintf(query, sizeof(query), "START_REPLICATION SLOT \"%s\" LOGICAL %X/%X",
+      NameStr(slot_name), (uint32) (start_from >> 32), (uint32) start_from);
+   res = PQexec(streamConn, query);
+
+   sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
+
+   if (PQresultStatus(res) != PGRES_COPY_BOTH)
+   {
+       elog(FATAL, "could not send replication command \"%s\": %s\n, sqlstate: %s",
+            query, PQresultErrorMessage(res), sqlstate);
+   }
+   PQclear(res);
+
+   fd = PQsocket(streamConn);
+
+   replication_origin_id = replication_identifier;
+
+   while (!got_sigterm)
+   {
+       /* int       ret; */
+       int         rc;
+       int         r;
+       char       *copybuf = NULL;
+
+       /*
+        * Background workers mustn't call usleep() or any direct equivalent:
+        * instead, they may wait on their process latch, which sleeps as
+        * necessary, but is awakened if postmaster dies.  That way the
+        * background process goes away immediately in an emergency.
+        */
+       rc = WaitLatchOrSocket(&MyProc->procLatch,
+                              WL_SOCKET_READABLE | WL_LATCH_SET |
+                              WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                              fd, 1000L);
+
+       ResetLatch(&MyProc->procLatch);
+
+       /* emergency bailout if postmaster has died */
+       if (rc & WL_POSTMASTER_DEATH)
+           proc_exit(1);
+
+       if (PQstatus(streamConn) == CONNECTION_BAD)
+       {
+           bdr_count_disconnect();
+           elog(ERROR, "connection to other side has died");
+       }
+
+       if (rc & WL_SOCKET_READABLE)
+           PQconsumeInput(streamConn);
+
+       for (;;)
+       {
+           if (got_sigterm)
+               break;
+
+           r = PQgetCopyData(streamConn, &copybuf, 1);
+
+           if (r == -1)
+           {
+               elog(LOG, "data stream ended");
+               return;
+           }
+           else if (r == -2)
+           {
+               elog(ERROR, "could not read COPY data: %s",
+                    PQerrorMessage(streamConn));
+           }
+           else if (r < 0)
+               elog(ERROR, "invalid COPY status %d", r);
+           else if (r == 0)
+           {
+               /* need to wait for new data */
+               break;
+           }
+           else
+           {
+               if (copybuf[0] == 'w')
+               {
+                   int         hdr_len = 0;
+                   char       *data;
+                   XLogRecPtr  start_lsn;
+                   XLogRecPtr  end_lsn;
+
+                   hdr_len = 1;    /* msgtype 'w' */
+
+                   start_lsn = recvint64(&copybuf[hdr_len]);
+
+                   hdr_len += 8;       /* dataStart */
+
+                   end_lsn = recvint64(&copybuf[hdr_len]);
+
+                   hdr_len += 8;       /* walEnd */
+                   hdr_len += 8;       /* sendTime */
+
+                   if (last_received < start_lsn)
+                       last_received = start_lsn;
+
+                   if (last_received < end_lsn)
+                       last_received = end_lsn;
+
+                   data = copybuf + hdr_len;
+
+                   process_remote_action(data, r);
+               }
+               else if (copybuf[0] == 'k')
+               {
+                   XLogRecPtr  temp;
+
+                   temp = recvint64(&copybuf[1]);
+
+                   sendFeedback(streamConn, temp,
+                                GetCurrentTimestamp(), false, true);
+               }
+               /* other message types are purposefully ignored */
+           }
+       }
+
+       /* confirm all writes at once */
+       /*
+        * FIXME: we should only do that after an xlog flush... Yuck.
+        */
+       if (last_received != InvalidXLogRecPtr)
+           sendFeedback(streamConn, last_received,
+                        GetCurrentTimestamp(), false, false);
+   }
+
+   proc_exit(0);
+}
+
+static void
+bdr_sequencer_main(Datum main_arg)
+{
+   int rc;
+
+   bdr_sequencer_con = (BDRSequencerCon *) DatumGetPointer(main_arg);
+
+   /* Establish signal handlers before unblocking signals. */
+   pqsignal(SIGHUP, bdr_sighup);
+   pqsignal(SIGTERM, bdr_sigterm);
+
+   /* We're now ready to receive signals */
+   BackgroundWorkerUnblockSignals();
+
+   /* Connect to our database */
+   BackgroundWorkerInitializeConnection(bdr_sequencer_con->dbname, NULL);
+
+   /* always work in our own schema */
+   SetConfigOption("search_path", "bdr, pg_catalog",
+                   PGC_BACKEND, PGC_S_OVERRIDE);
+
+   CurrentResourceOwner = ResourceOwnerCreate(NULL, "bdr seq top-level resource owner");
+   bdr_saved_resowner = CurrentResourceOwner;
+
+   elog(WARNING, "starting sequencer on db \"%s\"", bdr_sequencer_con->dbname);
+
+   /* make sure BDR extension exists */
+   bdr_sequencer_init();
+
+   while (!got_sigterm)
+   {
+       /*
+        * Background workers mustn't call usleep() or any direct equivalent:
+        * instead, they may wait on their process latch, which sleeps as
+        * necessary, but is awakened if postmaster dies.  That way the
+        * background process goes away immediately in an emergency.
+        */
+       rc = WaitLatch(&MyProc->procLatch,
+                      WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                      10000L);
+
+       ResetLatch(&MyProc->procLatch);
+
+       /* emergency bailout if postmaster has died */
+       if (rc & WL_POSTMASTER_DEATH)
+           proc_exit(1);
+
+       /* check whether we need to vote */
+       bdr_sequencer_vote();
+
+       /* check whether any of our elections needs to be tallied */
+       bdr_sequencer_tally();
+
+       /* check all bdr sequences for used up chunks */
+       bdr_sequencer_fill_sequences();
+
+       /* check whether we need to start new elections */
+       bdr_sequencer_start_elections();
+       pgstat_report_activity(STATE_IDLE, NULL);
+   }
+
+   proc_exit(0);
+}
+
+/*
+ * Entrypoint of this module.
+ */
+void
+_PG_init(void)
+{
+   BackgroundWorker apply_worker;
+   BackgroundWorker sequencer_worker;
+   List       *cons;
+   ListCell   *c;
+   MemoryContext old_context;
+   Size        nregistered = 0;
+
+   char      **used_databases;
+   Size        num_used_databases = 0;
+
+   size_t      off;
+   bool        found;
+
+   if (!process_shared_preload_libraries_in_progress)
+       elog(ERROR, "bdr can only be loaded via shared_preload_libraries");
+
+   if (!commit_ts_enabled)
+       ereport(ERROR,
+               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                errmsg("bdr requires \"track_commit_timestamp\" to be enabled")));
+
+   /* guc's et al need to survive this */
+   old_context = MemoryContextSwitchTo(TopMemoryContext);
+
+   DefineCustomStringVariable("bdr.connections",
+                              "List of connections",
+                              NULL,
+                              &connections,
+                              NULL, PGC_POSTMASTER,
+                              GUC_LIST_INPUT | GUC_LIST_QUOTE,
+                              NULL, NULL, NULL);
+
+   /* XXX: make it changeable at SIGHUP? */
+   DefineCustomStringVariable("bdr.synchronous_commit",
+                              "bdr specific synchronous commit value",
+                              NULL,
+                              &bdr_synchronous_commit,
+                              NULL, PGC_POSTMASTER,
+                              0,
+                              NULL, NULL, NULL);
+
+   /* if nothing is configured, we're done */
+   if (connections == NULL)
+       goto out;
+
+   if (!SplitIdentifierString(connections, ',', &cons))
+   {
+       /* syntax error in list */
+       ereport(FATAL,
+               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                errmsg("invalid list syntax for \"bdr.connections\"")));
+   }
+
+   used_databases = malloc(sizeof(char *) * list_length(cons));
+
+   /* Common apply worker values */
+   apply_worker.bgw_flags = BGWORKER_SHMEM_ACCESS |
+       BGWORKER_BACKEND_DATABASE_CONNECTION;
+   apply_worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
+   apply_worker.bgw_main = bdr_apply_main;
+   apply_worker.bgw_restart_time = 5;
+
+   /* Common sequence worker values */
+   sequencer_worker.bgw_flags = BGWORKER_SHMEM_ACCESS |
+       BGWORKER_BACKEND_DATABASE_CONNECTION;
+   sequencer_worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
+   sequencer_worker.bgw_main = bdr_sequencer_main;
+   sequencer_worker.bgw_restart_time = 5;
+
+   foreach(c, cons)
+   {
+       const char *name = (char *) lfirst(c);
+       char       *errmsg = NULL;
+       PQconninfoOption *options;
+       PQconninfoOption *cur_option;
+
+       /* don't free, referenced by the guc machinery! */
+       char       *optname_dsn = palloc(strlen(name) + 30);
+       char       *optname_delay = palloc(strlen(name) + 30);
+       BDRWorkerCon *con;
+
+       found = false;
+
+       con = palloc(sizeof(BDRWorkerCon));
+       con->dsn = (char *) lfirst(c);
+       con->name = pstrdup(name);
+       con->apply_delay = 0;
+
+       sprintf(optname_dsn, "bdr.%s.dsn", name);
+       DefineCustomStringVariable(optname_dsn,
+                                  optname_dsn,
+                                  NULL,
+                                  &con->dsn,
+                                  NULL, PGC_POSTMASTER,
+                                  GUC_NOT_IN_SAMPLE,
+                                  NULL, NULL, NULL);
+
+       sprintf(optname_delay, "bdr.%s.apply_delay", name);
+       DefineCustomIntVariable(optname_delay,
+                               optname_delay,
+                               NULL,
+                               &con->apply_delay,
+                               0, 0, INT_MAX,
+                               PGC_SIGHUP,
+                               GUC_UNIT_MS,
+                               NULL, NULL, NULL);
+
+       if (!con->dsn)
+       {
+           elog(WARNING, "no connection information for %s", name);
+           continue;
+       }
+
+       elog(LOG, "bgworkers, connection: %s", con->dsn);
+
+       options = PQconninfoParse(con->dsn, &errmsg);
+       if (errmsg != NULL)
+       {
+           char       *str = pstrdup(errmsg);
+
+           PQfreemem(errmsg);
+           elog(ERROR, "msg: %s", str);
+       }
+
+       cur_option = options;
+       while (cur_option->keyword != NULL)
+       {
+           if (strcmp(cur_option->keyword, "dbname") == 0)
+           {
+               if (cur_option->val == NULL)
+                   elog(ERROR, "no dbname set");
+
+               con->dbname = pstrdup(cur_option->val);
+           }
+
+           if (cur_option->val != NULL)
+           {
+               elog(LOG, "option: %s, val: %s",
+                    cur_option->keyword, cur_option->val);
+           }
+           cur_option++;
+       }
+
+
+       snprintf(apply_worker.bgw_name, BGW_MAXLEN,
+                "bdr apply: %s", name);
+       apply_worker.bgw_main_arg = PointerGetDatum(con);
+
+       RegisterBackgroundWorker(&apply_worker);
+       nregistered++;
+
+       /* keep track of the databases used */
+       /* check whether we already have a connection in this db */
+       for (off = 0; off < num_used_databases; off++)
+       {
+           if (strcmp(con->dbname , used_databases[off]) == 0)
+           {
+               found = true;
+               break;
+           }
+       }
+
+       if (!found)
+       {
+           used_databases[num_used_databases++] = pstrdup(con->dbname);
+       }
+
+       /* cleanup */
+       PQconninfoFree(options);
+   }
+
+   Assert(num_used_databases <= nregistered);
+
+   /*
+    * start sequence coordination process if necessary. One process per
+    * database, *not* one per configured connection.
+    */
+   for (off = 0; off < num_used_databases; off++)
+   {
+       const char *name = used_databases[off];
+       BDRSequencerCon *con;
+
+       con = palloc(sizeof(BDRSequencerCon));
+       con->dbname = pstrdup(name);
+       con->num_nodes = nregistered;
+       con->slot = off;
+
+       elog(LOG, "starting seq on %s", name);
+
+       snprintf(sequencer_worker.bgw_name, BGW_MAXLEN,
+                "bdr sequencer: %s", name);
+       sequencer_worker.bgw_main_arg = PointerGetDatum(con);
+
+       RegisterBackgroundWorker(&sequencer_worker);
+
+   }
+
+   EmitWarningsOnPlaceholders("bdr");
+out:
+
+   /*
+    * initialize other modules that need shared memory
+    *
+    * Do so even if we haven't any remote nodes setup, the shared memory
+    * might still be needed for some sql callable functions or such.
+    */
+
+   /* register a slot for every remote node */
+   bdr_count_shmem_init(nregistered);
+   bdr_sequencer_shmem_init(nregistered, num_used_databases);
+
+   MemoryContextSwitchTo(old_context);
+}
diff --git a/contrib/bdr/bdr.conf b/contrib/bdr/bdr.conf
new file mode 100644 (file)
index 0000000..eb8bc21
--- /dev/null
@@ -0,0 +1,4 @@
+shared_preload_libraries = 'bdr'
+wal_level = logical
+max_logical_slots = 4
+track_commit_timestamp = on
diff --git a/contrib/bdr/bdr.control b/contrib/bdr/bdr.control
new file mode 100644 (file)
index 0000000..bd869a0
--- /dev/null
@@ -0,0 +1,7 @@
+# bdr extension
+comment = 'bdr support functions'
+default_version = '0.5'
+module_pathname = '$libdir/bdr'
+relocatable = false
+requires = btree_gist
+schema = pg_catalog
diff --git a/contrib/bdr/bdr.h b/contrib/bdr/bdr.h
new file mode 100644 (file)
index 0000000..595d1b4
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * bdr.h
+ *
+ * BiDirectionalReplication
+ *
+ * Copyright (c) 2012-2013, PostgreSQL Global Development Group
+ *
+ * contrib/bdr/bdr.h
+ */
+#ifndef BDR_H
+#define BDR_H
+
+#include "replication/logical.h"
+#include "access/xlogdefs.h"
+#include "utils/resowner.h"
+
+
+typedef struct BDRWorkerCon
+{
+   /* name specified in configuration */
+   char *name;
+
+   /* local & remote database name */
+   char *dbname;
+   /* dsn to connect to the remote database */
+   char *dsn;
+   /* how much do we want to delay apply, in ms */
+   int apply_delay;
+
+   RepNodeId origin_id;
+   uint64 sysid;
+   TimeLineID timeline;
+} BDRWorkerCon;
+
+typedef struct BDRSequencerCon
+{
+   /* local database name */
+   char *dbname;
+
+   size_t slot;
+
+   /* yuck */
+   size_t num_nodes;
+} BDRSequencerCon;
+
+extern ResourceOwner bdr_saved_resowner;
+extern BDRWorkerCon *bdr_apply_con;
+extern BDRSequencerCon *bdr_sequencer_con;
+
+/* apply support */
+extern void process_remote_begin(char *data, size_t r);
+extern void process_remote_commit(char *data, size_t r);
+extern void process_remote_insert(char *data, size_t r);
+extern void process_remote_update(char *data, size_t r);
+extern void process_remote_delete(char *data, size_t r);
+
+/* sequence support */
+extern void bdr_sequencer_shmem_init(int nnodes, int sequencers);
+extern void bdr_sequencer_init(void);
+extern void bdr_sequencer_vote(void);
+extern void bdr_sequencer_tally(void);
+extern void bdr_sequencer_start_elections(void);
+extern void bdr_sequencer_fill_sequences(void);
+
+extern void bdr_sequencer_wakeup(void);
+
+extern void bdr_sequence_alloc(PG_FUNCTION_ARGS);
+extern void bdr_sequence_setval(PG_FUNCTION_ARGS);
+extern Datum bdr_sequence_options(PG_FUNCTION_ARGS);
+
+/* statistic functions */
+extern void bdr_count_shmem_init(size_t nnodes);
+extern void bdr_count_set_current_node(RepNodeId node_id);
+extern void bdr_count_commit(void);
+extern void bdr_count_rollback(void);
+extern void bdr_count_insert(void);
+extern void bdr_count_insert_conflict(void);
+extern void bdr_count_update(void);
+extern void bdr_count_update_conflict(void);
+extern void bdr_count_delete(void);
+extern void bdr_count_delete_conflict(void);
+extern void bdr_count_disconnect(void);
+
+
+#endif /* BDR_H */
diff --git a/contrib/bdr/bdr_apply.c b/contrib/bdr/bdr_apply.c
new file mode 100644 (file)
index 0000000..b7ececf
--- /dev/null
@@ -0,0 +1,910 @@
+/* -------------------------------------------------------------------------
+ *
+ * bdr_apply.c
+ *     Replication!!!
+ *
+ * Replication???
+ *
+ * Copyright (C) 2012-2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *     contrib/bdr/bdr_apply.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "bdr.h"
+
+#include "pgstat.h"
+
+#include "access/committs.h"
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "access/sysattr.h"
+#include "access/xact.h"
+
+#include "catalog/namespace.h"
+#include "catalog/pg_type.h"
+
+#include "executor/spi.h"
+#include "executor/executor.h"
+
+#include "parser/parse_relation.h"
+
+#include "replication/logical.h"
+#include "replication/replication_identifier.h"
+
+#include "storage/bufmgr.h"
+
+#include "utils/builtins.h"
+#include "utils/datetime.h"
+#include "utils/lsyscache.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+#include "utils/tqual.h"
+
+static void build_scan_key(ScanKey skey, Relation rel, Relation idx_rel, HeapTuple key);
+static bool find_pkey_tuple(ScanKey skey, Relation rel, Relation idx_rel, ItemPointer tid, bool lock);
+static void UserTableUpdateIndexes(Relation rel, HeapTuple tuple);
+static char *read_tuple(char *data, size_t len, HeapTuple tuple, Oid *reloid);
+static void tuple_to_stringinfo(StringInfo s, TupleDesc tupdesc, HeapTuple tuple);
+
+static void check_sequencer_wakeup(Relation rel);
+
+bool request_sequencer_wakeup = false;
+
+void
+process_remote_begin(char *data, size_t r)
+{
+   XLogRecPtr *origlsn;
+   TimestampTz *committime;
+   TimestampTz current;
+   char        statbuf[100];
+   Assert(bdr_apply_con != NULL);
+
+   origlsn = (XLogRecPtr *) data;
+   data += sizeof(XLogRecPtr);
+
+   committime = (TimestampTz *) data;
+   data += sizeof(TimestampTz);
+
+   /* setup state for commit and conflict detection */
+   replication_origin_lsn = *origlsn;
+   replication_origin_timestamp = *committime;
+
+   snprintf(statbuf, sizeof(statbuf),
+           "bdr_apply: BEGIN origin(source, orig_lsn, timestamp): %s, %X/%X, %s",
+            bdr_apply_con->name,
+           (uint32) (*origlsn >> 32), (uint32) *origlsn,
+           timestamptz_to_str(*committime));
+
+   elog(LOG, "%s", statbuf);
+
+   pgstat_report_activity(STATE_RUNNING, statbuf);
+
+   /* don't want the overhead otherwise */
+   if (bdr_apply_con->apply_delay > 0)
+   {
+       current = GetCurrentTimestamp();
+#ifndef HAVE_INT64_TIMESTAMP
+#error "we require integer timestamps"
+#endif
+       /* ensure no weirdness due to clock drift */
+       if (current > replication_origin_timestamp)
+       {
+           long        sec;
+           int         usec;
+
+           current = TimestampTzPlusMilliseconds(current, -bdr_apply_con->apply_delay);
+
+           TimestampDifference(current, replication_origin_timestamp,
+                               &sec, &usec);
+           /* FIXME: deal with overflow? */
+           pg_usleep(usec + (sec * USECS_PER_SEC));
+       }
+   }
+
+   request_sequencer_wakeup = false;
+
+   StartTransactionCommand();
+   PushActiveSnapshot(GetTransactionSnapshot());
+}
+
+void
+process_remote_commit(char *data, size_t r)
+{
+   XLogRecPtr *origlsn;
+   XLogRecPtr *end_lsn;
+   TimestampTz *committime;
+
+   origlsn = (XLogRecPtr *) data;
+   data += sizeof(XLogRecPtr);
+
+   end_lsn = (XLogRecPtr *) data;
+   data += sizeof(XLogRecPtr);
+
+   committime = (TimestampTz *) data;
+   data += sizeof(TimestampTz);
+
+   elog(LOG, "COMMIT origin(lsn, end, timestamp): %X/%X, %X/%X, %s",
+        (uint32) (*origlsn >> 32), (uint32) *origlsn,
+        (uint32) (*end_lsn >> 32), (uint32) *end_lsn,
+        timestamptz_to_str(*committime));
+
+   Assert(*origlsn == replication_origin_lsn);
+   Assert(*committime == replication_origin_timestamp);
+
+   PopActiveSnapshot();
+   CommitTransactionCommand();
+
+   pgstat_report_activity(STATE_IDLE, NULL);
+
+   AdvanceCachedReplicationIdentifier(*end_lsn, XactLastCommitEnd);
+
+   CurrentResourceOwner = bdr_saved_resowner;
+
+   bdr_count_commit();
+
+   if (request_sequencer_wakeup)
+   {
+       request_sequencer_wakeup = false;
+       bdr_sequencer_wakeup();
+   }
+}
+
+
+void
+process_remote_insert(char *data, size_t r)
+{
+#ifdef VERBOSE_INSERT
+   StringInfoData s;
+#endif
+   char        action;
+   HeapTupleData tup;
+   Oid         reloid;
+   Relation    rel;
+
+   action = data[0];
+   data++;
+
+   if (action != 'N')
+       elog(ERROR, "expected new tuple but got %c",
+            action);
+
+   data = read_tuple(data, r, &tup, &reloid);
+
+   rel = heap_open(reloid, RowExclusiveLock);
+
+   if (rel->rd_rel->relkind != RELKIND_RELATION)
+       elog(ERROR, "unexpected relkind '%c' rel \"%s\"",
+            rel->rd_rel->relkind, RelationGetRelationName(rel));
+
+   simple_heap_insert(rel, &tup);
+   UserTableUpdateIndexes(rel, &tup);
+   bdr_count_insert();
+
+   check_sequencer_wakeup(rel);
+
+   /* debug output */
+#if VERBOSE_INSERT
+   initStringInfo(&s);
+   tuple_to_stringinfo(&s, RelationGetDescr(rel), &tup);
+   elog(LOG, "INSERT: %s", s.data);
+   resetStringInfo(&s);
+#endif
+
+   heap_close(rel, NoLock);
+}
+
+static void
+fetch_sysid_via_node_id(RepNodeId node_id, uint64 *sysid, TimeLineID *tli)
+{
+   if (node_id == InvalidRepNodeId)
+   {
+       *sysid = GetSystemIdentifier();
+       *tli = ThisTimeLineID;
+   }
+   else
+   {
+       HeapTuple   node;
+       Form_pg_replication_identifier node_class;
+       char *ident;
+
+       uint64 remote_sysid;
+       Oid remote_dboid;
+       TimeLineID remote_tli;
+       Oid local_dboid;
+       NameData replication_name;
+
+       node = GetReplicationInfoByIdentifier(node_id, false);
+
+       node_class = (Form_pg_replication_identifier) GETSTRUCT(node);
+
+       ident = text_to_cstring(&node_class->riname);
+
+       if (sscanf(ident, "bdr: "UINT64_FORMAT"-%u-%u-%u:%s",
+                  &remote_sysid, &remote_tli, &remote_dboid, &local_dboid, NameStr(replication_name)) != 4)
+           elog(ERROR, "could not parse sysid: %s", ident);
+       ReleaseSysCache(node);
+       pfree(ident);
+   }
+}
+
+void
+process_remote_update(char *data, size_t r)
+{
+   StringInfoData s_key;
+   char        action;
+   HeapTupleData old_key;
+   HeapTupleData new_tuple;
+   Oid         reloid;
+   Oid         idxoid;
+   Oid         keyoid = InvalidOid;
+   HeapTuple   generated_key = NULL;
+   ItemPointerData oldtid;
+   Relation    rel;
+   Relation    idxrel;
+   bool        found_old;
+   ScanKeyData skey[INDEX_MAX_KEYS];
+   bool        primary_key_changed = false;
+
+   action = data[0];
+   data++;
+
+   /* old key present, identifying key changed */
+   if (action == 'K')
+   {
+       data = read_tuple(data, r, &old_key, &keyoid);
+       action = data[0];
+       data++;
+       primary_key_changed = true;;
+   }
+   else if (action != 'N')
+       elog(ERROR, "expected action 'N' or 'K', got %c",
+            action);
+
+   /* check for new  tuple */
+   if (action != 'N')
+       elog(ERROR, "expected action 'N', got %c",
+            action);
+
+   /* read new tuple */
+   data = read_tuple(data, r, &new_tuple, &reloid);
+
+   /* collected all data, lookup table definition */
+   rel = heap_open(reloid, RowExclusiveLock);
+
+   if (rel->rd_rel->relkind != RELKIND_RELATION)
+       elog(ERROR, "unexpected relkind '%c' rel \"%s\"",
+            rel->rd_rel->relkind, RelationGetRelationName(rel));
+
+   /*
+    * Check which tuple we want to use for the pkey lookup.
+    */
+   if (primary_key_changed)
+   {
+       if (reloid != keyoid)
+           elog(ERROR, "mismatching key, tuple oids: %u %u", reloid, keyoid);
+   }
+   else
+   {
+       /* key hasn't changed, just use columns from the new tuple */
+       old_key = new_tuple;
+   }
+
+   /* lookup index to build scankey */
+   if (rel->rd_indexvalid == 0)
+       RelationGetIndexList(rel);
+   idxoid = rel->rd_replidindex;
+   if (!OidIsValid(idxoid))
+   {
+       elog(ERROR, "could not find primary key for table with oid %u",
+            RelationGetRelid(rel));
+       return;
+   }
+
+   /* open index, so we can build scan key for row */
+   idxrel = index_open(idxoid, RowExclusiveLock);
+
+   Assert(idxrel->rd_index->indisunique);
+
+   build_scan_key(skey, rel, idxrel, &old_key);
+
+   /* look for tuple identified by the (old) primary key */
+   found_old = find_pkey_tuple(skey, rel, idxrel, &oldtid, true);
+
+   if (found_old)
+   {
+       HeapTupleData oldtuple;
+       Buffer      buf;
+       bool        found;
+       TransactionId xmin;
+       TimestampTz ts;
+       RepNodeId   local_node_id;
+       bool        apply_update;
+       bool        log_update;
+
+       uint64      local_sysid,
+                   remote_sysid;
+       TimeLineID  local_tli,
+                   remote_tli;
+       CommitExtraData local_node_id_raw;
+
+       ItemPointerCopy(&oldtid, &oldtuple.t_self);
+
+       /* refetch tuple, check for old commit ts & origin */
+       found = heap_fetch(rel, SnapshotAny, &oldtuple, &buf, false, NULL);
+       if (!found)
+           elog(ERROR, "could not refetch tuple %u/%u, relation %u",
+                ItemPointerGetBlockNumber(&oldtid),
+                ItemPointerGetOffsetNumber(&oldtid),
+                RelationGetRelid(rel));
+       xmin = HeapTupleHeaderGetXmin(oldtuple.t_data);
+       ReleaseBuffer(buf);
+
+       /*
+        * We now need to determine whether to keep the original version of the
+        * row, or apply the update we received.  We use the last-update-wins
+        * strategy for this, except when the new update comes from the same
+        * node that originated the previous version of the tuple.
+        */
+       TransactionIdGetCommitTsData(xmin, &ts, &local_node_id_raw);
+       local_node_id = local_node_id_raw;
+
+       if (local_node_id == bdr_apply_con->origin_id)
+       {
+           /*
+            * If the row got updated twice within a single node, just apply
+            * the update with no conflict.  Don't warn/log either, regardless
+            * of the timing; that's just too common and valid since normal row
+            * level locking guarantees are met.
+            */
+           apply_update = true;
+           log_update = false;
+       }
+       else
+       {
+           int     cmp;
+
+           /*
+            * Decide what update wins based on transaction timestamp difference.
+            * The later transaction wins.  If the timestamps compare equal,
+            * use sysid + TLI to discern.
+            */
+
+           cmp = timestamptz_cmp_internal(replication_origin_timestamp, ts);
+
+           if (cmp > 0)
+           {
+               apply_update = true;
+               log_update = false;
+           }
+           else if (cmp == 0)
+           {
+               log_update = true;
+
+               fetch_sysid_via_node_id(local_node_id,
+                                       &local_sysid, &local_tli);
+               fetch_sysid_via_node_id(bdr_apply_con->origin_id,
+                                       &remote_sysid, &remote_tli);
+
+               if (local_sysid < remote_sysid)
+                   apply_update = true;
+               else if (local_sysid > remote_sysid)
+                   apply_update = false;
+               else if (local_tli < remote_tli)
+                   apply_update = true;
+               else if (local_tli > remote_tli)
+                   apply_update = false;
+               else
+                   /* shouldn't happen */
+                   elog(ERROR, "unsuccessful node comparison");
+           }
+           else
+           {
+               apply_update = false;
+               log_update = true;
+           }
+       }
+
+       if (log_update)
+       {
+           char        remote_ts[MAXDATELEN + 1];
+           char        local_ts[MAXDATELEN + 1];
+
+           fetch_sysid_via_node_id(local_node_id,
+                                   &local_sysid, &local_tli);
+           fetch_sysid_via_node_id(bdr_apply_con->origin_id,
+                                   &remote_sysid, &remote_tli);
+           Assert(remote_sysid == bdr_apply_con->sysid);
+           Assert(remote_tli == bdr_apply_con->timeline);
+
+           memcpy(remote_ts, timestamptz_to_str(replication_origin_timestamp),
+                  MAXDATELEN);
+           memcpy(local_ts, timestamptz_to_str(ts),
+                  MAXDATELEN);
+
+           initStringInfo(&s_key);
+           tuple_to_stringinfo(&s_key, RelationGetDescr(idxrel), &old_key);
+
+           ereport(LOG,
+                   (errcode(ERRCODE_INTEGRITY_CONSTRAINT_VIOLATION),
+                    errmsg("CONFLICT: %s remote update originating at node " UINT64_FORMAT ":%u at ts %s; row was previously updated at %s node " UINT64_FORMAT ":%u at ts %s. PKEY:%s",
+                           apply_update ? "applying" : "skipping",
+                           remote_sysid, remote_tli, remote_ts,
+                     local_node_id == InvalidRepNodeId ? "local" : "remote",
+                           local_sysid, local_tli, local_ts, s_key.data)));
+           resetStringInfo(&s_key);
+       }
+
+       if (apply_update)
+       {
+           simple_heap_update(rel, &oldtid, &new_tuple);
+           /* FIXME: HOT support */
+           UserTableUpdateIndexes(rel, &new_tuple);
+           bdr_count_update();
+       }
+       else
+           bdr_count_update_conflict();
+   }
+   else
+   {
+       initStringInfo(&s_key);
+       tuple_to_stringinfo(&s_key, RelationGetDescr(idxrel), &old_key);
+       bdr_count_update_conflict();
+
+       ereport(ERROR,
+               (errcode(ERRCODE_INTEGRITY_CONSTRAINT_VIOLATION),
+                errmsg("CONFLICT: could not find existing tuple for pkey %s", s_key.data)));
+       /* XXX dead code */
+       resetStringInfo(&s_key);
+       goto err;
+   }
+
+err:
+   if (!primary_key_changed && generated_key != NULL)
+       heap_freetuple(generated_key);
+
+   check_sequencer_wakeup(rel);
+
+   /* release locks upon commit */
+   index_close(idxrel, NoLock);
+   heap_close(rel, NoLock);
+}
+
+void
+process_remote_delete(char *data, size_t r)
+{
+#ifdef VERBOSE_DELETE
+   StringInfoData s;
+#endif
+   char        action;
+   Oid         reloid;
+   Oid         idxoid;
+   HeapTupleData old_key;
+   Relation    rel;
+   Relation    idxrel;
+   ScanKeyData skey[INDEX_MAX_KEYS];
+   bool        found_old;
+   ItemPointerData oldtid;
+
+   action = data[0];
+   data++;
+
+   if (action == 'E')
+   {
+       elog(WARNING, "got delete without pkey");
+       return;
+   }
+   else if (action != 'K')
+       elog(ERROR, "expected action K got %c", action);
+
+   data = read_tuple(data, r, &old_key, &reloid);
+
+   /* collected all data, lookup table definition */
+   rel = heap_open(reloid, RowExclusiveLock);
+
+   /* lookup index to build scankey */
+   if (rel->rd_indexvalid == 0)
+       RelationGetIndexList(rel);
+   idxoid = rel->rd_replidindex;
+   if (!OidIsValid(idxoid))
+   {
+       elog(ERROR, "could not find primary key for table with oid %u",
+            RelationGetRelid(rel));
+       return;
+   }
+
+   /* Now open the primary key index */
+   idxrel = index_open(idxoid, RowExclusiveLock);
+
+   if (rel->rd_rel->relkind != RELKIND_RELATION)
+       elog(ERROR, "unexpected relkind '%c' rel \"%s\"",
+            rel->rd_rel->relkind, RelationGetRelationName(rel));
+
+   build_scan_key(skey, rel, idxrel, &old_key);
+
+   /* try to find tuple via a (candidate|primary) key */
+   found_old = find_pkey_tuple(skey, rel, idxrel, &oldtid, true);
+
+   if (found_old)
+   {
+       simple_heap_delete(rel, &oldtid);
+       bdr_count_delete();
+
+   }
+   else
+   {
+       StringInfoData s_key;
+
+       bdr_count_delete_conflict();
+
+       initStringInfo(&s_key);
+       tuple_to_stringinfo(&s_key, RelationGetDescr(idxrel), &old_key);
+
+       ereport(ERROR,
+               (errcode(ERRCODE_INTEGRITY_CONSTRAINT_VIOLATION),
+                errmsg("CONFLICT: DELETE could not find existing tuple for pkey %s", s_key.data)));
+       resetStringInfo(&s_key);
+   }
+
+#if VERBOSE_DELETE
+   initStringInfo(&s);
+   tuple_to_stringinfo(&s, RelationGetDescr(idxrel), &old_key);
+   elog(LOG, "DELETE old-key: %s", s.data);
+   resetStringInfo(&s);
+#endif
+
+   check_sequencer_wakeup(rel);
+
+   index_close(idxrel, NoLock);
+   heap_close(rel, NoLock);
+}
+
+static void
+check_sequencer_wakeup(Relation rel)
+{
+   if (strcmp(RelationGetRelationName(rel), "bdr_sequence_values") == 0 ||
+       strcmp(RelationGetRelationName(rel), "bdr_sequence_elections") == 0 ||
+       strcmp(RelationGetRelationName(rel), "bdr_votes") == 0)
+       request_sequencer_wakeup = true;
+}
+
+/*
+ * Converts an int64 from network byte order to native format.
+ *
+ * FIXME: replace with pg_getmsgint64
+ */
+static int64
+recvint64(char *buf)
+{
+   int64       result;
+   uint32      h32;
+   uint32      l32;
+
+   memcpy(&h32, buf, 4);
+   memcpy(&l32, buf + 4, 4);
+   h32 = ntohl(h32);
+   l32 = ntohl(l32);
+
+   result = h32;
+   result <<= 32;
+   result |= l32;
+
+   return result;
+}
+
+/*
+ * Read a tuple specification from the given data of the given len, filling
+ * the HeapTuple with it.  Also, reloid is set to the OID of the relation
+ * that this tuple is related to.  (The passed data contains schema and
+ * relation names; they are resolved to the corresponding local OID.)
+ */
+static char *
+read_tuple(char *data, size_t len, HeapTuple tuple, Oid *reloid)
+{
+   int64       relnamelen;
+   char       *relname;
+   Oid         relid;
+   int64       nspnamelen;
+   char       *nspname;
+   int64       tuplelen;
+   Oid         nspoid;
+   char        t;
+
+   *reloid = InvalidOid;
+
+   /* FIXME: unaligned data accesses */
+   t = data[0];
+   data += 1;
+   if (t != 'T')
+       elog(ERROR, "expected TUPLE, got %c", t);
+
+   nspnamelen = recvint64(&data[0]);
+   data += 8;
+   nspname = data;
+   data += nspnamelen;
+
+   relnamelen = recvint64(&data[0]);
+   data += 8;
+   relname = data;
+   data += relnamelen;
+
+   tuplelen = recvint64(&data[0]);
+   data += 8;
+
+   tuple->t_data = (HeapTupleHeader) data;
+   tuple->t_len = tuplelen;
+   data += tuplelen;
+
+   /* resolve the names into a relation OID */
+   nspoid = get_namespace_oid(nspname, false);
+   relid = get_relname_relid(relname, nspoid);
+   if (relid == InvalidOid)
+       elog(ERROR, "could not resolve relation name %s.%s", nspname, relname);
+
+   *reloid = relid;
+
+   return data;
+}
+
+/* print the tuple 'tuple' into the StringInfo s */
+static void
+tuple_to_stringinfo(StringInfo s, TupleDesc tupdesc, HeapTuple tuple)
+{
+   int         natt;
+   Oid         oid;
+
+   /* print oid of tuple, it's not included in the TupleDesc */
+   if ((oid = HeapTupleHeaderGetOid(tuple->t_data)) != InvalidOid)
+   {
+       appendStringInfo(s, " oid[oid]:%u", oid);
+   }
+
+   /* print all columns individually */
+   for (natt = 0; natt < tupdesc->natts; natt++)
+   {
+       Form_pg_attribute attr; /* the attribute itself */
+       Oid         typid;      /* type of current attribute */
+       HeapTuple   type_tuple; /* information about a type */
+       Form_pg_type type_form;
+       Oid         typoutput;  /* output function */
+       bool        typisvarlena;
+       Datum       origval;    /* possibly toasted Datum */
+       Datum       val;        /* definitely detoasted Datum */
+       char       *outputstr = NULL;
+       bool        isnull;     /* column is null? */
+
+       attr = tupdesc->attrs[natt];
+
+       /*
+        * don't print dropped columns, we can't be sure everything is
+        * available for them
+        */
+       if (attr->attisdropped)
+           continue;
+
+       /*
+        * Don't print system columns
+        */
+       if (attr->attnum < 0)
+           continue;
+
+       typid = attr->atttypid;
+
+       /* gather type name */
+       type_tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typid));
+       if (!HeapTupleIsValid(type_tuple))
+           elog(ERROR, "cache lookup failed for type %u", typid);
+       type_form = (Form_pg_type) GETSTRUCT(type_tuple);
+
+       /* print attribute name */
+       appendStringInfoChar(s, ' ');
+       appendStringInfoString(s, NameStr(attr->attname));
+
+       /* print attribute type */
+       appendStringInfoChar(s, '[');
+       appendStringInfoString(s, NameStr(type_form->typname));
+       appendStringInfoChar(s, ']');
+
+       /* query output function */
+       getTypeOutputInfo(typid,
+                         &typoutput, &typisvarlena);
+
+       ReleaseSysCache(type_tuple);
+
+       /* get Datum from tuple */
+       origval = fastgetattr(tuple, natt + 1, tupdesc, &isnull);
+
+       if (isnull)
+           outputstr = "(null)";
+       else if (typisvarlena && VARATT_IS_EXTERNAL_ONDISK(origval))
+           outputstr = "(unchanged-toast-datum)";
+       else if (typisvarlena)
+           val = PointerGetDatum(PG_DETOAST_DATUM(origval));
+       else
+           val = origval;
+
+       /* print data */
+       if (outputstr == NULL)
+           outputstr = OidOutputFunctionCall(typoutput, val);
+
+       appendStringInfoChar(s, ':');
+       appendStringInfoString(s, outputstr);
+   }
+}
+
+
+/*
+ * The state object used by CatalogOpenIndexes and friends is actually the
+ * same as the executor's ResultRelInfo, but we give it another type name
+ * to decouple callers from that fact.
+ */
+typedef struct ResultRelInfo *UserTableIndexState;
+
+static void
+UserTableUpdateIndexes(Relation rel, HeapTuple tuple)
+{
+   /* this is largely copied together from copy.c's CopyFrom */
+   EState     *estate = CreateExecutorState();
+   ResultRelInfo *resultRelInfo;
+   List       *recheckIndexes = NIL;
+   TupleDesc   tupleDesc = RelationGetDescr(rel);
+
+   resultRelInfo = makeNode(ResultRelInfo);
+   resultRelInfo->ri_RangeTableIndex = 1;      /* dummy */
+   resultRelInfo->ri_RelationDesc = rel;
+   resultRelInfo->ri_TrigInstrument = NULL;
+
+   ExecOpenIndices(resultRelInfo);
+
+   estate->es_result_relations = resultRelInfo;
+   estate->es_num_result_relations = 1;
+   estate->es_result_relation_info = resultRelInfo;
+
+   if (resultRelInfo->ri_NumIndices > 0)
+   {
+       TupleTableSlot *slot = ExecInitExtraTupleSlot(estate);
+
+       ExecSetSlotDescriptor(slot, tupleDesc);
+       ExecStoreTuple(tuple, slot, InvalidBuffer, false);
+
+       recheckIndexes = ExecInsertIndexTuples(slot, &tuple->t_self,
+                                              estate);
+   }
+
+   ExecResetTupleTable(estate->es_tupleTable, false);
+
+   ExecCloseIndices(resultRelInfo);
+
+   FreeExecutorState(estate);
+   /* FIXME: recheck the indexes */
+   list_free(recheckIndexes);
+}
+
+/*
+ * Setup a ScanKey for a search in the relation 'rel' for a tuple 'key' that
+ * is setup to match 'rel' (*NOT* idxrel!).
+ */
+static void
+build_scan_key(ScanKey skey, Relation rel, Relation idxrel, HeapTuple key)
+{
+   int         attoff;
+   Datum       indclassDatum;
+   Datum       indkeyDatum;
+   bool        isnull;
+   oidvector  *opclass;
+   int2vector  *indkey;
+
+   indclassDatum = SysCacheGetAttr(INDEXRELID, idxrel->rd_indextuple,
+                                   Anum_pg_index_indclass, &isnull);
+   Assert(!isnull);
+   opclass = (oidvector *) DatumGetPointer(indclassDatum);
+
+   indkeyDatum = SysCacheGetAttr(INDEXRELID, idxrel->rd_indextuple,
+                                   Anum_pg_index_indkey, &isnull);
+   Assert(!isnull);
+   indkey = (int2vector *) DatumGetPointer(indkeyDatum);
+
+
+   for (attoff = 0; attoff < RelationGetNumberOfAttributes(idxrel); attoff++)
+   {
+       Oid         operator;
+       Oid         opfamily;
+       RegProcedure regop;
+       int         pkattno = attoff + 1;
+       int         mainattno = indkey->values[attoff];
+       Oid         atttype = attnumTypeId(rel, mainattno);
+       Oid         optype = get_opclass_input_type(opclass->values[attoff]);
+
+       opfamily = get_opclass_family(opclass->values[attoff]);
+
+       operator = get_opfamily_member(opfamily, optype,
+                                      optype,
+                                      BTEqualStrategyNumber);
+
+       if (!OidIsValid(operator))
+           elog(ERROR, "could not lookup equality operator for type %u, optype %u in opfamily %u",
+                atttype, optype, opfamily);
+
+       regop = get_opcode(operator);
+
+       /* FIXME: convert type? */
+       ScanKeyInit(&skey[attoff],
+                   pkattno,
+                   BTEqualStrategyNumber,
+                   regop,
+                   fastgetattr(key, mainattno,
+                               RelationGetDescr(idxrel), &isnull));
+       if (isnull)
+           elog(ERROR, "index tuple with a null column");
+   }
+}
+
+/*
+ * Search the index 'idxrel' for a tuple identified by 'skey' in 'rel'.
+ *
+ * If a matching tuple is found setup 'tid' to point to it and return true,
+ * false is returned otherwise.
+ */
+static bool
+find_pkey_tuple(ScanKey skey, Relation rel, Relation idxrel,
+               ItemPointer tid, bool lock)
+{
+   HeapTuple   tuple;
+   bool        found = false;
+   IndexScanDesc scan;
+   Snapshot snap = GetActiveSnapshot();
+
+   /*
+    * XXX: should we use a different snapshot here to be able to get more
+    * information about concurrent activity? For now we use a snapshot
+    * isolation snapshot...
+    */
+
+   scan = index_beginscan(rel, idxrel,
+                          snap,
+                          RelationGetNumberOfAttributes(idxrel),
+                          0);
+   index_rescan(scan, skey, RelationGetNumberOfAttributes(idxrel), NULL, 0);
+
+   while ((tuple = index_getnext(scan, ForwardScanDirection)) != NULL)
+   {
+       if (found)
+           elog(ERROR, "WTF, more than one tuple found via pk???");
+       found = true;
+       ItemPointerCopy(&tuple->t_self, tid);
+   }
+
+   index_endscan(scan);
+
+   if (lock && found)
+   {
+       Buffer buf;
+       HeapUpdateFailureData hufd;
+       HTSU_Result res;
+       HeapTupleData locktup;
+       ItemPointerCopy(tid, &locktup.t_self);
+
+       res = heap_lock_tuple(rel, &locktup, snap->curcid, LockTupleExclusive,
+                             false /* wait */,
+                             false /* don't follow updates */,
+                             &buf, &hufd);
+       switch (res)
+       {
+           case HeapTupleMayBeUpdated:
+               break;
+           case HeapTupleUpdated:
+               /* XXX: Improve handling here */
+               ereport(ERROR,
+                       (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+                        errmsg("concurrent update, retrying")));
+           default:
+               elog(ERROR, "unexpected HTSU_Result after locking: %u", res);
+               break;
+       }
+       ReleaseBuffer(buf);
+   }
+   return found;
+}
diff --git a/contrib/bdr/bdr_count.c b/contrib/bdr/bdr_count.c
new file mode 100644 (file)
index 0000000..d3038f4
--- /dev/null
@@ -0,0 +1,539 @@
+/* -------------------------------------------------------------------------
+ *
+ * bdr_count.c
+ *     Replication replication stats
+ *
+ * Copyright (C) 2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *     contrib/bdr/bdr_count.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "bdr.h"
+
+#include "fmgr.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+
+#include "nodes/execnodes.h"
+
+#include "replication/replication_identifier.h"
+
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/spin.h"
+#include "storage/lwlock.h"
+
+#include "utils/builtins.h"
+#include "utils/syscache.h"
+
+/*
+ * Statistics about logical replication
+ *
+ * whenever this struct is changed, bdr_count_version needs to be increased so
+ * on-disk values aren't reused
+ */
+typedef struct BdrCountSlot
+{
+   RepNodeId   node_id;
+
+   /* we use int64 to make sure we can export to sql, there is uint64 there */
+   int64       nr_commit;
+   int64       nr_rollback;
+
+   int64       nr_insert;
+   int64       nr_insert_conflict;
+   int64       nr_update;
+   int64       nr_update_conflict;
+   int64       nr_delete;
+   int64       nr_delete_conflict;
+
+   int64       nr_disconnect;
+}  BdrCountSlot;
+
+/*
+ * Shared memory header for the stats module.
+ */
+typedef struct BdrCountControl
+{
+   LWLockId    lock;
+   BdrCountSlot slots[FLEXIBLE_ARRAY_MEMBER];
+}  BdrCountControl;
+
+/*
+ * Header of a stats disk serialization, used to detect old files, changed
+ * parameters and such.
+ */
+typedef struct BdrCountSerialize
+{
+   uint32      magic;
+   uint32      version;
+   uint32      nr_slots;
+}  BdrCountSerialize;
+
+/* magic number of the stats file, don't change */
+static const uint32 bdr_count_magic = 0x5e51A7;
+
+/* everytime the stored data format changes, increase */
+static const uint32 bdr_count_version = 2;
+
+/* shortcut for the finding BdrCountControl in memory */
+static BdrCountControl *BdrCountCtl = NULL;
+
+/* how many nodes have we built shmem for */
+static size_t bdr_count_nnodes = 0;
+
+/* offset in the BdrCountControl->slots "our" backend is in */
+static int MyCountOffsetIdx = -1;
+
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+
+static void bdr_count_shmem_startup(void);
+static void bdr_count_shmem_shutdown(int code, Datum arg);
+static Size bdr_count_shmem_size(void);
+
+static void bdr_count_serialize(void);
+static void bdr_count_unserialize(void);
+
+#define BDR_COUNT_STAT_COLS 12
+
+extern Datum pg_stat_get_bdr(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(pg_stat_bdr);
+
+static Size
+bdr_count_shmem_size(void)
+{
+   Size        size = 0;
+
+   size = add_size(size, sizeof(BdrCountControl));
+   size = add_size(size, mul_size(bdr_count_nnodes, sizeof(BdrCountSlot)));
+
+   return size;
+}
+
+void
+bdr_count_shmem_init(size_t nnodes)
+{
+   Assert(process_shared_preload_libraries_in_progress);
+
+   bdr_count_nnodes = nnodes;
+
+   RequestAddinShmemSpace(bdr_count_shmem_size());
+   /* lock for slot acquiration */
+   RequestAddinLWLocks(1);
+
+   prev_shmem_startup_hook = shmem_startup_hook;
+   shmem_startup_hook = bdr_count_shmem_startup;
+}
+
+static void
+bdr_count_shmem_startup(void)
+{
+   bool        found;
+
+   if (prev_shmem_startup_hook)
+       prev_shmem_startup_hook();
+
+   LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+   BdrCountCtl = ShmemInitStruct("bdr_count",
+                                 bdr_count_shmem_size(),
+                                 &found);
+   if (!found)
+   {
+       /* initialize */
+
+       memset(BdrCountCtl, 0, bdr_count_shmem_size());
+       BdrCountCtl->lock = LWLockAssign();
+       bdr_count_unserialize();
+   }
+   LWLockRelease(AddinShmemInitLock);
+
+   /*
+    * If we're in the postmaster (or a standalone backend...), set up a shmem
+    * exit hook to dump the statistics to disk.
+    */
+   if (!IsUnderPostmaster)
+       on_shmem_exit(bdr_count_shmem_shutdown, (Datum) 0);
+}
+
+static void
+bdr_count_shmem_shutdown(int code, Datum arg)
+{
+   /*
+    * To avoid doing the same everywhere, we only write in postmaster itself
+    * (or in a single node postgres)
+    */
+   if (IsUnderPostmaster)
+       return;
+
+   /* persist the file */
+   bdr_count_serialize();
+}
+
+/*
+ * Find a statistics slot for a given RepNodeId and setup a local variable
+ * pointing to it so we can quickly find it for the actual statistics
+ * manipulation.
+ */
+void
+bdr_count_set_current_node(RepNodeId node_id)
+{
+   size_t      i;
+
+   MyCountOffsetIdx = -1;
+
+   LWLockAcquire(BdrCountCtl->lock, LW_EXCLUSIVE);
+
+   /* check whether stats already are counted for this node */
+   for (i = 0; i < bdr_count_nnodes; i++)
+   {
+       if (BdrCountCtl->slots[i].node_id == node_id)
+       {
+           MyCountOffsetIdx = i;
+           break;
+       }
+   }
+
+   if (MyCountOffsetIdx != -1)
+       goto out;
+
+   /* ok, get a new slot */
+   for (i = 0; i < bdr_count_nnodes; i++)
+   {
+       if (BdrCountCtl->slots[i].node_id == InvalidRepNodeId)
+       {
+           MyCountOffsetIdx = i;
+           BdrCountCtl->slots[i].node_id = node_id;
+           break;
+       }
+   }
+
+   if (MyCountOffsetIdx == -1)
+       elog(PANIC, "could not find a bdr count slot for %u", node_id);
+out:
+   LWLockRelease(BdrCountCtl->lock);
+}
+
+/*
+ * Statistic manipulation functions.
+ *
+ * We assume we don't have to do any locking for *our* slot since only one
+ * backend will do writing there.
+ */
+void
+bdr_count_commit(void)
+{
+   Assert(MyCountOffsetIdx != -1);
+   BdrCountCtl->slots[MyCountOffsetIdx].nr_commit++;
+}
+
+void
+bdr_count_rollback(void)
+{
+   Assert(MyCountOffsetIdx != -1);
+   BdrCountCtl->slots[MyCountOffsetIdx].nr_rollback++;
+}
+
+void
+bdr_count_insert(void)
+{
+   Assert(MyCountOffsetIdx != -1);
+   BdrCountCtl->slots[MyCountOffsetIdx].nr_insert++;
+}
+
+void
+bdr_count_insert_conflict(void)
+{
+   Assert(MyCountOffsetIdx != -1);
+   BdrCountCtl->slots[MyCountOffsetIdx].nr_insert_conflict++;
+}
+
+void
+bdr_count_update(void)
+{
+   Assert(MyCountOffsetIdx != -1);
+   BdrCountCtl->slots[MyCountOffsetIdx].nr_update++;
+}
+
+void
+bdr_count_update_conflict(void)
+{
+   Assert(MyCountOffsetIdx != -1);
+   BdrCountCtl->slots[MyCountOffsetIdx].nr_update_conflict++;
+}
+
+void
+bdr_count_delete(void)
+{
+   Assert(MyCountOffsetIdx != -1);
+   BdrCountCtl->slots[MyCountOffsetIdx].nr_delete++;
+}
+
+void
+bdr_count_delete_conflict(void)
+{
+   Assert(MyCountOffsetIdx != -1);
+   BdrCountCtl->slots[MyCountOffsetIdx].nr_delete_conflict++;
+}
+
+void
+bdr_count_disconnect(void)
+{
+   Assert(MyCountOffsetIdx != -1);
+   BdrCountCtl->slots[MyCountOffsetIdx].nr_disconnect++;
+}
+
+Datum
+pg_stat_get_bdr(PG_FUNCTION_ARGS)
+{
+   ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+   TupleDesc   tupdesc;
+   Tuplestorestate *tupstore;
+   MemoryContext per_query_ctx;
+   MemoryContext oldcontext;
+   int         current_offset;
+
+   if (!superuser())
+       elog(ERROR, "blarg");
+
+   if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+       ereport(ERROR,
+               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                errmsg("set-valued function called in context that cannot accept a set")));
+   if (!(rsinfo->allowedModes & SFRM_Materialize))
+       ereport(ERROR,
+               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                errmsg("materialize mode required, but it is not allowed in this context")));
+   if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+       elog(ERROR, "return type must be a row type");
+
+   if (tupdesc->natts != BDR_COUNT_STAT_COLS)
+       elog(ERROR, "wrong function definition");
+
+   per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+   oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+   tupstore = tuplestore_begin_heap(true, false, work_mem);
+   rsinfo->returnMode = SFRM_Materialize;
+   rsinfo->setResult = tupstore;
+   rsinfo->setDesc = tupdesc;
+
+   MemoryContextSwitchTo(oldcontext);
+
+   /* don't let a node get created/vanish below us */
+   LWLockAcquire(BdrCountCtl->lock, LW_SHARED);
+
+   for (current_offset = 0; current_offset < bdr_count_nnodes;
+        current_offset++)
+   {
+       HeapTuple   repTup;
+       Form_pg_replication_identifier repClass;
+       BdrCountSlot *slot;
+       Datum       values[BDR_COUNT_STAT_COLS];
+       bool        nulls[BDR_COUNT_STAT_COLS];
+
+       slot = &BdrCountCtl->slots[current_offset];
+
+       /* no stats here */
+       if (slot->node_id == InvalidRepNodeId)
+           continue;
+
+       memset(values, 0, sizeof(values));
+       memset(nulls, 0, sizeof(nulls));
+
+       repTup = GetReplicationInfoByIdentifier(slot->node_id, false);
+
+       repClass = (Form_pg_replication_identifier) GETSTRUCT(repTup);
+
+       values[ 0] = ObjectIdGetDatum(slot->node_id);
+       values[ 1] = ObjectIdGetDatum(repClass->riident);
+       values[ 2] = PointerGetDatum(&repClass->riname);
+       values[ 3] = Int64GetDatumFast(slot->nr_commit);
+       values[ 4] = Int64GetDatumFast(slot->nr_rollback);
+       values[ 5] = Int64GetDatumFast(slot->nr_insert);
+       values[ 6] = Int64GetDatumFast(slot->nr_insert_conflict);
+       values[ 7] = Int64GetDatumFast(slot->nr_update);
+       values[ 8] = Int64GetDatumFast(slot->nr_update_conflict);
+       values[ 9] = Int64GetDatumFast(slot->nr_delete);
+       values[10] = Int64GetDatumFast(slot->nr_delete_conflict);
+       values[11] = Int64GetDatumFast(slot->nr_disconnect);
+
+       tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+       ReleaseSysCache(repTup);
+   }
+   LWLockRelease(BdrCountCtl->lock);
+
+   tuplestore_donestoring(tupstore);
+
+   return (Datum) 0;
+}
+
+/*
+ * Write the BDR stats from shared memory to a file
+ */
+static void
+bdr_count_serialize(void)
+{
+   int         fd;
+   const char *tpath = "global/bdr.stat.tmp";
+   const char *path = "global/bdr.stat";
+   BdrCountSerialize serial;
+   Size        write_size;
+
+   LWLockAcquire(BdrCountCtl->lock, LW_EXCLUSIVE);
+
+   if (unlink(tpath) < 0 && errno != ENOENT)
+       ereport(ERROR,
+               (errcode_for_file_access(),
+                errmsg("could not unlink \"%s\": %m", tpath)));
+
+   fd = OpenTransientFile((char *) tpath,
+                          O_WRONLY | O_CREAT | O_EXCL | PG_BINARY,
+                          S_IRUSR | S_IWUSR);
+   if (fd < 0)
+       ereport(ERROR,
+               (errcode_for_file_access(),
+                errmsg("could not open \"%s\": %m", tpath)));
+
+   serial.magic = bdr_count_magic;
+   serial.version = bdr_count_version;
+   serial.nr_slots = bdr_count_nnodes;
+
+   /* write header */
+   write_size = sizeof(serial);
+   if ((write(fd, &serial, write_size)) != write_size)
+   {
+       int     save_errno = errno;
+
+       CloseTransientFile(fd);
+       errno = save_errno;
+       ereport(ERROR,
+               (errcode_for_file_access(),
+                errmsg("could not write bdr stat file data \"%s\": %m",
+                       tpath)));
+   }
+
+   /* write data */
+   write_size = sizeof(BdrCountSlot) * bdr_count_nnodes;
+   if ((write(fd, &BdrCountCtl->slots, write_size)) != write_size)
+   {
+       int     save_errno = errno;
+
+       CloseTransientFile(fd);
+       errno = save_errno;
+       ereport(ERROR,
+               (errcode_for_file_access(),
+                errmsg("could not write bdr stat file data \"%s\": %m",
+                       tpath)));
+   }
+
+   CloseTransientFile(fd);
+
+   /* rename into place */
+   if (rename(tpath, path) != 0)
+       ereport(ERROR,
+               (errcode_for_file_access(),
+                errmsg("could not rename bdr stat file \"%s\" to \"%s\": %m",
+                       tpath, path)));
+   LWLockRelease(BdrCountCtl->lock);
+}
+
+/*
+ * Load BDR stats from file into shared memory
+ */
+static void
+bdr_count_unserialize(void)
+{
+   int         fd;
+   const char *path = "global/bdr.stat";
+   BdrCountSerialize serial;
+   Size        read_size;
+
+   if (BdrCountCtl == NULL)
+       elog(ERROR, "cannot use bdr statistics function without loading bdr");
+
+   LWLockAcquire(BdrCountCtl->lock, LW_EXCLUSIVE);
+
+   fd = OpenTransientFile((char *) path,
+                          O_RDONLY | PG_BINARY, 0);
+   if (fd < 0 && errno == ENOENT)
+       goto out;
+
+   if (fd < 0)
+   {
+       LWLockRelease(BdrCountCtl->lock);
+       ereport(ERROR,
+               (errcode_for_file_access(),
+                errmsg("could not open bdr stat file \"%s\": %m", path)));
+   }
+
+   read_size = sizeof(serial);
+   if (read(fd, &serial, read_size) != read_size)
+   {
+       int saved_errno = errno;
+       LWLockRelease(BdrCountCtl->lock);
+       CloseTransientFile(fd);
+       errno = saved_errno;
+       ereport(PANIC,
+               (errcode_for_file_access(),
+                errmsg("could not read bdr stat file data \"%s\": %m",
+                       path)));
+   }
+
+   if (serial.magic != bdr_count_magic)
+   {
+       LWLockRelease(BdrCountCtl->lock);
+       CloseTransientFile(fd);
+       elog(ERROR, "expected magic %u doesn't match read magic %u",
+            bdr_count_magic, serial.magic);
+   }
+
+   if (serial.version != bdr_count_version)
+   {
+       elog(WARNING, "version of stat file changed (file %u, current %u), zeroing",
+            serial.version, bdr_count_version);
+       goto zero_file;
+   }
+
+   if (serial.nr_slots > bdr_count_nnodes)
+   {
+       elog(WARNING, "stat file has more stats than we need, zeroing");
+       goto zero_file;
+   }
+
+   /* read actual data, directly into shmem */
+   read_size = sizeof(BdrCountSlot) * serial.nr_slots;
+   if (read(fd, &BdrCountCtl->slots, read_size) != read_size)
+   {
+       int saved_errno = errno;
+       CloseTransientFile(fd);
+       errno = saved_errno;
+       ereport(ERROR,
+               (errcode_for_file_access(),
+                errmsg("could not read bdr stat file data \"%s\": %m",
+                       path)));
+   }
+
+out:
+   if (fd >= 0)
+       CloseTransientFile(fd);
+   LWLockRelease(BdrCountCtl->lock);
+   return;
+
+zero_file:
+   CloseTransientFile(fd);
+   LWLockRelease(BdrCountCtl->lock);
+
+   /*
+    * Overwrite the existing file.  Note our struct was zeroed in
+    * bdr_count_shmem_startup, so we're writing empty data.
+    */
+   bdr_count_serialize();
+}
diff --git a/contrib/bdr/bdr_count.sql b/contrib/bdr/bdr_count.sql
new file mode 100644 (file)
index 0000000..263cf95
--- /dev/null
@@ -0,0 +1,22 @@
+CREATE FUNCTION pg_stat_bdr(
+    OUT rep_node_id oid,
+    OUT riremotesysid name,
+    OUT riremotedb oid,
+    OUT rilocaldb oid,
+    OUT nr_commit int8,
+    OUT nr_rollback int8,
+    OUT nr_insert int8,
+    OUT nr_insert_conflict int8,
+    OUT nr_update int8,
+    OUT nr_update_conflict int8,
+    OUT nr_delete int8,
+    OUT nr_delete_conflict int8,
+    OUT nr_disconnect int8
+)
+RETURNS SETOF record
+LANGUAGE C
+AS 'bdr.so'
+
+CREATE VIEW pg_stat_bdr AS SELECT * FROM pg_stat_bdr;
+REVOKE ALL ON FUNCTION pg_stat_bdr() FROM PUBLIC;
+
diff --git a/contrib/bdr/bdr_output.c b/contrib/bdr/bdr_output.c
new file mode 100644 (file)
index 0000000..9fb160c
--- /dev/null
@@ -0,0 +1,224 @@
+/*-------------------------------------------------------------------------
+ *
+ * bdr_output.c
+ *       BDR output plugin
+ *
+ * Copyright (c) 2012-2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *       contrib/bdr/bdr_output.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/sysattr.h"
+
+#include "catalog/pg_class.h"
+#include "catalog/pg_namespace.h"
+#include "catalog/pg_type.h"
+#include "catalog/index.h"
+
+#include "libpq/pqformat.h"
+
+#include "nodes/parsenodes.h"
+
+#include "replication/output_plugin.h"
+#include "replication/logical.h"
+
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/relcache.h"
+#include "utils/syscache.h"
+#include "utils/timestamp.h"
+#include "utils/typcache.h"
+
+PG_MODULE_MAGIC;
+
+extern void        _PG_init(void);
+extern void        _PG_output_plugin_init(OutputPluginCallbacks *cb);
+
+typedef struct
+{
+   MemoryContext context;
+   bool        include_xids;
+} TestDecodingData;
+
+/* These must be available to pg_dlsym() */
+static void pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt,
+                             bool is_init);
+static void pg_decode_begin_txn(LogicalDecodingContext *ctx,
+                   ReorderBufferTXN *txn);
+static void pg_decode_commit_txn(LogicalDecodingContext *ctx,
+                    ReorderBufferTXN *txn, XLogRecPtr commit_lsn);
+static void pg_decode_change(LogicalDecodingContext *ctx,
+                ReorderBufferTXN *txn, Relation rel,
+                ReorderBufferChange *change);
+
+/* private prototypes */
+static void write_tuple(StringInfo out, Relation rel, HeapTuple tuple);
+
+void
+_PG_init(void)
+{
+}
+
+/* specify output plugin callbacks */
+void
+_PG_output_plugin_init(OutputPluginCallbacks *cb)
+{
+   AssertVariableIsOfType(&_PG_output_plugin_init, LogicalOutputPluginInit);
+
+   cb->startup_cb = pg_decode_startup;
+   cb->begin_cb = pg_decode_begin_txn;
+   cb->change_cb = pg_decode_change;
+   cb->commit_cb = pg_decode_commit_txn;
+   cb->shutdown_cb = NULL;
+}
+
+
+/* initialize this plugin */
+static void
+pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt, bool is_init)
+{
+   TestDecodingData *data;
+
+   data = palloc(sizeof(TestDecodingData));
+   data->context = AllocSetContextCreate(TopMemoryContext,
+                                         "bdr conversion context",
+                                         ALLOCSET_DEFAULT_MINSIZE,
+                                         ALLOCSET_DEFAULT_INITSIZE,
+                                         ALLOCSET_DEFAULT_MAXSIZE);
+
+   ctx->output_plugin_private = data;
+
+   opt->output_type = OUTPUT_PLUGIN_BINARY_OUTPUT;
+}
+
+/* BEGIN callback */
+void
+pg_decode_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
+{
+#ifdef NOT_YET
+   TestDecodingData *data = ctx->output_plugin_private;
+#endif
+   AssertVariableIsOfType(&pg_decode_begin_txn, LogicalDecodeBeginCB);
+
+   if (txn->origin_id != InvalidRepNodeId)
+       return;
+
+   OutputPluginPrepareWrite(ctx, true);
+   appendStringInfoChar(ctx->out, 'B');        /* BEGIN */
+   appendBinaryStringInfo(ctx->out, (char *) &txn->final_lsn, sizeof(XLogRecPtr));
+   appendBinaryStringInfo(ctx->out, (char *) &txn->commit_time, sizeof(TimestampTz));
+   OutputPluginWrite(ctx, true);
+   return;
+}
+
+/* COMMIT callback */
+void
+pg_decode_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
+                    XLogRecPtr commit_lsn)
+{
+#ifdef NOT_YET
+   TestDecodingData *data = ctx->output_plugin_private;
+#endif
+
+   if (txn->origin_id != InvalidRepNodeId)
+       return;
+
+   OutputPluginPrepareWrite(ctx, true);
+   appendStringInfoChar(ctx->out, 'C');        /* sending COMMIT */
+   appendBinaryStringInfo(ctx->out, (char *) &commit_lsn, sizeof(XLogRecPtr));
+   appendBinaryStringInfo(ctx->out, (char *) &txn->end_lsn, sizeof(XLogRecPtr));
+   appendBinaryStringInfo(ctx->out, (char *) &txn->commit_time, sizeof(TimestampTz));
+   OutputPluginWrite(ctx, true);
+}
+
+void
+pg_decode_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
+                Relation relation, ReorderBufferChange *change)
+{
+   TestDecodingData *data;
+   MemoryContext old;
+
+   data = ctx->output_plugin_private;
+
+   /* Avoid leaking memory by using and resetting our own context */
+   old = MemoryContextSwitchTo(data->context);
+
+   /* only log changes originating locally */
+   if (txn->origin_id != InvalidRepNodeId)
+       return;
+
+   OutputPluginPrepareWrite(ctx, true);
+
+   switch (change->action)
+   {
+       case REORDER_BUFFER_CHANGE_INSERT:
+           appendStringInfoChar(ctx->out, 'I');        /* action INSERT */
+           appendStringInfoChar(ctx->out, 'N');        /* new tuple follows */
+           write_tuple(ctx->out, relation, &change->tp.newtuple->tuple);
+           break;
+       case REORDER_BUFFER_CHANGE_UPDATE:
+           appendStringInfoChar(ctx->out, 'U');        /* action UPDATE */
+           if (change->tp.oldtuple != NULL)
+           {
+               appendStringInfoChar(ctx->out, 'K');    /* old key follows */
+               write_tuple(ctx->out, relation, &change->tp.oldtuple->tuple);
+           }
+           appendStringInfoChar(ctx->out, 'N');        /* new tuple follows */
+           write_tuple(ctx->out, relation, &change->tp.newtuple->tuple);
+           break;
+       case REORDER_BUFFER_CHANGE_DELETE:
+           appendStringInfoChar(ctx->out, 'D');        /* action DELETE */
+           if (change->tp.oldtuple != NULL)
+           {
+               appendStringInfoChar(ctx->out, 'K');    /* old key follows */
+               write_tuple(ctx->out, relation, &change->tp.oldtuple->tuple);
+           }
+           else
+               appendStringInfoChar(ctx->out, 'E');    /* empty */
+           break;
+   }
+   OutputPluginWrite(ctx, true);
+
+   MemoryContextSwitchTo(old);
+   MemoryContextReset(data->context);
+}
+
+static void
+write_tuple(StringInfo out, Relation rel, HeapTuple tuple)
+{
+   HeapTuple   cache;
+   Form_pg_namespace classNsp;
+   const char *nspname;
+   int64       nspnamelen;
+   const char *relname;
+   int64       relnamelen;
+
+   cache = SearchSysCache1(NAMESPACEOID,
+                           ObjectIdGetDatum(rel->rd_rel->relnamespace));
+   if (!HeapTupleIsValid(cache))
+       elog(ERROR, "cache lookup failed for namespace %u",
+            rel->rd_rel->relnamespace);
+   classNsp = (Form_pg_namespace) GETSTRUCT(cache);
+   nspname = pstrdup(NameStr(classNsp->nspname));
+   nspnamelen = strlen(nspname) + 1;
+   ReleaseSysCache(cache);
+
+   relname = NameStr(rel->rd_rel->relname);
+   relnamelen = strlen(relname) + 1;
+
+   appendStringInfoChar(out, 'T');     /* tuple follows */
+
+   pq_sendint64(out, nspnamelen);      /* schema name length */
+   appendBinaryStringInfo(out, nspname, nspnamelen);
+
+   pq_sendint64(out, relnamelen);      /* table name length */
+   appendBinaryStringInfo(out, relname, relnamelen);
+
+   pq_sendint64(out, tuple->t_len);    /* tuple length */
+   appendBinaryStringInfo(out, (char *) tuple->t_data, tuple->t_len);
+}
diff --git a/contrib/bdr/bdr_seq.c b/contrib/bdr/bdr_seq.c
new file mode 100644 (file)
index 0000000..d9e543c
--- /dev/null
@@ -0,0 +1,1209 @@
+/* -------------------------------------------------------------------------
+ *
+ * bdr.c
+ *     Replication!!!
+ *
+ * Replication???
+ *
+ * Copyright (C) 2012-2013, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *     contrib/bdr/bdr.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "bdr.h"
+
+/* sequencer */
+#include "miscadmin.h"
+#include "pgstat.h"
+
+#include "access/reloptions.h"
+#include "access/transam.h"
+#include "access/seqam.h"
+#include "access/xact.h"
+#include "catalog/pg_type.h"
+#include "catalog/namespace.h"
+#include "commands/extension.h"
+#include "commands/sequence.h"
+#include "executor/spi.h"
+#include "utils/builtins.h"
+#include "utils/snapmgr.h"
+#include "utils/lsyscache.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+
+typedef struct BdrSequencerSlot
+{
+   Oid database_oid;
+   Latch *proclatch;
+} BdrSequencerSlot;
+
+typedef struct BdrSequencerControl
+{
+   size_t nnodes;
+   size_t slot;
+   BdrSequencerSlot slots[FLEXIBLE_ARRAY_MEMBER];
+} BdrSequencerControl;
+
+typedef struct BdrSequenceValues {
+   int64       start_value;
+   int64       next_value;
+   int64       end_value;
+} BdrSequenceValues;
+
+static BdrSequencerControl *BdrSequencerCtl = NULL;
+
+/* how many nodes have we built shmem for */
+static size_t bdr_seq_nnodes = 0;
+
+/* how many nodes have we built shmem for */
+static size_t bdr_seq_nsequencers = 0;
+
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+
+/* vote */
+const char* vote_sql =
+"INSERT INTO bdr_votes (\n"
+"    vote_sysid,\n"
+"    vote_tlid,\n"
+"    vote_dboid,\n"
+"    vote_riname,\n"
+"    vote_election_id,\n"
+"\n"
+"    voter_sysid,\n"
+"    voter_tlid,\n"
+"    voter_dboid,\n"
+"    voter_riname,\n"
+"\n"
+"    vote\n"
+")\n"
+"SELECT\n"
+"    owning_sysid, owning_tlid, owning_dboid, owning_riname, owning_election_id,\n"
+"    $1, $2, $3, $4,\n"
+"    -- haven't allocated the value ourselves\n"
+"    NOT EXISTS(\n"
+"        SELECT *\n"
+"        FROM bdr_sequence_values val\n"
+"        WHERE true\n"
+"            AND val.seqschema = election.seqschema\n"
+"            AND val.seqname = election.seqname\n"
+"            AND val.seqrange && election.seqrange\n"
+"            --AND NOT val.confirmed\n"
+"            AND val.owning_sysid = $1\n"
+"            AND val.owning_tlid = $2\n"
+"            AND val.owning_dboid = $3\n"
+"            AND val.owning_riname = $4\n"
+"    )\n"
+"    -- and we haven't allowed anybody else to use it\n"
+"    AND NOT EXISTS(\n"
+"        SELECT *\n"
+"        FROM bdr_votes vote\n"
+"            JOIN bdr_sequence_elections other_election ON (\n"
+"                other_election.owning_sysid = vote.vote_sysid\n"
+"                AND other_election.owning_tlid = vote.vote_tlid\n"
+"                AND other_election.owning_dboid = vote.vote_dboid\n"
+"                AND other_election.owning_riname = vote.vote_riname\n"
+"                AND other_election.owning_election_id = vote.vote_election_id\n"
+"            )\n"
+"        WHERE true\n"
+"            AND vote.voter_sysid = $1\n"
+"            AND vote.voter_tlid = $2\n"
+"            AND vote.voter_dboid = $3\n"
+"            AND vote.voter_riname = $4\n"
+"            AND other_election.seqname =  election.seqname\n"
+"            AND other_election.seqschema =  election.seqschema\n"
+"            AND other_election.seqrange && election.seqrange \n"
+"    )\n"
+"\n"
+"FROM bdr_sequence_elections election\n"
+"WHERE\n"
+"    election.open\n"
+"    -- not our election\n"
+"    AND NOT (\n"
+"        owning_sysid = $1\n"
+"        AND owning_tlid = $2\n"
+"        AND owning_dboid = $3\n"
+"        AND owning_riname = $4\n"
+"    )\n"
+"    -- we haven't voted about this yet\n"
+"    AND NOT EXISTS (\n"
+"        SELECT *\n"
+"        FROM bdr_votes\n"
+"        WHERE true\n"
+"            AND owning_sysid = vote_sysid\n"
+"            AND owning_tlid = vote_tlid\n"
+"            AND owning_dboid = vote_dboid\n"
+"            AND owning_riname = vote_riname\n"
+"            AND owning_election_id = vote_election_id\n"
+"\n"
+"            AND voter_sysid = $1\n"
+"            AND voter_tlid = $2\n"
+"            AND voter_dboid = $3\n"
+"            AND voter_riname = $4\n"
+"    )\n"
+"LIMIT 1\n"
+";";
+
+const char *start_elections_sql =
+"WITH to_be_updated_sequences AS (\n"
+"    SELECT\n"
+"        pg_namespace.nspname AS seqschema,\n"
+"        pg_class.relname AS seqname,\n"
+"        COUNT(bdr_sequence_values) AS open_seq_chunks,\n"
+"        COALESCE((\n"
+"            SELECT max(upper(seqrange))\n"
+"            FROM bdr_sequence_values max_val\n"
+"            WHERE\n"
+"                max_val.seqschema = pg_namespace.nspname\n"
+"                AND max_val.seqname = pg_class.relname\n"
+"        ), 0) AS current_max\n"
+"    FROM\n"
+"        pg_class\n"
+"        JOIN pg_namespace ON (pg_class.relnamespace = pg_namespace.oid)\n"
+"        LEFT JOIN bdr_sequence_values ON (\n"
+"            bdr_sequence_values.seqschema = pg_namespace.nspname\n"
+"            AND bdr_sequence_values.seqname = pg_class.relname\n"
+"            AND bdr_sequence_values.emptied = false\n"
+"            AND bdr_sequence_values.in_use = false\n"
+"            AND bdr_sequence_values.failed = false\n"
+"            AND bdr_sequence_values.owning_sysid = $1\n"
+"            AND bdr_sequence_values.owning_tlid = $2\n"
+"            AND bdr_sequence_values.owning_dboid = $3\n"
+"            AND bdr_sequence_values.owning_riname = $4\n"
+"        )\n"
+"    WHERE\n"
+"        pg_class.relkind = 'S'\n"
+"        AND pg_class.relam = (SELECT oid FROM pg_seqam WHERE seqamname = 'bdr')\n"
+"    GROUP BY\n"
+"        pg_class.relname,\n"
+"        pg_namespace.nspname\n"
+"    HAVING\n"
+"        count(bdr_sequence_values) <= 5\n"
+"),\n"
+"to_be_inserted_chunks AS (\n"
+"    SELECT\n"
+"        seqschema,\n"
+"        seqname,\n"
+"        current_max,\n"
+"        generate_series(\n"
+"            current_max,\n"
+"            -- 1000 is the chunk size, -1 is to get < instead <= out of generate_series\n"
+"            current_max + 1000 * (5 - open_seq_chunks) - 1,\n"
+"            1000) chunk_start\n"
+"    FROM to_be_updated_sequences\n"
+"),\n"
+"inserted_chunks AS (\n"
+"    INSERT INTO bdr_sequence_elections(\n"
+"        owning_sysid,\n"
+"        owning_tlid,\n"
+"        owning_dboid,\n"
+"        owning_riname,\n"
+"        owning_election_id,\n"
+"        vote_type,\n"
+"        open,\n"
+"        seqschema,\n"
+"        seqname,\n"
+"        seqrange\n"
+"    )\n"
+"    SELECT\n"
+"        $1,\n"
+"        $2,\n"
+"        $3,\n"
+"        $4,\n"
+"        (\n"
+"            SELECT COALESCE(max(owning_election_id), 0)\n"
+"            FROM bdr_sequence_elections biggest\n"
+"            WHERE\n"
+"               biggest.owning_sysid = $1\n"
+"               AND biggest.owning_tlid = $2\n"
+"               AND biggest.owning_dboid = $3\n"
+"               AND biggest.owning_riname = $4\n"
+"         ) + row_number() OVER (),\n"
+"        'sequence',\n"
+"        true AS open,\n"
+"        seqschema,\n"
+"        seqname,\n"
+"        int8range(chunk_start, chunk_start + 1000) AS seqrange\n"
+"    FROM to_be_inserted_chunks\n"
+"    RETURNING\n"
+"        seqschema,\n"
+"        seqname,\n"
+"        seqrange\n"
+")\n"
+"\n"
+"INSERT INTO bdr_sequence_values(\n"
+"    owning_sysid,\n"
+"    owning_tlid,\n"
+"    owning_dboid,\n"
+"    owning_riname,\n"
+"    seqschema,\n"
+"    seqname,\n"
+"    confirmed,\n"
+"    in_use,\n"
+"    emptied,\n"
+"    seqrange\n"
+")\n"
+"SELECT\n"
+"    $1,\n"
+"    $2,\n"
+"    $3,\n"
+"    $4,\n"
+"    seqschema,\n"
+"    seqname,\n"
+"    false AS confirmed,\n"
+"    false AS in_use,\n"
+"    false AS emptied,\n"
+"    int8range(chunk_start, chunk_start + 1000)\n"
+"FROM to_be_inserted_chunks\n"
+"-- force evaluation \n"
+"WHERE (SELECT count(*) FROM inserted_chunks) >= 0\n"
+"RETURNING\n"
+"    owning_sysid,\n"
+"    owning_tlid,\n"
+"    owning_dboid,\n"
+"    owning_riname,\n"
+"    seqschema,\n"
+"    seqname,\n"
+"    confirmed,\n"
+"    emptied,\n"
+"    seqrange\n"
+;
+
+const char *tally_elections_sql =
+"WITH tallied_votes AS (\n"
+"SELECT\n"
+"    election.owning_sysid,\n"
+"    election.owning_tlid,\n"
+"    election.owning_dboid,\n"
+"    election.owning_riname,\n"
+"    election.owning_election_id,\n"
+"    election.seqschema,\n"
+"    election.seqname,\n"
+"    election.seqrange,\n"
+"    SUM(COALESCE((vote.vote = true)::int, 0)) AS yays,\n"
+"    SUM(COALESCE((vote.vote = false)::int, 0)) AS nays,\n"
+"    COUNT(vote.vote) AS nr_votes,\n"
+"    /* majority of others */\n"
+"    COUNT(vote.vote) >= ceil($5/ 2.0) AS sufficient\n"
+"FROM\n"
+"    bdr_sequence_elections election\n"
+"    LEFT JOIN bdr_votes vote ON (\n"
+"            election.owning_sysid = vote.vote_sysid\n"
+"            AND election.owning_tlid = vote.vote_tlid\n"
+"            AND election.owning_dboid = vote.vote_dboid\n"
+"            AND election.owning_riname = vote.vote_riname\n"
+"            AND election.owning_election_id = vote.vote_election_id\n"
+"    )\n"
+"WHERE\n"
+"    election.open\n"
+"    AND election.owning_sysid = $1\n"
+"    AND election.owning_tlid = $2\n"
+"    AND election.owning_dboid = $3\n"
+"    AND election.owning_riname = $4\n"
+"GROUP BY\n"
+"    election.owning_sysid,\n"
+"    election.owning_tlid,\n"
+"    election.owning_dboid,\n"
+"    election.owning_riname,\n"
+"    election.owning_election_id,\n"
+"    election.seqschema,\n"
+"    election.seqname,\n"
+"    election.seqrange\n"
+"),\n"
+"cast_votes AS (\n"
+"    UPDATE bdr_sequence_elections\n"
+"    SET\n"
+"        open = false,\n"
+"        success = (nays = 0)\n"
+"    FROM tallied_votes\n"
+"    WHERE\n"
+"       bdr_sequence_elections.owning_sysid = tallied_votes.owning_sysid\n"
+"       AND bdr_sequence_elections.owning_tlid = tallied_votes.owning_tlid\n"
+"       AND bdr_sequence_elections.owning_dboid = tallied_votes.owning_dboid\n"
+"       AND bdr_sequence_elections.owning_riname = tallied_votes.owning_riname\n"
+"       AND bdr_sequence_elections.owning_election_id = tallied_votes.owning_election_id\n"
+"       AND tallied_votes.sufficient\n"
+"    RETURNING bdr_sequence_elections.*\n"
+"),\n"
+"successfull_sequence_values AS (\n"
+"    UPDATE bdr_sequence_values\n"
+"    SET\n"
+"        confirmed = true\n"
+"    FROM cast_votes\n"
+"    WHERE\n"
+"        cast_votes.success = true\n"
+"        AND bdr_sequence_values.seqschema = cast_votes.seqschema\n"
+"        AND bdr_sequence_values.seqname = cast_votes.seqname\n"
+"        AND bdr_sequence_values.seqrange = cast_votes.seqrange\n"
+"        AND bdr_sequence_values.owning_sysid = $1\n"
+"        AND bdr_sequence_values.owning_tlid = $2\n"
+"        AND bdr_sequence_values.owning_dboid = $3\n"
+"        AND bdr_sequence_values.owning_riname = $4\n"
+"    RETURNING bdr_sequence_values.*\n"
+"),\n"
+"failed_sequence_values AS (\n"
+"    UPDATE bdr_sequence_values\n"
+"    SET\n"
+"        failed = true\n"
+"    FROM cast_votes\n"
+"    WHERE\n"
+"        cast_votes.success = false\n"
+"        AND bdr_sequence_values.seqschema = cast_votes.seqschema\n"
+"        AND bdr_sequence_values.seqname = cast_votes.seqname\n"
+"        AND bdr_sequence_values.seqrange = cast_votes.seqrange\n"
+"        AND bdr_sequence_values.owning_sysid = $1\n"
+"        AND bdr_sequence_values.owning_tlid = $2\n"
+"        AND bdr_sequence_values.owning_dboid = $3\n"
+"        AND bdr_sequence_values.owning_riname = $4\n"
+"    RETURNING bdr_sequence_values.*\n"
+")\n"
+"\n"
+"SELECT\n"
+"    seqschema,\n"
+"    seqname,\n"
+"    seqrange,\n"
+"    'success'::text\n"
+"FROM successfull_sequence_values\n"
+"\n"
+"UNION ALL\n"
+"\n"
+"SELECT\n"
+"    seqschema,\n"
+"    seqname,\n"
+"    seqrange,\n"
+"    'failed'::text\n"
+"FROM failed_sequence_values\n"
+"\n"
+"UNION ALL\n"
+"\n"
+"SELECT\n"
+"    seqschema,\n"
+"    seqname,\n"
+"    seqrange,\n"
+"    'pending'::text\n"
+"FROM tallied_votes\n"
+"WHERE NOT sufficient\n"
+;
+
+const char *fill_sequences_sql =
+"SELECT\n"
+"    pg_class.oid seqoid,\n"
+"    pg_namespace.nspname seqschema,\n"
+"    pg_class.relname seqname\n"
+"FROM pg_class\n"
+"    JOIN pg_seqam ON (pg_seqam.oid = pg_class.relam)\n"
+"    JOIN pg_namespace ON (pg_class.relnamespace = pg_namespace.oid)\n"
+"WHERE\n"
+"    relkind = 'S'\n"
+"    AND seqamname = 'bdr'\n"
+"ORDER BY pg_class.oid\n"
+;
+
+
+const char *get_chunk_sql =
+"UPDATE bdr_sequence_values\n"
+"   SET in_use = true\n"
+"WHERE\n"
+"    (\n"
+"        owning_sysid,\n"
+"        owning_tlid,\n"
+"        owning_dboid,\n"
+"        owning_riname,\n"
+"        seqname,\n"
+"        seqschema,\n"
+"        seqrange\n"
+"    )\n"
+"    IN\n"
+"    (\n"
+"        SELECT\n"
+"            newval.owning_sysid,\n"
+"            newval.owning_tlid,\n"
+"            newval.owning_dboid,\n"
+"            newval.owning_riname,\n"
+"            newval.seqname,\n"
+"            newval.seqschema,\n"
+"            newval.seqrange\n"
+"        FROM bdr_sequence_values newval\n"
+"        WHERE\n"
+"           newval.confirmed\n"
+"           AND NOT newval.emptied\n"
+"           AND NOT newval.in_use\n"
+"           AND newval.owning_sysid = $1\n"
+"           AND newval.owning_tlid = $2\n"
+"           AND newval.owning_dboid = $3\n"
+"           AND newval.owning_riname = $4\n"
+"           AND newval.seqschema = $5\n"
+"           AND newval.seqname = $6\n"
+"        ORDER BY newval.seqrange ASC\n"
+"        LIMIT 1\n"
+"        FOR UPDATE\n"
+"    )\n"
+"RETURNING\n"
+"    lower(seqrange),\n"
+"    upper(seqrange)\n"
+;
+
+static Size
+bdr_sequencer_shmem_size(void)
+{
+   Size        size = 0;
+
+   size = add_size(size, sizeof(BdrSequencerControl));
+   size = add_size(size, mul_size(bdr_seq_nsequencers, sizeof(BdrSequencerSlot)));
+
+   return size;
+}
+
+static void
+bdr_sequencer_shmem_shutdown(int code, Datum arg)
+{
+   BdrSequencerSlot *slot;
+   if (bdr_sequencer_con == NULL)
+       return;
+
+   slot = &BdrSequencerCtl->slots[bdr_sequencer_con->slot];
+
+   slot->database_oid = InvalidOid;
+   slot->proclatch = NULL;
+}
+
+static void
+bdr_sequencer_shmem_startup(void)
+{
+   bool        found;
+
+   if (prev_shmem_startup_hook)
+       prev_shmem_startup_hook();
+
+   LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+   BdrSequencerCtl = ShmemInitStruct("bdr_sequencer",
+                                     bdr_sequencer_shmem_size(),
+                                     &found);
+   if (!found)
+   {
+       /* initialize */
+       memset(BdrSequencerCtl, 0, bdr_sequencer_shmem_size());
+   }
+   LWLockRelease(AddinShmemInitLock);
+
+   on_shmem_exit(bdr_sequencer_shmem_shutdown, (Datum) 0);
+}
+
+void
+bdr_sequencer_shmem_init(int nnodes, int sequencers)
+{
+   Assert(process_shared_preload_libraries_in_progress);
+
+   bdr_seq_nnodes = nnodes;
+   bdr_seq_nsequencers = sequencers;
+
+   RequestAddinShmemSpace(bdr_sequencer_shmem_size());
+
+   prev_shmem_startup_hook = shmem_startup_hook;
+   shmem_startup_hook = bdr_sequencer_shmem_startup;
+}
+
+void
+bdr_sequencer_wakeup(void)
+{
+   int off;
+   BdrSequencerSlot *slot;
+
+
+   for (off = 0; off < bdr_seq_nnodes; off++)
+   {
+       slot = &BdrSequencerCtl->slots[off];
+
+       /* FIXME: locking! */
+       if (slot->database_oid == InvalidOid)
+           continue;
+
+       if (slot->database_oid != MyDatabaseId)
+           continue;
+
+       SetLatch(slot->proclatch);
+   }
+}
+
+void
+bdr_sequencer_init(void)
+{
+   CreateExtensionStmt create_stmt;
+   AlterExtensionStmt alter_stmt;
+   BdrSequencerSlot *slot;
+
+   Assert(bdr_sequencer_con != NULL);
+
+   slot = &BdrSequencerCtl->slots[bdr_sequencer_con->slot];
+   slot->database_oid = MyDatabaseId;
+   slot->proclatch = &MyProc->procLatch;
+
+   create_stmt.if_not_exists = true;
+   create_stmt.options = NIL;
+
+   alter_stmt.options = NIL;
+
+   StartTransactionCommand();
+   PushActiveSnapshot(GetTransactionSnapshot());
+
+   create_stmt.extname = (char *)"btree_gist";
+   alter_stmt.extname = (char *)"btree_gist";
+
+   /* create extension if not exists */
+   CreateExtension(&create_stmt);
+   /* update extension otherwise */
+   ExecAlterExtensionStmt(&alter_stmt);
+
+   create_stmt.extname = (char *)"bdr";
+   alter_stmt.extname = (char *)"bdr";
+   /* create extension if not exists */
+   CreateExtension(&create_stmt);
+   /* update extension otherwise */
+   ExecAlterExtensionStmt(&alter_stmt);
+
+   PopActiveSnapshot();
+   CommitTransactionCommand();
+}
+
+static void
+bdr_sequencer_lock_rel(char *relname)
+{
+   Oid nspoid;
+   Oid relid;
+
+   nspoid = get_namespace_oid("bdr", false);
+   relid = get_relname_relid(relname, nspoid);
+   if (!relid)
+       elog(ERROR, "cache lookup failed for relation public.%s", relname);
+
+   SetCurrentStatementStartTimestamp();
+   pgstat_report_activity(STATE_RUNNING, relname);
+   LockRelationOid(relid, ExclusiveLock);
+}
+
+static void
+bdr_sequencer_lock(void)
+{
+
+   bdr_sequencer_lock_rel("bdr_sequence_elections");
+   bdr_sequencer_lock_rel("bdr_sequence_values");
+   bdr_sequencer_lock_rel("bdr_votes");
+}
+
+void
+bdr_sequencer_vote(void)
+{
+   Oid         argtypes[4];
+   Datum       values[4];
+   bool        nulls[4];
+   char        local_sysid[32];
+   int         ret;
+   int         my_processed;
+
+   snprintf(local_sysid, sizeof(local_sysid), UINT64_FORMAT,
+            GetSystemIdentifier());
+
+   argtypes[0] = TEXTOID;
+   nulls[0] = false;
+   values[0] = CStringGetTextDatum(local_sysid);
+
+   argtypes[1] = OIDOID;
+   nulls[1] = false;
+   values[1] = ObjectIdGetDatum(ThisTimeLineID);
+
+   argtypes[2] = OIDOID;
+   values[2] = ObjectIdGetDatum(MyDatabaseId);
+   nulls[2] = false;
+
+   argtypes[3] = TEXTOID;
+   values[3] = CStringGetTextDatum("");
+   nulls[3] = false;
+
+   StartTransactionCommand();
+   SPI_connect();
+
+   bdr_sequencer_lock();
+   PushActiveSnapshot(GetTransactionSnapshot());
+
+again:
+   SetCurrentStatementStartTimestamp();
+   pgstat_report_activity(STATE_RUNNING, "sequence voting");
+   ret = SPI_execute_with_args(vote_sql, 4, argtypes,
+                               values, nulls, false, 0);
+
+   if (ret != SPI_OK_INSERT)
+       elog(ERROR, "blub");
+   my_processed = SPI_processed;
+   elog(LOG, "started %d votes", my_processed);
+
+   if (my_processed > 0)
+       goto again;
+
+   PopActiveSnapshot();
+   SPI_finish();
+   CommitTransactionCommand();
+
+}
+
+/*
+ * Check whether we need to initiate a voting procedure for getting new
+ * sequence chunks.
+ */
+void
+bdr_sequencer_start_elections(void)
+{
+   Oid         argtypes[4];
+   Datum       values[4];
+   bool        nulls[4];
+   char        local_sysid[32];
+   int         ret;
+
+   snprintf(local_sysid, sizeof(local_sysid), UINT64_FORMAT,
+            GetSystemIdentifier());
+
+   StartTransactionCommand();
+   SPI_connect();
+
+   bdr_sequencer_lock();
+   PushActiveSnapshot(GetTransactionSnapshot());
+
+   argtypes[0] = TEXTOID;
+   nulls[0] = false;
+   values[0] = CStringGetTextDatum(local_sysid);
+
+   argtypes[1] = OIDOID;
+   nulls[1] = false;
+   values[1] = ObjectIdGetDatum(ThisTimeLineID);
+
+   argtypes[2] = OIDOID;
+   values[2] = ObjectIdGetDatum(MyDatabaseId);
+   nulls[2] = false;
+
+   argtypes[3] = TEXTOID;
+   values[3] = CStringGetTextDatum("");
+   nulls[3] = false;
+
+   SetCurrentStatementStartTimestamp();
+   pgstat_report_activity(STATE_RUNNING, "start_elections");
+   ret = SPI_execute_with_args(start_elections_sql, 4, argtypes,
+                               values, nulls, false, 0);
+
+   if (ret != SPI_OK_INSERT_RETURNING)
+       elog(ERROR, "blub");
+
+   elog(LOG, "started %d elections", SPI_processed);
+
+   PopActiveSnapshot();
+   SPI_finish();
+   CommitTransactionCommand();
+}
+
+/*
+ * Check whether enough votes have come in for any of *our* in progress
+ * elections.
+ */
+void
+bdr_sequencer_tally(void)
+{
+   Oid         argtypes[5];
+   Datum       values[5];
+   bool        nulls[5];
+   char        local_sysid[32];
+   int         ret;
+
+   snprintf(local_sysid, sizeof(local_sysid), UINT64_FORMAT,
+            GetSystemIdentifier());
+
+   StartTransactionCommand();
+   SPI_connect();
+
+   bdr_sequencer_lock();
+   PushActiveSnapshot(GetTransactionSnapshot());
+
+   argtypes[0] = TEXTOID;
+   nulls[0] = false;
+   values[0] = CStringGetTextDatum(local_sysid);
+
+   argtypes[1] = OIDOID;
+   nulls[1] = false;
+   values[1] = ObjectIdGetDatum(ThisTimeLineID);
+
+   argtypes[2] = OIDOID;
+   values[2] = ObjectIdGetDatum(MyDatabaseId);
+   nulls[2] = false;
+
+   argtypes[3] = TEXTOID;
+   values[3] = CStringGetTextDatum("");
+   nulls[3] = false;
+
+   argtypes[4] = INT4OID;
+   values[4] = Int32GetDatum(bdr_sequencer_con->num_nodes);
+   nulls[4] = false;
+
+   SetCurrentStatementStartTimestamp();
+   pgstat_report_activity(STATE_RUNNING, "tally_elections");
+   ret = SPI_execute_with_args(tally_elections_sql, 5, argtypes,
+                               values, nulls, false, 0);
+
+   if (ret != SPI_OK_SELECT)
+       elog(ERROR, "blub");
+
+   elog(LOG, "tallied %d elections", SPI_processed);
+
+   PopActiveSnapshot();
+   SPI_finish();
+   CommitTransactionCommand();
+}
+
+
+static int
+bdr_sequence_value_cmp(const void *a, const void *b)
+{
+   const BdrSequenceValues *left = a;
+   const BdrSequenceValues *right = b;
+
+   if (left->start_value < right->start_value)
+       return -1;
+   if (left->start_value == right->start_value)
+       return 0;
+   return 1;
+}
+
+/*
+ * Replace a single (uninitialized or used up) chunk by a free one. Mark the
+ * new chunk from bdr_sequence_values as in_use.
+ *
+ * Returns whether we could find a chunk or not.
+ */
+static bool
+bdr_sequencer_fill_chunk(Oid seqoid, char *seqschema, char *seqname,
+                        BdrSequenceValues *curval)
+{
+   Oid         argtypes[6];
+   Datum       values[6];
+   bool        nulls[6];
+   char        local_sysid[32];
+   int         ret;
+   int64       lower, upper;
+   bool        success;
+
+   SPI_push();
+   SPI_connect();
+
+   snprintf(local_sysid, sizeof(local_sysid), UINT64_FORMAT,
+            GetSystemIdentifier());
+
+   argtypes[0] = TEXTOID;
+   nulls[0] = false;
+   values[0] = CStringGetTextDatum(local_sysid);
+
+   argtypes[1] = OIDOID;
+   nulls[1] = false;
+   values[1] = ObjectIdGetDatum(ThisTimeLineID);
+
+   argtypes[2] = OIDOID;
+   values[2] = ObjectIdGetDatum(MyDatabaseId);
+   nulls[2] = false;
+
+   argtypes[3] = TEXTOID;
+   values[3] = CStringGetTextDatum("");
+   nulls[3] = false;
+
+   argtypes[4] = TEXTOID;
+   values[4] = CStringGetTextDatum(seqschema);
+   nulls[4] = false;
+
+   argtypes[5] = TEXTOID;
+   values[5] = CStringGetTextDatum(seqname);
+   nulls[5] = false;
+
+   SetCurrentStatementStartTimestamp();
+   pgstat_report_activity(STATE_RUNNING, "get_chunk");
+
+   ret = SPI_execute_with_args(get_chunk_sql, 6, argtypes,
+                               values, nulls, false, 0);
+   if (ret != SPI_OK_UPDATE_RETURNING)
+       elog(ERROR, "blart");
+
+   if (SPI_processed != 1)
+   {
+       elog(NOTICE, "no free chunks for sequence %s.%s",
+            seqschema, seqname);
+       success = false;
+   }
+   else
+   {
+       HeapTuple   tup = SPI_tuptable->vals[0];
+       bool        isnull;
+
+       lower = DatumGetInt64(SPI_getbinval(tup, SPI_tuptable->tupdesc, 1, &isnull));
+       Assert(!isnull);
+       upper = DatumGetInt64(SPI_getbinval(tup, SPI_tuptable->tupdesc, 2, &isnull));
+       Assert(!isnull);
+
+       elog(NOTICE, "got chunk [%zu, %zu) for sequence %s.%s",
+            lower, upper, seqschema, seqname);
+       curval->start_value = lower;
+       curval->next_value = lower;
+       curval->end_value = upper;
+
+       success = true;
+   }
+   SPI_finish();
+   SPI_pop();
+
+   return success;
+}
+
+/*
+ * Search for used up chunks in one bdr sequence.
+ */
+static void
+bdr_sequencer_fill_sequence(Oid seqoid, char *seqschema, char *seqname)
+{
+   Buffer      buf;
+   SeqTable    elm;
+   Relation    rel;
+   HeapTupleData seqtuple;
+   Datum       values[SEQ_COL_LASTCOL];
+   bool        nulls[SEQ_COL_LASTCOL];
+   HeapTuple   newtup;
+   Page        page, temppage;
+   BdrSequenceValues *curval, *firstval;
+   int i;
+   bool acquired_new = false;
+
+   elog(LOG, "checking sequence %u: %s.%s",
+        seqoid, seqschema, seqname);
+
+   /* lock page, fill heaptup */
+   init_sequence(seqoid, &elm, &rel);
+   (void) read_seq_tuple(elm, rel, &buf, &seqtuple);
+
+   /* get values */
+   heap_deform_tuple(&seqtuple, RelationGetDescr(rel),
+                     values, nulls);
+
+   /* now make sure we have space for our own data */
+   if (nulls[SEQ_COL_AMDATA - 1])
+   {
+       struct varlena *vl = palloc0(VARHDRSZ + sizeof(BdrSequenceValues) * 10);
+       SET_VARSIZE(vl, VARHDRSZ + sizeof(BdrSequenceValues) * 10);
+       nulls[SEQ_COL_AMDATA - 1] = false;
+       values[SEQ_COL_AMDATA - 1] = PointerGetDatum(vl);
+   }
+
+   firstval = (BdrSequenceValues *)
+       VARDATA_ANY(DatumGetByteaP(values[SEQ_COL_AMDATA - 1]));
+   curval = firstval;
+
+   START_CRIT_SECTION();
+
+   MarkBufferDirty(buf);
+
+   for (i = 0; i < 10; i ++)
+   {
+       if (curval->next_value == curval->end_value)
+       {
+           if (curval->end_value > 0)
+               elog(LOG, "used up old chunk");
+
+           elog(LOG, "need new batch %i", i);
+           if (bdr_sequencer_fill_chunk(seqoid, seqschema, seqname, curval))
+               acquired_new = true;
+           else
+               break;
+       }
+       curval++;
+   }
+
+   if (!acquired_new)
+       goto done_with_sequence;
+
+   /* sort chunks, so we always use the smallest one first */
+   qsort(firstval, 10, sizeof(BdrSequenceValues), bdr_sequence_value_cmp);
+
+   newtup = heap_form_tuple(RelationGetDescr(rel), values, nulls);
+
+   /* special requirements for sequence tuples */
+   HeapTupleHeaderSetXmin(newtup->t_data, FrozenTransactionId);
+   newtup->t_data->t_infomask |= HEAP_XMIN_COMMITTED;
+   newtup->t_data->t_infomask |= HEAP_XMAX_INVALID;
+
+   page = BufferGetPage(buf);
+   temppage = PageGetTempPage(page);
+
+   /* replace page contents, the direct way */
+   PageInit(temppage, BufferGetPageSize(buf), PageGetSpecialSize(page));
+   memcpy(PageGetSpecialPointer(temppage),
+          PageGetSpecialPointer(page),
+          PageGetSpecialSize(page));
+
+   if (PageAddItem(temppage, (Item) newtup->t_data, newtup->t_len,
+                   FirstOffsetNumber, false, false) == InvalidOffsetNumber)
+       elog(PANIC, "fill_sequence: failed to add item to page");
+
+   PageSetLSN(temppage, PageGetLSN(page));
+
+   memcpy(page, temppage, BufferGetPageSize(buf));
+
+   seqtuple.t_len = newtup->t_len;
+
+   log_sequence_tuple(rel, &seqtuple, page);
+
+   END_CRIT_SECTION();
+
+done_with_sequence:
+   UnlockReleaseBuffer(buf);
+   heap_close(rel, NoLock);
+}
+
+/*
+ * Check whether all BDR sequences have enough values inline. If not, add
+ * some. This should be called after tallying (so we have a better chance to
+ * have enough chunks) but before starting new elections since we might use up
+ * existing chunks.
+ */
+void
+bdr_sequencer_fill_sequences(void)
+{
+   SPIPlanPtr  plan;
+   Portal      cursor;
+
+   StartTransactionCommand();
+   SPI_connect();
+
+   bdr_sequencer_lock();
+   PushActiveSnapshot(GetTransactionSnapshot());
+
+   plan = SPI_prepare(fill_sequences_sql, 0, NULL);
+   cursor = SPI_cursor_open("seq", plan, NULL, NULL, 0);
+
+   SetCurrentStatementStartTimestamp();
+   pgstat_report_activity(STATE_RUNNING, "fill_sequences");
+
+   SPI_cursor_fetch(cursor, true, 1);
+
+   while (SPI_processed > 0)
+   {
+       HeapTuple   tup = SPI_tuptable->vals[0];
+       bool        isnull;
+       Datum       seqoid;
+       Datum       seqschema;
+       Datum       seqname;
+
+       seqoid = SPI_getbinval(tup, SPI_tuptable->tupdesc, 1, &isnull);
+       Assert(!isnull);
+       seqschema = SPI_getbinval(tup, SPI_tuptable->tupdesc, 2, &isnull);
+       Assert(!isnull);
+       seqname = SPI_getbinval(tup, SPI_tuptable->tupdesc, 3, &isnull);
+       Assert(!isnull);
+
+       bdr_sequencer_fill_sequence(DatumGetObjectId(seqoid),
+                                   NameStr(*DatumGetName(seqschema)),
+                                   NameStr(*DatumGetName(seqname)));
+
+       SPI_cursor_fetch(cursor, true, 1);
+   }
+
+   PopActiveSnapshot();
+   SPI_finish();
+   CommitTransactionCommand();
+}
+
+
+/* check sequence.c */
+#define SEQ_LOG_VALS   32
+
+PG_FUNCTION_INFO_V1(bdr_sequence_alloc);
+void
+bdr_sequence_alloc(PG_FUNCTION_ARGS)
+{
+   Relation    seqrel = (Relation) PG_GETARG_POINTER(0);
+   SeqTable    elm = (SeqTable) PG_GETARG_POINTER(1);
+   Buffer      buf = (Buffer) PG_GETARG_INT32(2);
+   HeapTuple   seqtuple = (HeapTuple) PG_GETARG_POINTER(3);
+   Page        page;
+   Form_pg_sequence seq;
+   bool        logit = false;
+   int64       cache,
+               log,
+               fetch,
+               last;
+   int64       result = 0;
+   int64       next;
+   Datum       values;
+   bool        isnull;
+   BdrSequenceValues *curval;
+   int         i;
+   bool        wakeup = false;
+
+   page = BufferGetPage(buf);
+   seq = (Form_pg_sequence) GETSTRUCT(seqtuple);
+
+   values = fastgetattr(seqtuple, 11, RelationGetDescr(seqrel), &isnull);
+   if (isnull)
+       elog(ERROR, "uninitialized sequence");
+
+   curval = (BdrSequenceValues *) VARDATA_ANY(DatumGetByteaP(values));
+
+   Assert(seq->increment_by == 1);
+   /* XXX: check min/max */
+
+   last = next = seq->last_value;
+
+   fetch = cache = seq->cache_value;
+   log = seq->log_cnt;
+
+   /* check whether value can be satisfied without logging again */
+   if (log < fetch || !seq->is_called || PageGetLSN(page) <= GetRedoRecPtr())
+   {
+       /* forced log to satisfy local demand for values */
+       fetch = log = fetch + SEQ_LOG_VALS;
+       logit = true;
+   }
+
+   /*
+    * try to fetch cache [+ log ] numbers, check all 10 possible chunks
+    */
+   for (i = 0; i < 10; i ++)
+   {
+       /* redo recovered after crash*/
+       if (seq->last_value >= curval->next_value &&
+           seq->last_value < curval->end_value)
+       {
+           curval->next_value = seq->last_value + 1;
+       }
+
+       /* chunk empty */
+       if (curval->next_value >= curval->end_value)
+       {
+           curval++;
+           continue;
+       }
+
+
+       /* there's space in current chunk, use it */
+       result = curval->next_value;
+
+       /* but not enough for all ..log values */
+       if (result + log >= curval->end_value)
+       {
+           log = curval->end_value - curval->next_value;
+           wakeup = true;
+           logit = true;
+       }
+
+       /* but not enough for all ..cached values */
+       last = result + cache - 1;
+       if (last >= curval->end_value)
+       {
+           last = curval->end_value - 1;
+           wakeup = true;
+           logit = true;
+       }
+
+       curval->next_value = last;
+       break;
+   }
+
+   if (result == 0)
+   {
+       bdr_sequencer_wakeup();
+       ereport(ERROR,
+               (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+                errmsg("could not find free sequence value")));
+   }
+
+   if (wakeup)
+       bdr_sequencer_wakeup();
+
+   elm->last = result;
+   elm->cached = result;
+   elm->last_valid = true;
+
+   /* ready to change the on-disk (or really, in-buffer) tuple */
+   START_CRIT_SECTION();
+
+   /*
+    * We must mark the buffer dirty before doing XLogInsert(); see notes in
+    * SyncOneBuffer().  However, we don't apply the desired changes just yet.
+    * This looks like a violation of the buffer update protocol, but it is
+    * in fact safe because we hold exclusive lock on the buffer.  Any other
+    * process, including a checkpoint, that tries to examine the buffer
+    * contents will block until we release the lock, and then will see the
+    * final state that we install below.
+    */
+   MarkBufferDirty(buf);
+
+   if (logit)
+   {
+       /*
+        * We don't log the current state of the tuple, but rather the state
+        * as it would appear after "log" more fetches.  This lets us skip
+        * that many future WAL records, at the cost that we lose those
+        * sequence values if we crash.
+        */
+       seq->last_value = next;
+       seq->is_called = true;
+       seq->log_cnt = 0;
+       log_sequence_tuple(seqrel, seqtuple, page);
+   }
+
+   /* Now update sequence tuple to the intended final state */
+   seq->last_value = elm->last; /* last fetched number */
+   seq->is_called = true;
+   seq->log_cnt = log; /* how much is logged */
+
+   result = elm->last;
+
+   END_CRIT_SECTION();
+}
+
+PG_FUNCTION_INFO_V1(bdr_sequence_setval);
+void
+bdr_sequence_setval(PG_FUNCTION_ARGS)
+{
+   Relation    seqrel = (Relation) PG_GETARG_POINTER(0);
+   Buffer      buf = (Buffer) PG_GETARG_INT32(2);
+   HeapTuple   seqtuple = (HeapTuple) PG_GETARG_POINTER(3);
+   int64       next = PG_GETARG_INT64(4);
+   bool        iscalled = PG_GETARG_BOOL(5);
+   Page        page = BufferGetPage(buf);
+   Form_pg_sequence seq = (Form_pg_sequence) GETSTRUCT(seqtuple);
+
+   /* ready to change the on-disk (or really, in-buffer) tuple */
+   START_CRIT_SECTION();
+
+   /* set is_called, all AMs should need to do this */
+   seq->is_called = iscalled;
+   seq->last_value = next;     /* last fetched number */
+   seq->log_cnt = 0;
+
+   MarkBufferDirty(buf);
+
+   log_sequence_tuple(seqrel, seqtuple, page);
+
+   END_CRIT_SECTION();
+}
+
+PG_FUNCTION_INFO_V1(bdr_sequence_options);
+Datum
+bdr_sequence_options(PG_FUNCTION_ARGS)
+{
+   Datum       reloptions = PG_GETARG_DATUM(0);
+   bool        validate = PG_GETARG_BOOL(1);
+   bytea      *result;
+
+   result = default_reloptions(reloptions, validate, RELOPT_KIND_SEQUENCE);
+   if (result)
+       PG_RETURN_BYTEA_P(result);
+
+   PG_RETURN_NULL();
+}
diff --git a/contrib/bdr/expected/extension.out b/contrib/bdr/expected/extension.out
new file mode 100644 (file)
index 0000000..18ab061
--- /dev/null
@@ -0,0 +1,20 @@
+-- check extension creation works
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
+SELECT * FROM bdr.pg_stat_bdr;
+ rep_node_id | riremotesysid | riremotedb | rilocaldb | nr_commit | nr_rollback | nr_insert | nr_insert_conflict | nr_update | nr_update_conflict | nr_delete | nr_delete_conflict | nr_disconnect 
+-------------+---------------+------------+-----------+-----------+-------------+-----------+--------------------+-----------+--------------------+-----------+--------------------+---------------
+(0 rows)
+
+-- check that only superusers can do this
+CREATE ROLE bdr_no_special_perms;
+SET ROLE bdr_no_special_perms;
+SELECT * FROM bdr.pg_stat_bdr;
+ERROR:  permission denied for relation pg_stat_bdr
+SELECT * FROM bdr.pg_stat_get_bdr();
+ERROR:  permission denied for function pg_stat_get_bdr
+-- reacquire permissions, drop extension, role
+RESET role;
+DROP EXTENSION bdr;
+DROP EXTENSION btree_gist;
+DROP ROLE bdr_no_special_perms;
diff --git a/contrib/bdr/output.mk b/contrib/bdr/output.mk
new file mode 100644 (file)
index 0000000..5a33339
--- /dev/null
@@ -0,0 +1,19 @@
+# contrib/bdr/output.mk
+
+MODULE_big = bdr_output
+OBJS = bdr_output.o
+
+PG_CPPFLAGS = -I$(libpq_srcdir)
+SHLIB_LINK = $(libpq)
+SHLIB_PREREQS = submake-libpq
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/bdr
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/bdr/sql/extension.sql b/contrib/bdr/sql/extension.sql
new file mode 100644 (file)
index 0000000..84574eb
--- /dev/null
@@ -0,0 +1,16 @@
+-- check extension creation works
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
+SELECT * FROM bdr.pg_stat_bdr;
+
+-- check that only superusers can do this
+CREATE ROLE bdr_no_special_perms;
+SET ROLE bdr_no_special_perms;
+SELECT * FROM bdr.pg_stat_bdr;
+SELECT * FROM bdr.pg_stat_get_bdr();
+
+-- reacquire permissions, drop extension, role
+RESET role;
+DROP EXTENSION bdr;
+DROP EXTENSION btree_gist;
+DROP ROLE bdr_no_special_perms;
diff --git a/contrib/bdr/worker.mk b/contrib/bdr/worker.mk
new file mode 100644 (file)
index 0000000..7520b42
--- /dev/null
@@ -0,0 +1,42 @@
+# contrib/bdr/worker.mk
+
+MODULE_big = bdr
+OBJS = bdr.o bdr_apply.o bdr_count.o bdr_seq.o
+
+EXTENSION = bdr
+DATA = bdr--0.5.sql
+
+PG_CPPFLAGS = -I$(libpq_srcdir)
+SHLIB_LINK = $(libpq)
+SHLIB_PREREQS = submake-libpq
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/bdr
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+# Disabled because these tests require "wal_level=logical", which
+# typical installcheck users do not have (e.g. buildfarm clients).
+installcheck:;
+
+submake-regress:
+   $(MAKE) -C $(top_builddir)/src/test/regress
+
+submake-btree_gist:
+   $(MAKE) -C $(top_builddir)/contrib/btree_gist
+
+check: all | submake-regress submake-btree_gist
+   $(pg_regress_check) \
+       --temp-config $(top_srcdir)/contrib/bdr/bdr.conf \
+       --temp-install=./tmp_check \
+       --extra-install=contrib/btree_gist \
+       --extra-install=contrib/bdr \
+       extension
+
+PHONY: submake-regress
index 47810b48c9235430a872935edad3eabb5bd50c06..99cbc14625bb4027c751d3a00b4b7dedd777d441 100644 (file)
@@ -50,7 +50,7 @@ my $contrib_extraincludes =
 my $contrib_extrasource = {
    'cube' => [ 'cubescan.l', 'cubeparse.y' ],
    'seg'  => [ 'segscan.l',  'segparse.y' ], };
-my @contrib_excludes = ('pgcrypto', 'intagg', 'sepgsql');
+my @contrib_excludes = ('pgcrypto', 'intagg', 'sepgsql', 'bdr');
 
 sub mkvcbuild
 {
@@ -510,6 +510,20 @@ sub mkvcbuild
    my $mf = Project::read_file('contrib/pgcrypto/Makefile');
    GenerateContribSqlFiles('pgcrypto', $mf);
 
+   # so is bdr
+   my $bdr_output = $solution->AddProject('bdr_output', 'dll', 'misc');
+   $bdr_output->AddFiles('contrib\bdr', 'bdr_output.c');
+   $bdr_output->AddReference($postgres);
+   $bdr_output->AddLibrary('wsock32.lib');
+
+   my $bdr_apply = $solution->AddProject('bdr_apply', 'dll', 'misc');
+   $bdr_apply->AddFiles('contrib\bdr', 'bdr.c', 'bdr_apply.c',
+                'bdr_count.c', 'bdr_seq.c');
+   $bdr_apply->AddReference($postgres);
+   $bdr_apply->AddLibrary('wsock32.lib');
+   $bdr_apply->AddIncludeDir('src\interfaces\libpq');
+   $bdr_apply->AddReference($libpq);
+
    my $D;
    opendir($D, 'contrib') || croak "Could not opendir on contrib!\n";
    while (my $d = readdir($D))