Import Simon's hot standby patch v9d. hotstandbyv9d
authorHeikki Linnakangas <[email protected]>
Fri, 23 Jan 2009 12:34:24 +0000 (14:34 +0200)
committerHeikki Linnakangas <[email protected]>
Fri, 23 Jan 2009 12:34:24 +0000 (14:34 +0200)
78 files changed:
doc/src/sgml/config.sgml
doc/src/sgml/func.sgml
src/backend/access/gin/ginxlog.c
src/backend/access/gist/gistxlog.c
src/backend/access/heap/heapam.c
src/backend/access/heap/pruneheap.c
src/backend/access/index/genam.c
src/backend/access/index/indexam.c
src/backend/access/nbtree/README
src/backend/access/nbtree/nbtinsert.c
src/backend/access/nbtree/nbtpage.c
src/backend/access/nbtree/nbtree.c
src/backend/access/nbtree/nbtxlog.c
src/backend/access/transam/README
src/backend/access/transam/clog.c
src/backend/access/transam/multixact.c
src/backend/access/transam/rmgr.c
src/backend/access/transam/slru.c
src/backend/access/transam/subtrans.c
src/backend/access/transam/transam.c
src/backend/access/transam/twophase.c
src/backend/access/transam/varsup.c
src/backend/access/transam/xact.c
src/backend/access/transam/xlog.c
src/backend/catalog/storage.c
src/backend/commands/dbcommands.c
src/backend/commands/discard.c
src/backend/commands/indexcmds.c
src/backend/commands/lockcmds.c
src/backend/commands/sequence.c
src/backend/commands/tablespace.c
src/backend/commands/user.c
src/backend/commands/vacuum.c
src/backend/commands/vacuumlazy.c
src/backend/postmaster/bgwriter.c
src/backend/postmaster/postmaster.c
src/backend/storage/buffer/README
src/backend/storage/buffer/bufmgr.c
src/backend/storage/ipc/procarray.c
src/backend/storage/ipc/sinvaladt.c
src/backend/storage/lmgr/lock.c
src/backend/storage/lmgr/lwlock.c
src/backend/storage/lmgr/proc.c
src/backend/tcop/postgres.c
src/backend/tcop/utility.c
src/backend/utils/adt/txid.c
src/backend/utils/cache/inval.c
src/backend/utils/error/elog.c
src/backend/utils/init/flatfiles.c
src/backend/utils/init/postinit.c
src/backend/utils/misc/guc.c
src/backend/utils/time/snapmgr.c
src/backend/utils/time/tqual.c
src/include/access/heapam.h
src/include/access/htup.h
src/include/access/nbtree.h
src/include/access/relscan.h
src/include/access/rmgr.h
src/include/access/subtrans.h
src/include/access/transam.h
src/include/access/xact.h
src/include/access/xlog.h
src/include/access/xlog_internal.h
src/include/catalog/pg_control.h
src/include/catalog/pg_proc.h
src/include/miscadmin.h
src/include/postmaster/bgwriter.h
src/include/storage/bufmgr.h
src/include/storage/lwlock.h
src/include/storage/proc.h
src/include/storage/procarray.h
src/include/storage/sinval.h
src/include/storage/sinvaladt.h
src/include/utils/flatfiles.h
src/include/utils/inval.h
src/include/utils/snapshot.h
src/test/regress/parallel_schedule
src/test/regress/serial_schedule

index ec9a46d778a02ff80ffb607d92fdcd80f6be3f26..63d93022bb5cd25975bd883c3f7f407a1dac951d 100644 (file)
@@ -370,6 +370,12 @@ SET ENABLE_SEQSCAN TO OFF;
         allows. See <xref linkend="sysvipc"> for information on how to
         adjust those parameters, if necessary.
        </para>
+
+       <para>
+       When running a standby server it is strongly recommended that you
+       set this parameter to be the same or higher than the master server,
+       otherwise queries on the standby server may fail.
+       </para>
       </listitem>
      </varlistentry>
 
@@ -5383,6 +5389,32 @@ plruby.use_strict = true        # generates error: unknown class name
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-trace-recovery-messages" xreflabel="trace_recovery_messages">
+      <term><varname>trace_recovery_messages</varname> (<type>string</type>)</term>
+      <indexterm>
+       <primary><varname>trace_recovery_messages</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Controls which message levels are written to the server log
+               for system modules needed for recovery processing. This allows
+               the user to override the normal setting of log_min_messages,
+               but only for specific messages. This is intended for use in
+               debugging Hot Standby.
+        Valid values are <literal>DEBUG5</>, <literal>DEBUG4</>,
+        <literal>DEBUG3</>, <literal>DEBUG2</>, <literal>DEBUG1</>,
+        <literal>INFO</>, <literal>NOTICE</>, <literal>WARNING</>,
+        <literal>ERROR</>, <literal>LOG</>, <literal>FATAL</>, and
+        <literal>PANIC</>.  Each level includes all the levels that
+        follow it.  The later the level, the fewer messages are sent
+        to the log.  The default is <literal>WARNING</>.  Note that
+        <literal>LOG</> has a different rank here than in
+        <varname>client_min_messages</>.
+        This parameter should be set only in postgresql.conf.
+       </para>
+      </listitem>
+     </varlistentry>
+
     <varlistentry id="guc-zero-damaged-pages" xreflabel="zero_damaged_pages">
       <term><varname>zero_damaged_pages</varname> (<type>boolean</type>)</term>
       <indexterm>
index 1900d6a5fca9489dfa724903530e892c1d1e18ab..c835ddc0695d928f4dd2f84b81df654fdc9e35ce 100644 (file)
@@ -12894,6 +12894,193 @@ postgres=# select * from pg_xlogfile_name_offset(pg_stop_backup());
     <xref linkend="continuous-archiving">.
    </para>
 
+   <indexterm>
+    <primary>pg_is_in_recovery</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_last_recovered_xact_timestamp</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_last_recovered_xid</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_last_recovered_xlog_location</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_recovery_pause</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_recovery_continue</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_recovery_pause_xid</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_recovery_pause_time</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_recovery_stop</primary>
+   </indexterm>
+
+   <para>
+    The functions shown in <xref
+    linkend="functions-admin-recovery-table"> assist in archive recovery.
+    Except for the first three functions, these are restricted to superusers.
+       All of these functions can only be executed during recovery.
+   </para>
+
+   <table id="functions-admin-recovery-table">
+    <title>Recovery Control Functions</title>
+    <tgroup cols="3">
+     <thead>
+      <row><entry>Name</entry> <entry>Return Type</entry> <entry>Description</entry>
+      </row>
+     </thead>
+
+     <tbody>
+      <row>
+       <entry>
+        <literal><function>pg_is_in_recovery</function>()</literal>
+        </entry>
+       <entry><type>bool</type></entry>
+       <entry>True if recovery is still in progress.</entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_last_recovered_xact_timestamp</function>()</literal>
+        </entry>
+       <entry><type>timestamp with time zone</type></entry>
+       <entry>Returns the original completion timestamp with timezone of the 
+               last recovered transaction. If recovery is still in progress this
+               will increase monotonically while if recovery is complete then this 
+               value will remain static at the value of the last transaction applied
+               during that recovery. When the server has been started normally this 
+               will return a default value.
+          </entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_last_recovered_xid</function>()</literal>
+        </entry>
+       <entry><type>integer</type></entry>
+       <entry>Returns the transaction id (32-bit) of the last completed transaction
+               in the current recovery. Later numbered transaction ids may already have
+               completed, so the value could in some cases be lower than the last time
+               this function executed. If recovery is complete then this value will 
+               remain static at the value of the last transaction applied during that
+               recovery. When the server has been started normally this will return 
+               InvalidXid (zero).
+          </entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_last_recovered_xlog_location</function>()</literal>
+        </entry>
+       <entry><type>text</type></entry>
+       <entry>Returns the transaction log location of the last WAL record
+               in the current recovery. If recovery is still in progress this
+               will increase monotonically. If recovery is complete then this value will 
+               remain static at the value of the last transaction applied during that
+               recovery. When the server has been started normally this will return 
+               InvalidXLogRecPtr
+               (0/0).
+          </entry>
+      </row>
+
+      <row>
+       <entry>
+        <literal><function>pg_recovery_pause</function>()</literal>
+        </entry>
+       <entry><type>void</type></entry>
+       <entry>Pause recovery processing, unconditionally.</entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_recovery_continue</function>()</literal>
+        </entry>
+       <entry><type>void</type></entry>
+       <entry>If recovery is paused, continue processing.</entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_recovery_stop</function>()</literal>
+        </entry>
+       <entry><type>void</type></entry>
+       <entry>End recovery and begin normal processing.</entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_recovery_pause_xid</function>()</literal>
+        </entry>
+       <entry><type>void</type></entry>
+       <entry>Continue recovery until specified xid completes, if it is ever 
+               seen, then pause recovery.
+          </entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_recovery_pause_time</function>()</literal>
+        </entry>
+       <entry><type>void</type></entry>
+       <entry>Continue recovery until a transaction with specified timestamp 
+               completes, if one is ever seen, then pause recovery.
+          </entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_recovery_advance</function>()</literal>
+        </entry>
+       <entry><type>void</type></entry>
+       <entry>Advance recovery specified number of records then pause.</entry>
+      </row>
+     </tbody>
+    </tgroup>
+   </table>
+
+   <para>
+    <function>pg_recovery_pause</> and <function>pg_recovery_continue</> allow
+       a superuser to control the progress of recovery on the database server.
+       While recovery is paused queries can then be executed to determine how far
+       forwards recovery should progress. Recovery can never go backwards
+       because previous values are overwritten.  If the superuser wishes recovery
+       to complete and normal processing mode to start, execute 
+       <function>pg_recovery_stop</>.
+   </para>
+
+   <para>
+       Variations of the pause function exist, mainly to allow PITR to dynamically
+       control where it should progress to. <function>pg_recovery_pause_xid</> and 
+       <function>pg_recovery_pause_time</> allow the specification of a trial
+       recovery target, similarly to <xref linkend="recovery-config-settings">.
+       Recovery will then progress to the specified point and then pause, rather
+       than stopping permanently, allowing assessment of whether this is the
+       desired stopping point for recovery.
+   </para>
+
+   <para>
+       <function>pg_recovery_advance</> allows recovery to progress record by
+       record, for very careful analysis or debugging. Step size can be 1 or
+       more records. If recovery is not yet paused then <function>pg_recovery_advance</>
+       will process the specified number of records then pause. If recovery
+       is already paused, recovery will continue for another N records before
+       pausing again.
+   </para>
+
+   <para>
+       If you pause recovery while the server is waiting for a WAL file when 
+       operating in standby mode it will have apparently no effect until the 
+       file arrives. Once the server begins processing WAL records again it
+       will notice the pause request and will act upon it. This delay is
+       expected behavior and is not a bug.
+   </para>
+
+   <para>
+       Pausing recovery will also prevent restartpoints from starting since they
+       are triggered by events in the WAL stream. In all other ways processing
+       will continue, for example the background writer will continue to clean
+       shared_buffers while paused.
+   </para>
+
    <para>
     The functions shown in <xref linkend="functions-admin-dbsize"> calculate
     the actual disk space usage of database objects.
index 8382576d3c07879f1b9312b44b3eb31d59911886..7661c97fa39b5c12c00478fe2b1d092f305c0d12 100644 (file)
@@ -14,6 +14,7 @@
 #include "postgres.h"
 
 #include "access/gin.h"
+#include "access/xact.h"
 #include "access/xlogutils.h"
 #include "storage/bufmgr.h"
 #include "utils/memutils.h"
@@ -438,6 +439,9 @@ gin_redo(XLogRecPtr lsn, XLogRecord *record)
 {
        uint8           info = record->xl_info & ~XLR_INFO_MASK;
 
+       if (InArchiveRecovery)
+               (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
        RestoreBkpBlocks(lsn, record, false);
 
        topCtx = MemoryContextSwitchTo(opCtx);
index 4a20d905d4e4bb70c404444ec254bac8eeea5128..3888bca9454f41548578b9ea7819f9ea233a0eba 100644 (file)
@@ -14,6 +14,7 @@
 #include "postgres.h"
 
 #include "access/gist_private.h"
+#include "access/xact.h"
 #include "access/xlogutils.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
@@ -396,6 +397,9 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record)
        uint8           info = record->xl_info & ~XLR_INFO_MASK;
        MemoryContext oldCxt;
 
+       if (InArchiveRecovery)
+               (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
        RestoreBkpBlocks(lsn, record, false);
 
        oldCxt = MemoryContextSwitchTo(opCtx);
index 52115cf64e1832679b4762ed4e025ba3be29d1ea..f2b45a2e637f913c9171b6318a3b8695269dcb21 100644 (file)
@@ -3813,6 +3813,61 @@ heap_restrpos(HeapScanDesc scan)
        }
 }
 
+/*
+ * Update the latestRemovedXid for the current VACUUM. This gets called
+ * only rarely, since we probably already removed rows earlier.
+ * see comments for vacuum_log_cleanup_info().
+ */
+void
+HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, 
+                                                                               TransactionId *latestRemovedXid)
+{
+       TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
+       TransactionId xmax = HeapTupleHeaderGetXmax(tuple);
+       TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
+
+       if (tuple->t_infomask & HEAP_MOVED_OFF || 
+               tuple->t_infomask & HEAP_MOVED_IN)
+       {
+               if (TransactionIdPrecedes(*latestRemovedXid, xvac))
+                       *latestRemovedXid = xvac;
+       }
+
+       if (TransactionIdPrecedes(*latestRemovedXid, xmax))
+               *latestRemovedXid = xmax;
+
+       if (TransactionIdPrecedes(*latestRemovedXid, xmin))
+               *latestRemovedXid = xmin;
+
+       Assert(TransactionIdIsValid(*latestRemovedXid));
+}
+
+/*
+ * Perform XLogInsert to register a heap cleanup info message. These
+ * messages are sent once per VACUUM and are required because
+ * of the phasing of removal operations during a lazy VACUUM.
+ * see comments for vacuum_log_cleanup_info().
+ */
+XLogRecPtr
+log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid)
+{
+       xl_heap_cleanup_info xlrec;
+       XLogRecPtr      recptr;
+       XLogRecData rdata;
+
+       xlrec.node = rnode;
+       xlrec.latestRemovedXid = latestRemovedXid;
+
+       rdata.data = (char *) &xlrec;
+       rdata.len = SizeOfHeapCleanupInfo;
+       rdata.buffer = InvalidBuffer;
+       rdata.next = NULL;
+
+       recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO, &rdata);
+
+       return recptr;
+}
+
 /*
  * Perform XLogInsert for a heap-clean operation.  Caller must already
  * have modified the buffer and marked it dirty.
@@ -3820,13 +3875,17 @@ heap_restrpos(HeapScanDesc scan)
  * Note: prior to Postgres 8.3, the entries in the nowunused[] array were
  * zero-based tuple indexes.  Now they are one-based like other uses
  * of OffsetNumber.
+ *
+ * For 8.4 we also include the latestRemovedXid which allows recovery
+ * processing to cancel long standby queries that would have their
+ * results changed if we applied these changes.
  */
 XLogRecPtr
 log_heap_clean(Relation reln, Buffer buffer,
                           OffsetNumber *redirected, int nredirected,
                           OffsetNumber *nowdead, int ndead,
                           OffsetNumber *nowunused, int nunused,
-                          bool redirect_move)
+                          TransactionId latestRemovedXid, bool redirect_move)
 {
        xl_heap_clean xlrec;
        uint8           info;
@@ -3838,6 +3897,7 @@ log_heap_clean(Relation reln, Buffer buffer,
 
        xlrec.node = reln->rd_node;
        xlrec.block = BufferGetBlockNumber(buffer);
+       xlrec.latestRemovedXid = latestRemovedXid;
        xlrec.nredirected = nredirected;
        xlrec.ndead = ndead;
 
@@ -4108,6 +4168,46 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
        return recptr;
 }
 
+/*
+ * Handles CLEANUP_INFO
+ */
+static void
+heap_xlog_cleanup_info(XLogRecPtr lsn, XLogRecord *record)
+{
+       xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record);
+
+       if (InArchiveRecovery && 
+               RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid) &&
+               LatestRemovedXidAdvances(xlrec->latestRemovedXid))
+       {
+               VirtualTransactionId *old_snapshots;
+
+               /*
+                * Make sure the incoming transaction is emulated as running
+                * prior to allowing any changes that could affect correctness
+                * of MVCC for standby queries.
+                *
+                * Note that we will specifically exclude sessions with no
+                * current snapshot, specifically idle in transaction sessions
+                * that are neither serializable nor have active cursors.
+                */
+           old_snapshots = GetConflictingVirtualXIDs(xlrec->latestRemovedXid, 
+                                                                                                       xlrec->node.dbNode,
+                                                                       InvalidTransactionId);
+
+               ResolveRecoveryConflictWithVirtualXIDs(old_snapshots,
+                                                                                               "heap cleanup info",
+                                                                                               ERROR,
+                                                                                               lsn);
+       }
+
+       /* 
+        * Actual operation is a no-op. Record type exists to provide a means
+        * for conflict processing to occur before we begin index vacuum actions.
+        * see vacuumlazy.c
+        */
+}
+
 /*
  * Handles CLEAN and CLEAN_MOVE record types
  */
@@ -4126,12 +4226,34 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record, bool clean_move)
        int                     nunused;
        Size            freespace;
 
+       if (InArchiveRecovery && 
+               RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid) &&
+               LatestRemovedXidAdvances(xlrec->latestRemovedXid))
+       {
+               VirtualTransactionId *old_snapshots;
+
+               /*
+                * see comments in heap_xlog_cleanup_info()
+                */
+           old_snapshots = GetConflictingVirtualXIDs(xlrec->latestRemovedXid, 
+                                                                                                       xlrec->node.dbNode,
+                                                                       InvalidTransactionId);
+
+               ResolveRecoveryConflictWithVirtualXIDs(old_snapshots,
+                                                                                               "heap cleanup",
+                                                                                               ERROR,
+                                                                                               lsn);
+       }
+
+       RestoreBkpBlocks(lsn, record, true);
+
        if (record->xl_info & XLR_BKP_BLOCK_1)
                return;
 
-       buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+       buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
        if (!BufferIsValid(buffer))
                return;
+       LockBufferForCleanup(buffer);
        page = (Page) BufferGetPage(buffer);
 
        if (XLByteLE(lsn, PageGetLSN(page)))
@@ -4186,12 +4308,18 @@ heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
        Buffer          buffer;
        Page            page;
 
+       if (InArchiveRecovery)
+               (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
+       RestoreBkpBlocks(lsn, record, false);
+
        if (record->xl_info & XLR_BKP_BLOCK_1)
                return;
 
-       buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+       buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
        if (!BufferIsValid(buffer))
                return;
+       LockBufferForCleanup(buffer);
        page = (Page) BufferGetPage(buffer);
 
        if (XLByteLE(lsn, PageGetLSN(page)))
@@ -4777,6 +4905,9 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record)
 {
        uint8           info = record->xl_info & ~XLR_INFO_MASK;
 
+       if (InArchiveRecovery)
+               (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
        RestoreBkpBlocks(lsn, record, false);
 
        switch (info & XLOG_HEAP_OPMASK)
@@ -4818,17 +4949,17 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
        switch (info & XLOG_HEAP_OPMASK)
        {
                case XLOG_HEAP2_FREEZE:
-                       RestoreBkpBlocks(lsn, record, false);
                        heap_xlog_freeze(lsn, record);
                        break;
                case XLOG_HEAP2_CLEAN:
-                       RestoreBkpBlocks(lsn, record, true);
                        heap_xlog_clean(lsn, record, false);
                        break;
                case XLOG_HEAP2_CLEAN_MOVE:
-                       RestoreBkpBlocks(lsn, record, true);
                        heap_xlog_clean(lsn, record, true);
                        break;
+               case XLOG_HEAP2_CLEANUP_INFO:
+                       heap_xlog_cleanup_info(lsn, record);
+                       break;
                default:
                        elog(PANIC, "heap2_redo: unknown op code %u", info);
        }
@@ -4958,17 +5089,26 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
        {
                xl_heap_clean *xlrec = (xl_heap_clean *) rec;
 
-               appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u",
+               appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u remxid %u",
                                                 xlrec->node.spcNode, xlrec->node.dbNode,
-                                                xlrec->node.relNode, xlrec->block);
+                                                xlrec->node.relNode, xlrec->block,
+                                                xlrec->latestRemovedXid);
        }
        else if (info == XLOG_HEAP2_CLEAN_MOVE)
        {
                xl_heap_clean *xlrec = (xl_heap_clean *) rec;
 
-               appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u",
+               appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u remxid %u",
                                                 xlrec->node.spcNode, xlrec->node.dbNode,
-                                                xlrec->node.relNode, xlrec->block);
+                                                xlrec->node.relNode, xlrec->block,
+                                                xlrec->latestRemovedXid);
+       }
+       else if (info == XLOG_HEAP2_CLEANUP_INFO)
+       {
+               xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec;
+
+               appendStringInfo(buf, "cleanup info: remxid %u",
+                                                xlrec->latestRemovedXid);
        }
        else
                appendStringInfo(buf, "UNKNOWN");
index 2691666e39ced2c6a84e61837a91d7898c5d4e9a..8c8bbd83557700fdae0b8599b64ba51c558af0ca 100644 (file)
@@ -30,6 +30,7 @@
 typedef struct
 {
        TransactionId new_prune_xid;    /* new prune hint value for page */
+       TransactionId latestRemovedXid; /* latest xid to be removed by this prune */
        int                     nredirected;            /* numbers of entries in arrays below */
        int                     ndead;
        int                     nunused;
@@ -84,6 +85,14 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
        if (!PageIsPrunable(page, OldestXmin))
                return;
 
+       /*
+        * We can't write WAL in recovery mode, so there's no point trying to
+        * clean the page. The master will likely issue a cleaning WAL record
+        * soon anyway, so this is no particular loss.
+        */
+       if (IsRecoveryProcessingMode())
+               return;
+
        /*
         * We prune when a previous UPDATE failed to find enough space on the page
         * for a new tuple version, or when free space falls below the relation's
@@ -176,6 +185,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
         * Also initialize the rest of our working state.
         */
        prstate.new_prune_xid = InvalidTransactionId;
+       prstate.latestRemovedXid = InvalidTransactionId;
        prstate.nredirected = prstate.ndead = prstate.nunused = 0;
        memset(prstate.marked, 0, sizeof(prstate.marked));
 
@@ -258,7 +268,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
                                                                        prstate.redirected, prstate.nredirected,
                                                                        prstate.nowdead, prstate.ndead,
                                                                        prstate.nowunused, prstate.nunused,
-                                                                       redirect_move);
+                                                                       prstate.latestRemovedXid, redirect_move);
 
                        PageSetLSN(BufferGetPage(buffer), recptr);
                        PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
@@ -396,6 +406,8 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
                                == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
                        {
                                heap_prune_record_unused(prstate, rootoffnum);
+                               HeapTupleHeaderAdvanceLatestRemovedXid(htup, 
+                                                                                                          &prstate->latestRemovedXid);
                                ndeleted++;
                        }
 
@@ -521,7 +533,11 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
                 * find another DEAD tuple is a fairly unusual corner case.)
                 */
                if (tupdead)
+               {
                        latestdead = offnum;
+                       HeapTupleHeaderAdvanceLatestRemovedXid(htup, 
+                                                                                                  &prstate->latestRemovedXid);
+               }
                else if (!recent_dead)
                        break;
 
index 88baa7c90432ac5f32fb587b509fdcc74ca50445..fb2b06aa88add6d5d9269f118cd794cc0402c189 100644 (file)
@@ -89,8 +89,19 @@ RelationGetIndexScan(Relation indexRelation,
        else
                scan->keyData = NULL;
 
+       /*
+        * During recovery we ignore killed tuples and don't bother to kill them
+        * either. We do this because the xmin on the primary node could easily
+        * be later than the xmin on the standby node, so that what the primary
+        * thinks is killed is supposed to be visible on standby. So for correct
+        * MVCC for queries during recovery we must ignore these hints and check
+        * all tuples. Do *not* set ignore_killed_tuples to true when running
+        * in a transaction that was started during recovery. AMs can set it to
+        * false at any time. xactStartedInRecovery should not be touched by AMs.
+        */
        scan->kill_prior_tuple = false;
-       scan->ignore_killed_tuples = true;      /* default setting */
+       scan->xactStartedInRecovery = TransactionStartedDuringRecovery();
+       scan->ignore_killed_tuples = !scan->xactStartedInRecovery;
 
        scan->opaque = NULL;
 
index 92eec92babca01a37649d737bf5b598929db5fe0..09da208329deb1b20aaf99e9be7d890c42b9206f 100644 (file)
@@ -455,9 +455,12 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
 
                        /*
                         * If we scanned a whole HOT chain and found only dead tuples,
-                        * tell index AM to kill its entry for that TID.
+                        * tell index AM to kill its entry for that TID. We do not do
+                        * this when in recovery because it may violate MVCC to do so. 
+                        * see comments in RelationGetIndexScan().
                         */
-                       scan->kill_prior_tuple = scan->xs_hot_dead;
+                       if (!scan->xactStartedInRecovery)
+                               scan->kill_prior_tuple = scan->xs_hot_dead;
 
                        /*
                         * The AM's gettuple proc finds the next index entry matching the
index 81d56b3a6b804b9cc978a6d6ebcdfcf79e70d5df..aee8f8fe24b979b12eb0ceca001187ea48350bfa 100644 (file)
@@ -401,6 +401,27 @@ of the WAL entry.)  If the parent page becomes half-dead but is not
 immediately deleted due to a subsequent crash, there is no loss of
 consistency, and the empty page will be picked up by the next VACUUM.
 
+Scans during Recovery
+---------------------
+
+The btree index type can be safely used during recovery. During recovery
+we have at most one writer and potentially many readers. In that
+situation the locking requirements can be relaxed and we do not need
+double locking during block splits. Each WAL record makes changes to a 
+single level of the btree using the correct locking sequence and so
+is safe for concurrent readers. Some readers may observe a block split
+in progress as they descend the tree, but they will simply move right
+onto the correct page.
+
+During recovery all index scans start with ignore_killed_tuples = false
+and we never set kill_prior_tuple. We do this because the oldest xmin
+on the standby server can be older than the oldest xmin on the master 
+server, which means tuples can be marked as killed even when they are
+still visible on the standby. We don't WAL log tuple killed bits, but
+they can still appear in the standby because of full page writes. So
+we must always ignore them and that means it's not worth setting them
+either.
+
 Other Things That Are Handy to Know
 -----------------------------------
 
index 69a2ed3ec249daf279e86bbeacec1990d897a8d8..7b4ce9efda5699aab2c40853a560944bc0159d61 100644 (file)
@@ -1924,7 +1924,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer)
        }
 
        if (ndeletable > 0)
-               _bt_delitems(rel, buffer, deletable, ndeletable);
+               _bt_delitems(rel, buffer, deletable, ndeletable, false, 0);
 
        /*
         * Note: if we didn't find any LP_DEAD items, then the page's
index 23026c2905e150a1ce528e7604cd58245b631dae..4632524eb2123c4ee67a031d42c45cdbef5667fb 100644 (file)
@@ -652,7 +652,8 @@ _bt_page_recyclable(Page page)
  */
 void
 _bt_delitems(Relation rel, Buffer buf,
-                        OffsetNumber *itemnos, int nitems)
+                        OffsetNumber *itemnos, int nitems, bool isVacuum,
+                        BlockNumber lastBlockVacuumed)
 {
        Page            page = BufferGetPage(buf);
        BTPageOpaque opaque;
@@ -684,15 +685,37 @@ _bt_delitems(Relation rel, Buffer buf,
        /* XLOG stuff */
        if (!rel->rd_istemp)
        {
-               xl_btree_delete xlrec;
                XLogRecPtr      recptr;
                XLogRecData rdata[2];
 
-               xlrec.node = rel->rd_node;
-               xlrec.block = BufferGetBlockNumber(buf);
+               /* We don't need both, but it simplifies the code to have both here */
+               xl_btree_delete xlrec_delete; 
+               xl_btree_vacuum xlrec_vacuum;
+
+               if (isVacuum)
+               {
+                       xlrec_vacuum.node = rel->rd_node;
+                       xlrec_vacuum.block = BufferGetBlockNumber(buf);
+
+                       xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
+                       rdata[0].data = (char *) &xlrec_vacuum;
+                       rdata[0].len = SizeOfBtreeVacuum;
+               }
+               else
+               {
+                       xlrec_delete.node = rel->rd_node;
+                       xlrec_delete.block = BufferGetBlockNumber(buf);
+
+                       /*
+                        * We would like to set an accurate latestRemovedXid, but there
+                        * is no easy way of obtaining a useful value. So we pass the
+                        * far too conservative value of InvalidTransactionId instead.
+                        */
+                       xlrec_delete.latestRemovedXid = InvalidTransactionId;
+                       rdata[0].data = (char *) &xlrec_delete;
+                       rdata[0].len = SizeOfBtreeDelete;
+               }
 
-               rdata[0].data = (char *) &xlrec;
-               rdata[0].len = SizeOfBtreeDelete;
                rdata[0].buffer = InvalidBuffer;
                rdata[0].next = &(rdata[1]);
 
@@ -715,7 +738,10 @@ _bt_delitems(Relation rel, Buffer buf,
                rdata[1].buffer_std = true;
                rdata[1].next = NULL;
 
-               recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
+               if (isVacuum)
+                       recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata);
+               else
+                       recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
 
                PageSetLSN(page, recptr);
                PageSetTLI(page, ThisTimeLineID);
index 59680cd0561fa403c9f5baeaab5715ab8dc546cb..b1a8a575de92ef5f3b4fcc9d1ab50f452aa60408 100644 (file)
@@ -58,7 +58,8 @@ typedef struct
        IndexBulkDeleteCallback callback;
        void       *callback_state;
        BTCycleId       cycleid;
-       BlockNumber lastUsedPage;
+       BlockNumber lastBlockVacuumed;  /* last blkno reached by Vacuum scan */
+       BlockNumber lastUsedPage;               /* blkno of last page that is in use */
        BlockNumber totFreePages;       /* true total # of free pages */
        MemoryContext pagedelcontext;
 } BTVacState;
@@ -626,6 +627,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
        vstate.callback = callback;
        vstate.callback_state = callback_state;
        vstate.cycleid = cycleid;
+       vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */
        vstate.lastUsedPage = BTREE_METAPAGE;
        vstate.totFreePages = 0;
 
@@ -855,7 +857,19 @@ restart:
                 */
                if (ndeletable > 0)
                {
-                       _bt_delitems(rel, buf, deletable, ndeletable);
+                       BlockNumber     lastBlockVacuumed = BufferGetBlockNumber(buf);
+
+                       _bt_delitems(rel, buf, deletable, ndeletable, true, vstate->lastBlockVacuumed);
+
+                       /*
+                        * Keep track of the block number of the lastBlockVacuumed, so
+                        * we can scan those blocks as well during WAL replay. This then
+                        * provides concurrency protection and allows btrees to be used
+                        * while in recovery.
+                        */
+                       if (lastBlockVacuumed > vstate->lastBlockVacuumed)
+                               vstate->lastBlockVacuumed = lastBlockVacuumed;
+
                        stats->tuples_removed += ndeletable;
                        /* must recompute maxoff */
                        maxoff = PageGetMaxOffsetNumber(page);
index 517c4b90cec9b9c9b9f5604a33108abf1c0e416a..02ff07c2ab79031e690c3e49412704f31ac9b5c8 100644 (file)
 
 #include "access/nbtree.h"
 #include "access/transam.h"
+#include "access/xact.h"
 #include "storage/bufmgr.h"
+#include "storage/procarray.h"
+#include "utils/inval.h"
 
 /*
  * We must keep track of expected insertions due to page splits, and apply
@@ -458,6 +461,86 @@ btree_xlog_split(bool onleft, bool isroot,
                                                 xlrec->leftsib, xlrec->rightsib, isroot);
 }
 
+static void
+btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
+{
+       xl_btree_vacuum *xlrec;
+       Buffer          buffer;
+       Page            page;
+       BTPageOpaque opaque;
+
+       if (record->xl_info & XLR_BKP_BLOCK_1)
+               return;
+
+       xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
+
+       /*
+        * Wait, via a cleanup lock, until every block between lastBlockVacuumed
+        * and the current block is unpinned, so that VACUUM touches every index
+        * block as scans require. NOTE(review): the backup-block early return
+        * above skips this loop too -- confirm that is intended.
+        */
+       if ((xlrec->lastBlockVacuumed + 1) != xlrec->block)
+       {
+               BlockNumber blkno = xlrec->lastBlockVacuumed + 1;
+
+               for (; blkno < xlrec->block; blkno++)
+               {
+                       /*
+                        * XXXHS: we don't actually need to read the block; we
+                        * just need to confirm it is unpinned. If we had a special call
+                        * into the buffer manager we could optimise this so that a
+                        * block that is not in shared_buffers is confirmed as unpinned.
+                        */
+                       buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno, RBM_NORMAL);
+                       if (BufferIsValid(buffer))
+                       {
+                               LockBufferForCleanup(buffer);                   
+                   UnlockReleaseBuffer(buffer);
+                       }
+               }
+       }
+
+       /*
+        * We need to take a cleanup lock to apply these changes.
+        * See nbtree/README for details.
+        */
+       buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
+       if (!BufferIsValid(buffer))
+               return;
+       LockBufferForCleanup(buffer);
+       page = (Page) BufferGetPage(buffer);
+
+       if (XLByteLE(lsn, PageGetLSN(page)))
+       {
+               UnlockReleaseBuffer(buffer);
+               return;
+       }
+
+       if (record->xl_len > SizeOfBtreeVacuum)
+       {
+               OffsetNumber *unused;
+               OffsetNumber *unend;
+
+               unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeVacuum);
+               unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
+
+               PageIndexMultiDelete(page, unused, unend - unused);
+       }
+
+       /*
+        * Mark the page as not containing any LP_DEAD items --- see comments in
+        * _bt_delitems().
+        */
+       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+       opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+       PageSetLSN(page, lsn);
+       PageSetTLI(page, ThisTimeLineID);
+       MarkBufferDirty(buffer);
+       UnlockReleaseBuffer(buffer);
+}
+
 static void
 btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
 {
@@ -470,6 +553,11 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
                return;
 
        xlrec = (xl_btree_delete *) XLogRecGetData(record);
+
+       /*
+        * We don't need to take a cleanup lock to apply these changes.
+        * See nbtree/README for details.
+        */
        buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
        if (!BufferIsValid(buffer))
                return;
@@ -714,6 +802,46 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
 {
        uint8           info = record->xl_info & ~XLR_INFO_MASK;
 
+       /*
+        * Btree delete records can conflict with standby queries. You might
+        * think that Vacuum records would conflict as well, but they don't
+        * because XLOG_HEAP2_CLEANUP_INFO exists specifically to ensure that
+        * we resolve all conflicts for the whole index, rather than block by
+        * block.
+        */
+       if (InArchiveRecovery)
+       {
+               if (info == XLOG_BTREE_DELETE)
+               {
+                       xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
+
+                       if (RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid) &&
+                               LatestRemovedXidAdvances(xlrec->latestRemovedXid))
+                       {
+                               VirtualTransactionId *old_snapshots;
+
+                               /*
+                                * Make sure the incoming transaction is emulated as running
+                                * prior to allowing any changes made by it to touch data.
+                                *
+                                * XXXHS: Currently we put everybody on death row, because
+                                * currently _bt_delitems() supplies InvalidTransactionId. We
+                                * should be able to do better than that with some thought.
+                                */
+                               old_snapshots = GetConflictingVirtualXIDs(xlrec->latestRemovedXid, 
+                                                                                                                       xlrec->node.dbNode,
+                                                                                                                       InvalidOid);
+
+                               ResolveRecoveryConflictWithVirtualXIDs(old_snapshots,
+                                                                                                       "btree delete",
+                                                                                                                       ERROR,
+                                                                                                                       lsn);
+                       }
+               }
+               else
+                       (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+       }
+
        RestoreBkpBlocks(lsn, record, false);
 
        switch (info)
@@ -739,6 +867,9 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
                case XLOG_BTREE_SPLIT_R_ROOT:
                        btree_xlog_split(false, true, lsn, record);
                        break;
+               case XLOG_BTREE_VACUUM:
+                       btree_xlog_vacuum(lsn, record);
+                       break;
                case XLOG_BTREE_DELETE:
                        btree_xlog_delete(lsn, record);
                        break;
@@ -843,13 +974,24 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
                                                                 xlrec->level, xlrec->firstright);
                                break;
                        }
+               case XLOG_BTREE_VACUUM:
+                       {
+                               xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec;
+
+                               appendStringInfo(buf, "vacuum: rel %u/%u/%u; blk %u, lastBlockVacuumed %u",
+                                                                xlrec->node.spcNode, xlrec->node.dbNode,
+                                                                xlrec->node.relNode, xlrec->block,
+                                                                xlrec->lastBlockVacuumed);
+                               break;
+                       }
                case XLOG_BTREE_DELETE:
                        {
                                xl_btree_delete *xlrec = (xl_btree_delete *) rec;
 
-                               appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u",
+                               appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u, latestRemovedXid %u",
                                                                 xlrec->node.spcNode, xlrec->node.dbNode,
-                                                                xlrec->node.relNode, xlrec->block);
+                                                                xlrec->node.relNode, xlrec->block,
+                                                                xlrec->latestRemovedXid);
                                break;
                        }
                case XLOG_BTREE_DELETE_PAGE:
index a88563e3357d4a259592eefcce4d390c8063eb3d..9e32a72d97162949bd065f88407652df08390e9f 100644 (file)
@@ -195,10 +195,11 @@ they first do something that requires one --- typically, insert/update/delete
 a tuple, though there are a few other places that need an XID assigned.
 If a subtransaction requires an XID, we always first assign one to its
 parent.  This maintains the invariant that child transactions have XIDs later
-than their parents, which is assumed in a number of places.
+than their parents, which is assumed in a number of places. In 8.4 onwards,
+some corner cases exist that require XID assignment to be WAL logged.
 
 The subsidiary actions of obtaining a lock on the XID and and entering it into
-pg_subtrans and PG_PROC are done at the time it is assigned.
+PG_PROC and, in some cases, pg_subtrans are done at the time it is assigned.
 
 A transaction that has no XID still needs to be identified for various
 purposes, notably holding locks.  For this purpose we assign a "virtual
@@ -376,7 +377,9 @@ but since we allow arbitrary nesting of subtransactions, we can't fit all Xids
 in shared memory, so we have to store them on disk.  Note, however, that for
 each transaction we keep a "cache" of Xids that are known to be part of the
 transaction tree, so we can skip looking at pg_subtrans unless we know the
-cache has been overflowed.  See storage/ipc/procarray.c for the gory details.
+cache has been overflowed.  In 8.4 we skip updating pg_subtrans unless the 
+cache has overflowed for that transaction, considerably reducing pg_subtrans
+activity. See storage/ipc/procarray.c for the gory details.
 
 slru.c is the supporting mechanism for both pg_clog and pg_subtrans.  It
 implements the LRU policy for in-memory buffer pages.  The high-level routines
@@ -649,3 +652,33 @@ fsync it down to disk without any sort of interlock, as soon as it finishes
 the bulk update.  However, all these paths are designed to write data that
 no other transaction can see until after T1 commits.  The situation is thus
 not different from ordinary WAL-logged updates.
+
+Transaction Emulation during Recovery
+-------------------------------------
+
+During Recovery we replay transaction changes in the order they occurred.
+As part of this replay we emulate some transactional behaviour, so that
+read only backends can take MVCC snapshots. We do this by maintaining
+Recovery Procs, so that each transaction that has recorded WAL records for 
+database writes will exist in the procarray until it commits. Further
+details are given in comments in procarray.c.
+
+Many actions write no WAL records at all, for example read only transactions.
+These have no effect on MVCC in recovery and we can pretend they never
+occurred at all. Subtransaction commit does not write a WAL record either
+and has very little effect, since lock waiters need to wait for the
+parent transaction to complete.
+
+Not all transactional behaviour is emulated, for example we do not insert
+a transaction entry into the lock table, nor do we maintain the transaction
+stack in memory. Clog entries are made normally. Multitrans is not maintained 
+because its purpose is to record tuple level locks that an application has 
+requested to prevent write locks. Since write locks cannot be obtained at all,
+there is never any conflict and so there is no reason to update multitrans.
+Subtrans is maintained during recovery but the details of the transaction
+tree are ignored and all subtransactions reference the top-level TransactionId
+directly. Since commit is atomic this provides correct lock wait behaviour
+yet simplifies emulation of subtransactions considerably.
+
+Further details on locking mechanics in recovery are given in comments
+with the Lock rmgr code.
index 5bd72154c56c726ffbb5640613c585d72a7b7be3..46e05596cd9e2ea4052b46e876223a682cdf1dc6 100644 (file)
@@ -35,6 +35,7 @@
 #include "access/clog.h"
 #include "access/slru.h"
 #include "access/transam.h"
+#include "access/xact.h"
 #include "pg_trace.h"
 #include "postmaster/bgwriter.h"
 
@@ -690,6 +691,9 @@ clog_redo(XLogRecPtr lsn, XLogRecord *record)
        /* Backup blocks are not used in clog records */
        Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
 
+       if (InArchiveRecovery)
+               (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
        if (info == CLOG_ZEROPAGE)
        {
                int                     pageno;
index 881a588d69c0aca34283f64e24061ceb708b0da2..f33c7fa91d030d5e4cffcb75840d99a61e84b710 100644 (file)
@@ -1544,6 +1544,7 @@ CheckPointMultiXact(void)
         * isn't valid (because StartupMultiXact hasn't been called yet) and so
         * SimpleLruTruncate would get confused.  It seems best not to risk
         * removing any data during recovery anyway, so don't truncate.
+        * We are executing in the bgwriter, so we must access shared status.
         */
        if (!IsRecoveryProcessingMode())
                TruncateMultiXact();
@@ -1875,6 +1876,9 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record)
        /* Backup blocks are not used in multixact records */
        Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
 
+       if (InArchiveRecovery)
+               (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
        if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
        {
                int                     pageno;
index 0273b0e153c108fcc500e4f4f7bf1f9bb2086574..252f4ee3f82351549c4e33530525e039bd8905d4 100644 (file)
@@ -20,6 +20,7 @@
 #include "commands/dbcommands.h"
 #include "commands/sequence.h"
 #include "commands/tablespace.h"
+#include "storage/sinval.h"
 #include "storage/freespace.h"
 
 
@@ -32,7 +33,7 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = {
        {"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL},
        {"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL},
        {"Reserved 7", NULL, NULL, NULL, NULL, NULL},
-       {"Reserved 8", NULL, NULL, NULL, NULL, NULL},
+       {"Relation", relation_redo, relation_desc, NULL, NULL, NULL},
        {"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL},
        {"Heap", heap_redo, heap_desc, NULL, NULL, NULL},
        {"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint},
index 68e38696fb5bfe2f8437d127ff273c4cde77170b..f337e18b0e80419d8975d5b72aa559e23c8202ec 100644 (file)
@@ -598,7 +598,8 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
         * commands to set the commit status of transactions whose bits are in
         * already-truncated segments of the commit log (see notes in
         * SlruPhysicalWritePage).      Hence, if we are InRecovery, allow the case
-        * where the file doesn't exist, and return zeroes instead.
+        * where the file doesn't exist, and return zeroes instead. We also
+        * return a zeroed page when the seek or read fails.
         */
        fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
        if (fd < 0)
@@ -619,6 +620,14 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
 
        if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
        {
+               if (InRecovery)
+               {
+                       ereport(LOG,
+                                       (errmsg("file \"%s\" doesn't exist, reading as zeroes",
+                                                       path)));
+                       MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+                       return true;
+               }
                slru_errcause = SLRU_SEEK_FAILED;
                slru_errno = errno;
                close(fd);
@@ -628,6 +637,14 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
        errno = 0;
        if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
        {
+               if (InRecovery)
+               {
+                       ereport(LOG,
+                                       (errmsg("file \"%s\" doesn't exist, reading as zeroes",
+                                                       path)));
+                       MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+                       return true;
+               }
                slru_errcause = SLRU_READ_FAILED;
                slru_errno = errno;
                close(fd);
index eaad23182af537309c09d0d1f82f2619abe073fd..fe57e61024fe6a615d477a5a523b48b7f7940c56 100644 (file)
@@ -31,6 +31,7 @@
 #include "access/slru.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
+#include "miscadmin.h"
 #include "pg_trace.h"
 #include "utils/snapmgr.h"
 
@@ -223,36 +224,19 @@ ZeroSUBTRANSPage(int pageno)
 /*
  * This must be called ONCE during postmaster or standalone-backend startup,
  * after StartupXLOG has initialized ShmemVariableCache->nextXid.
- *
- * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid
- * if there are none.
- *
- * Note that this is not atomic and is not yet safe to perform while other
- * processes might access subtrans.
  */
 void
 StartupSUBTRANS(TransactionId oldestActiveXID)
 {
-       int                     startPage;
-       int                     endPage;
+       TransactionId xid = ShmemVariableCache->nextXid;
+       int                     pageno = TransactionIdToPage(xid);
 
-       /*
-        * Since we don't expect pg_subtrans to be valid across crashes, we
-        * initialize the currently-active page(s) to zeroes during startup.
-        * Whenever we advance into a new page, ExtendSUBTRANS will likewise zero
-        * the new page without regard to whatever was previously on disk.
-        */
        LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
 
-       startPage = TransactionIdToPage(oldestActiveXID);
-       endPage = TransactionIdToPage(ShmemVariableCache->nextXid);
-
-       while (startPage != endPage)
-       {
-               (void) ZeroSUBTRANSPage(startPage);
-               startPage++;
-       }
-       (void) ZeroSUBTRANSPage(startPage);
+       /*
+        * Initialize our idea of the latest page number.
+        */
+       SubTransCtl->shared->latest_page_number = pageno;
 
        LWLockRelease(SubtransControlLock);
 }
@@ -305,16 +289,55 @@ void
 ExtendSUBTRANS(TransactionId newestXact)
 {
        int                     pageno;
+       static int last_pageno = 0;
 
-       /*
-        * No work except at first XID of a page.  But beware: just after
-        * wraparound, the first XID of page zero is FirstNormalTransactionId.
-        */
-       if (TransactionIdToEntry(newestXact) != 0 &&
-               !TransactionIdEquals(newestXact, FirstNormalTransactionId))
-               return;
+       Assert(TransactionIdIsNormal(newestXact));
 
-       pageno = TransactionIdToPage(newestXact);
+       if (!InRecovery)
+       {
+               /*
+                * No work except at first XID of a page.  But beware: just after
+                * wraparound, the first XID of page zero is FirstNormalTransactionId.
+                */
+               if (TransactionIdToEntry(newestXact) != 0 &&
+                       !TransactionIdEquals(newestXact, FirstNormalTransactionId))
+                       return;
+
+               pageno = TransactionIdToPage(newestXact);
+       }
+       else
+       {
+               int32           diff;
+
+               /*
+                * InRecovery we keep track of the last page we extended, so
+                * we can compare that against incoming XIDs. This will only
+                * ever be run by startup process, so keep it as a static variable
+                * rather than hiding behind the SubtransControlLock.
+                */
+               pageno = TransactionIdToPage(newestXact);
+
+               /*
+                * Fast path return for common case
+                */
+               if (pageno == last_pageno)
+                       return;         
+
+               /*
+                * If pageno logically precedes last_pageno then we do nothing.
+                * We need to be careful at wraparound here too, so we do a 
+                * modulo-2^31 comparison, exactly as we do in TransactionIdPrecedes()
+                */
+               diff = (int32) (pageno - last_pageno);
+               if (diff < 0)
+                       return;
+
+               elog(trace_recovery(DEBUG1), 
+                                               "extend subtrans  xid %u page %d last_page %d",
+                                               newestXact, pageno, last_pageno);
+
+               last_pageno = pageno;
+       }
 
        LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
 
index 2a1eab4d16c0a9b3a899340aa456cf80e9a9531b..6fb2d3f7296c797c83b2d2eb3e61421fc8b86fc9 100644 (file)
@@ -35,9 +35,6 @@ static TransactionId cachedFetchXid = InvalidTransactionId;
 static XidStatus cachedFetchXidStatus;
 static XLogRecPtr cachedCommitLSN;
 
-/* Handy constant for an invalid xlog recptr */
-static const XLogRecPtr InvalidXLogRecPtr = {0, 0};
-
 /* Local functions */
 static XidStatus TransactionLogFetch(TransactionId transactionId);
 
index eb3f34183f49e5c6f357b0f743d05c69c1b92a90..e5d6a4265ac2fbbdfefb8a22b0a8e74c022cb1fa 100644 (file)
@@ -359,7 +359,7 @@ MarkAsPrepared(GlobalTransaction gxact)
         * Put it into the global ProcArray so TransactionIdIsInProgress considers
         * the XID as still running.
         */
-       ProcArrayAdd(&gxact->proc);
+       ProcArrayAdd(&gxact->proc, true);
 }
 
 /*
@@ -1198,7 +1198,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
                                                                           hdr->nsubxacts, children,
                                                                           hdr->nabortrels, abortrels);
 
-       ProcArrayRemove(&gxact->proc, latestXid);
+       ProcArrayRemove(&gxact->proc, latestXid, 0, NULL);
 
        /*
         * In case we fail while running the callbacks, mark the gxact invalid so
@@ -1719,6 +1719,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
        /* Emit the XLOG commit record */
        xlrec.xid = xid;
        xlrec.crec.xact_time = GetCurrentTimestamp();
+       xlrec.crec.xinfo = 0;
        xlrec.crec.nrels = nrels;
        xlrec.crec.nsubxacts = nchildren;
        rdata[0].data = (char *) (&xlrec);
index 16a75346e8b2d7ebcca30baacdc256c83d0427a1..4c1550508c05b149d8dc52c8fa70fd5628940fc8 100644 (file)
@@ -277,6 +277,16 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid,
        curXid = ShmemVariableCache->nextXid;
        LWLockRelease(XidGenLock);
 
+       /*
+        * If we are in recovery then we are just replaying what has happened on
+        * the master. If we do need to trigger an autovacuum then it will happen
+        * on the master and changes will be fed through to the standby.
+        * So we have nothing to do here but be patient. We may be called during
+        * recovery by Startup process when updating db flat files.
+        */
+       if (InRecovery)
+               return;
+
        /* Log the info */
        ereport(DEBUG1,
           (errmsg("transaction ID wrap limit is %u, limited by database \"%s\"",
index d0ed3c0318dedd9371a2a06ac40937964483df54..ceb150c85d1ce0b70558ce4507ac9fa4b0d2f052 100644 (file)
@@ -40,6 +40,7 @@
 #include "storage/fd.h"
 #include "storage/lmgr.h"
 #include "storage/procarray.h"
+#include "storage/sinval.h"
 #include "storage/sinvaladt.h"
 #include "storage/smgr.h"
 #include "utils/combocid.h"
@@ -141,6 +142,7 @@ typedef struct TransactionStateData
        Oid                     prevUser;               /* previous CurrentUserId setting */
        bool            prevSecDefCxt;  /* previous SecurityDefinerContext setting */
        bool            prevXactReadOnly;               /* entry-time xact r/o state */
+       bool            startedInRecovery;      /* did we start in recovery? */
        struct TransactionStateData *parent;            /* back link to parent */
 } TransactionStateData;
 
@@ -169,6 +171,7 @@ static TransactionStateData TopTransactionStateData = {
        InvalidOid,                                     /* previous CurrentUserId setting */
        false,                                          /* previous SecurityDefinerContext setting */
        false,                                          /* entry-time xact r/o state */
+       false,                                          /* startedInRecovery */
        NULL                                            /* link to parent state block */
 };
 
@@ -211,6 +214,17 @@ static bool forceSyncCommit = false;
  */
 static MemoryContext TransactionAbortContext = NULL;
 
+/*
+ * Bookkeeping for tracking emulated transactions in Recovery Procs.
+ */
+static TransactionId   latestObservedXid = InvalidTransactionId;
+static bool                            RunningXactIsValid;
+
+/*
+ * Local state to optimise recovery conflict resolution
+ */
+static TransactionId   latestRemovedXid = InvalidTransactionId;
+
 /*
  * List of add-on start- and end-of-xact callbacks
  */
@@ -237,7 +251,7 @@ static SubXactCallbackItem *SubXact_callbacks = NULL;
 
 
 /* local function prototypes */
-static void AssignTransactionId(TransactionState s);
+static void AssignTransactionId(TransactionState s, int recursion_level);
 static void AbortTransaction(void);
 static void AtAbort_Memory(void);
 static void AtCleanup_Memory(void);
@@ -331,7 +345,7 @@ TransactionId
 GetTopTransactionId(void)
 {
        if (!TransactionIdIsValid(TopTransactionStateData.transactionId))
-               AssignTransactionId(&TopTransactionStateData);
+               AssignTransactionId(&TopTransactionStateData, 0);
        return TopTransactionStateData.transactionId;
 }
 
@@ -361,7 +375,7 @@ GetCurrentTransactionId(void)
        TransactionState s = CurrentTransactionState;
 
        if (!TransactionIdIsValid(s->transactionId))
-               AssignTransactionId(s);
+               AssignTransactionId(s, 0);
        return s->transactionId;
 }
 
@@ -389,7 +403,7 @@ GetCurrentTransactionIdIfAny(void)
  * following its parent's.
  */
 static void
-AssignTransactionId(TransactionState s)
+AssignTransactionId(TransactionState s, int recursion_level)
 {
        bool            isSubXact = (s->parent != NULL);
        ResourceOwner currentOwner;
@@ -406,7 +420,7 @@ AssignTransactionId(TransactionState s)
         * than its parent.
         */
        if (isSubXact && !TransactionIdIsValid(s->parent->transactionId))
-               AssignTransactionId(s->parent);
+               AssignTransactionId(s->parent, recursion_level + 1);
 
        /*
         * Generate a new Xid and record it in PG_PROC and pg_subtrans.
@@ -418,7 +432,14 @@ AssignTransactionId(TransactionState s)
         */
        s->transactionId = GetNewTransactionId(isSubXact);
 
-       if (isSubXact)
+       /*
+        * If we have overflowed the subxid cache then we must mark subtrans
+        * with the parent xid. Prior to 8.4 we marked subtrans for each
+        * subtransaction, though that is no longer necessary because the 
+        * way snapshots are searched in XidInMVCCSnapshot() has changed to
+        * allow searching of both subxid cache and subtrans, not either/or.
+        */
+       if (isSubXact && MyProc->subxids.overflowed)
                SubTransSetParent(s->transactionId, s->parent->transactionId);
 
        /*
@@ -440,8 +461,61 @@ AssignTransactionId(TransactionState s)
        }
        PG_END_TRY();
        CurrentResourceOwner = currentOwner;
-}
 
+       /*
+        * Recovery environment needs to know when a transaction first starts
+        * making changes to the database. We could issue an assignment WAL
+        * record for every transaction and subtransaction but that would be
+        * a large performance hit. However, each WAL record is marked with 
+        * both its xid and its top-level xid. So we only need to issue an
+        * assignment record when we are assigning multiple xids recursively,
+        * except for when we are on the very first subtransaction in any
+        * transaction - since that already has xid and topxid on it.
+        */
+       if (recursion_level > 1 || (recursion_level == 1 && isSubXact))
+       {
+               XLogRecData rdata;
+               xl_xact_assignment      xlrec;
+
+               xlrec.xassign = s->transactionId;
+
+               if (isSubXact)
+                       xlrec.xtop = s->parent->transactionId;
+               else
+                       xlrec.xtop = InvalidTransactionId;
+
+               elog(trace_recovery(DEBUG2), 
+                               "AssignTransactionId xid %u xtop %u nest %d recursion %d hasParent %s",
+                               xlrec.xassign,
+                               xlrec.xtop,
+                               GetCurrentTransactionNestLevel(),
+                               recursion_level,
+                               isSubXact ? "t" : "f");
+
+               START_CRIT_SECTION();
+
+               rdata.data = (char *) (&xlrec);
+               rdata.len = sizeof(xl_xact_assignment);
+               rdata.buffer = InvalidBuffer;
+               rdata.next = NULL;
+
+               /* 
+                * These WAL records look like no other. We are assigning a 
+                * TransactionId to upper levels of the transaction stack. The
+                * transaction level we are looking at is *not* the *current*
+                * transaction - we haven't even assigned the xid for the current
+                * transaction yet, so the xl_xid of this WAL record will be 
+                * InvalidTransactionId, even though we are in a transaction.
+                * Got that?
+                * 
+                * So we stuff the newly assigned xid into the body of the WAL 
+                * record and let RecordKnownAssignedTransactionIds() work it out.
+                */
+               (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT, &rdata);
+
+               END_CRIT_SECTION();
+       }
+}
 
 /*
  *     GetCurrentSubTransactionId
@@ -600,6 +674,16 @@ TransactionIdIsCurrentTransactionId(TransactionId xid)
        return false;
 }
 
+/*
+ *     TransactionStartedDuringRecovery
+ * Report the startedInRecovery flag of the current transaction state;
+ * used during index scans.
+ */
+bool
+TransactionStartedDuringRecovery(void)
+{
+       return CurrentTransactionState->startedInRecovery;
+}
 
 /*
  *     CommandCounterIncrement
@@ -827,11 +911,15 @@ RecordTransactionCommit(void)
        bool            haveNonTemp;
        int                     nchildren;
        TransactionId *children;
+       int                     nmsgs;
+       SharedInvalidationMessage *invalidationMessages = NULL;
+       bool            RelcacheInitFileInval;
 
        /* Get data needed for commit record */
        nrels = smgrGetPendingDeletes(true, &rels, &haveNonTemp);
        nchildren = xactGetCommittedChildren(&children);
-
+       nmsgs = xactGetCommittedInvalidationMessages(&invalidationMessages, 
+                                                                                                &RelcacheInitFileInval);
        /*
         * If we haven't been assigned an XID yet, we neither can, nor do we want
         * to write a COMMIT record.
@@ -865,13 +953,26 @@ RecordTransactionCommit(void)
                /*
                 * Begin commit critical section and insert the commit XLOG record.
                 */
-               XLogRecData rdata[3];
+               XLogRecData rdata[4];
                int                     lastrdata = 0;
                xl_xact_commit xlrec;
 
                /* Tell bufmgr and smgr to prepare for commit */
                BufmgrCommit();
 
+               /*
+                * Set flags required for recovery processing of commits.
+                * Nothing too critical here that we would want to include this
+                * within the critical section following.
+                */
+               xlrec.xinfo = 0;
+               if (AtEOXact_Database_FlatFile_Update_Needed())
+                       xlrec.xinfo |= XACT_COMPLETION_UPDATE_DB_FILE;
+               if (AtEOXact_Auth_FlatFile_Update_Needed())
+                       xlrec.xinfo |= XACT_COMPLETION_UPDATE_AUTH_FILE;
+               if (RelcacheInitFileInval)
+                       xlrec.xinfo |= XACT_COMPLETION_UPDATE_RELCACHE_FILE;
+
                /*
                 * Mark ourselves as within our "commit critical section".      This
                 * forces any concurrent checkpoint to wait until we've updated
@@ -896,6 +997,8 @@ RecordTransactionCommit(void)
                xlrec.xact_time = xactStopTimestamp;
                xlrec.nrels = nrels;
                xlrec.nsubxacts = nchildren;
+               xlrec.nmsgs = nmsgs;
+
                rdata[0].data = (char *) (&xlrec);
                rdata[0].len = MinSizeOfXactCommit;
                rdata[0].buffer = InvalidBuffer;
@@ -917,6 +1020,15 @@ RecordTransactionCommit(void)
                        rdata[2].buffer = InvalidBuffer;
                        lastrdata = 2;
                }
+               /* dump shared cache invalidation messages */
+               if (nmsgs > 0)
+               {
+                       rdata[lastrdata].next = &(rdata[3]);
+                       rdata[3].data = (char *) invalidationMessages;
+                       rdata[3].len = nmsgs * sizeof(SharedInvalidationMessage);
+                       rdata[3].buffer = InvalidBuffer;
+                       lastrdata = 3;
+               }
                rdata[lastrdata].next = NULL;
 
                (void) XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, rdata);
@@ -1528,6 +1640,7 @@ StartTransaction(void)
        s->childXids = NULL;
        s->nChildXids = 0;
        s->maxChildXids = 0;
+       s->startedInRecovery = IsRecoveryProcessingMode();
        GetUserIdAndContext(&s->prevUser, &s->prevSecDefCxt);
        /* SecurityDefinerContext should never be set outside a transaction */
        Assert(!s->prevSecDefCxt);
@@ -4216,32 +4329,439 @@ xactGetCommittedChildren(TransactionId **ptr)
        return s->nChildXids;
 }
 
+/*
+ * Log the result of GetRunningTransactionData() as a RUNNING_XACTS record.
+ */
+void
+LogCurrentRunningXacts(void)
+{
+       RunningTransactions     running = GetRunningTransactionData();
+       xl_xact_running_xacts   hdr;
+       XLogRecData             chain[3];
+       XLogRecData            *tail = &chain[0];
+       XLogRecPtr              where;
+
+       hdr.xcnt = running->xcnt;
+       hdr.subxcnt = running->subxcnt;
+       hdr.latestRunningXid = running->latestRunningXid;
+       hdr.oldestRunningXid = running->oldestRunningXid;
+       hdr.latestCompletedXid = running->latestCompletedXid;
+
+       /* The fixed-size header always comes first */
+       chain[0].data = (char *) &hdr;
+       chain[0].len = MinSizeOfXactRunningXacts;
+       chain[0].buffer = InvalidBuffer;
+
+       /* Then the xcnt RunningXact entries, if there are any */
+       if (hdr.xcnt > 0)
+       {
+               chain[1].data = (char *) running->xrun;
+               chain[1].len = hdr.xcnt * sizeof(RunningXact);
+               chain[1].buffer = InvalidBuffer;
+               tail->next = &chain[1];
+               tail = &chain[1];
+       }
+
+       /* Then the subxcnt TransactionIds held in subxip, if there are any */
+       if (hdr.subxcnt > 0)
+       {
+               chain[2].data = (char *) running->subxip;
+               chain[2].len = hdr.subxcnt * sizeof(TransactionId);
+               chain[2].buffer = InvalidBuffer;
+               tail->next = &chain[2];
+               tail = &chain[2];
+       }
+
+       tail->next = NULL;
+
+       START_CRIT_SECTION();
+
+       where = XLogInsert(RM_XACT_ID, XLOG_XACT_RUNNING_XACTS, chain);
+
+       END_CRIT_SECTION();
+
+       elog(trace_recovery(DEBUG2), "captured snapshot of running xacts %X/%X", where.xlogid, where.xrecoff);
+}
+
+/*
+ * Is the data available to allow valid snapshots? See SetRunningXactData().
+ */
+bool 
+IsRunningXactDataValid(void)
+{
+       return RunningXactIsValid;
+}
+
+void
+SetRunningXactData(bool mode)
+{
+       RunningXactIsValid = mode;      /* read back by IsRunningXactDataValid() */
+}
+
+/*
+ * InitRecoveryTransactionEnvironment
+ *
+ * We need to issue shared invalidations and hold locks. Holding locks
+ * means others may want to wait on us, so our lock table entries must look
+ * like a transaction's. Rather than create and delete entries for each
+ * transaction, we create one permanent entry and acquire and release locks
+ * as needed. Yes, this means the Startup process is visible in pg_locks.
+ */
+void
+InitRecoveryTransactionEnvironment(void)
+{
+       VirtualTransactionId vxid;
+
+       /*
+        * Initialise shared invalidation management for Startup process,
+        * being careful to register ourselves as a sendOnly process so
+        * we don't need to read messages, nor will we get signalled
+        * when the queue starts filling up.
+        */
+       SharedInvalBackendInit(true);
+
+       /*
+        * Additional initialisation tasks. Most of this was performed
+        * during initial stages of startup.
+        */
+       ProcArrayInitRecoveryEnvironment();
+
+       /*
+        * Lock a virtual transaction id for Startup process.
+        *
+        * We need to do GetNextLocalTransactionId() because
+        * SharedInvalBackendInit() leaves localTransactionId invalid and
+        * the lock manager doesn't like that at all.
+        *
+        * Note that we don't need to run XactLockTableInsert() because nobody
+        * needs to wait on xids. That sounds a little strange, but table locks
+        * are held by vxids and row level locks are held by xids. All queries
+        * hold AccessShareLocks so never block while we write or lock new rows.
+        */
+       vxid.backendId = MyBackendId;
+       vxid.localTransactionId = GetNextLocalTransactionId();
+       VirtualXactLockTableInsert(vxid);
+
+       /*
+        * Now that the database is consistent we can create a valid copy of
+        * the flat files required for connection and authentication. This
+        * may already have been executed at appropriate commit points, but
+        * we cannot trust that those executions were correct, so force it
+        * again now just to be safe.
+        */
+       BuildFlatFiles(false);
+}
+
+/*
+ * During recovery we maintain ProcArray with incoming xids when we first
+ * observe them in use. Uses file-local state, so call only from the Startup
+ * process. We record every xid on the WAL record, plus all unobserved xids
+ * that we can deduce were assigned because xids are in sequence, no gaps.
+ *
+ * Returns false if running-xact data is not valid, or if no recovery proc
+ * could be created for top_xid (that data is then marked invalid as well).
+ * Returns true otherwise, including when top_xid is not a valid xid.
+ */
+bool
+RecordKnownAssignedTransactionIds(XLogRecPtr lsn, TransactionId top_xid, TransactionId child_xid)
+{
+       TransactionId   xid;
+       PGPROC                  *proc;
+       bool                    unobserved = false;
+       bool                    mark_subtrans = false;
+
+       /*
+        * Skip processing if the current snapshot is invalid. If you're
+        * thinking of removing this, think again. We must have a valid
+        * initial state before we try to modify it.
+        */
+       if (!IsRunningXactDataValid())
+               return false;
+
+       /* Same xid in both arguments means there is no subxact to record */
+       if (child_xid == top_xid)
+               child_xid = InvalidTransactionId;
+
+       /*
+        * VACUUM records are always sent with InvalidTransactionId, so
+        * invoke conflict processing if we see a record like this.
+        */
+       if (!TransactionIdIsValid(top_xid))
+               return true;
+
+       /*
+        * Identify the recovery proc that holds replay info for this xid.
+        *
+        * XXXHS This gets called for every WAL record (with XID). I think we'll
+        * need a faster version of BackendXidGetProc, using a hash table or
+        * something. FWIW, the hash table wouldn't need to be in shared memory,
+        * because the startup process is the only one doing this.
+        */
+       proc = BackendXidGetProc(top_xid);
+
+       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+       if (proc == NULL)
+       {
+               proc = CreateRecoveryProcessForTransactionId(top_xid);
+
+               if (proc == NULL)
+               {
+                       LWLockRelease(ProcArrayLock);
+                       SetRunningXactData(false);
+                       return false;
+               }
+
+               unobserved = true;
+       }
+
+       /*
+        * Use volatile pointer to prevent code rearrangement; other backends
+        * could be examining the subxid info concurrently, and we don't want
+        * them to see an invalid intermediate state, such as incrementing
+        * nxids before filling the array entry.  Note we are assuming that
+        * TransactionId and int fetch/store are atomic, but that's OK since
+        * we're holding ProcArrayLock exclusively.
+        */
+       {
+               volatile PGPROC *myproc = proc;
+
+               myproc->lsn = lsn;
+
+               if (TransactionIdIsValid(child_xid))
+               {
+                       int                     nxids = myproc->subxids.nxids;
+
+                       if (nxids < PGPROC_MAX_CACHED_SUBXIDS)
+                       {
+                               /*
+                                * By definition top_xid precedes child_xid. Append when the
+                                * cache is empty or child_xid follows the last cached entry.
+                                */
+                               if (nxids == 0 || TransactionIdPrecedes(myproc->subxids.xids[nxids - 1], child_xid))
+                               {
+                                       myproc->subxids.xids[nxids] = child_xid;
+                                       myproc->subxids.nxids = nxids + 1;
+                               }
+                       }
+                       else
+                       {
+                               myproc->subxids.overflowed = true;
+                               mark_subtrans = true;
+                       }
+               }
+       }
+
+       /*
+        * When a newly observed xid arrives, it is frequently the case
+        * that it is *not* the next xid in sequence. When this occurs, we
+        * must treat the intervening xids as running also. So we maintain
+        * a special list of these UnobservedXids, so that snapshots can
+        * see the missing xids as in-progress.
+        *
+        * We maintain both recovery Procs *and* UnobservedXids because we
+        * need them both. Recovery procs allow us to store top-level xids
+        * and subtransactions separately, otherwise we wouldn't know
+        * when to overflow the subxid cache. UnobservedXids allow us to
+        * make sense of the out-of-order arrival of xids.
+        *
+        * Some examples:
+        * 1)   latestObservedXid = 647
+        *              next xid observed in WAL = 651 (a top-level transaction)
+        *              so we add 648, 649, 650 to UnobservedXids
+        *              and add 651 as a recovery proc
+        *
+        * 2)   latestObservedXid = 769
+        *              next xid observed in WAL = 771 (a subtransaction)
+        *              so we add 770 to UnobservedXids
+        *              and add 771 into the subxid cache of its top-level xid
+        *
+        * 3)   latestObservedXid = 769
+        *              next xid observed in WAL = 810 (a subtransaction)
+        *              810's parent had not yet recorded WAL = 807
+        *              so we add 770 thru 809 inclusive to UnobservedXids
+        *              then remove 807
+        *
+        * 4)   latestObservedXid = 769
+        *              next xid observed in WAL = 771 (a subtransaction)
+        *              771's parent had not yet recorded WAL = 770
+        *              so do nothing
+        *
+        * 5)   latestObservedXid = 7747
+        *              next xid observed in WAL = 7748 (a subtransaction)
+        *              7748's parent had not yet recorded WAL = 7742
+        *              so we add 7748 and removed 7742
+        */
+       for (xid = top_xid; TransactionIdIsValid(xid); xid = child_xid)
+       {
+               TransactionId   next_expected_xid = latestObservedXid;
+               TransactionIdAdvance(next_expected_xid);
+
+               if (next_expected_xid == xid)
+               {
+                       Assert(!XidInUnobservedTransactions(xid));
+                       latestObservedXid = xid;
+               }
+               else if (TransactionIdPrecedes(next_expected_xid, xid))
+               {
+                       UnobservedTransactionsAddXids(next_expected_xid, xid);
+                       latestObservedXid = xid;
+               }
+               else if (unobserved)
+                       UnobservedTransactionsRemoveXid(xid, true);
+
+               if (xid == child_xid)
+                       break;
+       }
+
+       LWLockRelease(ProcArrayLock);
+
+       elog(trace_recovery(DEBUG4), 
+                                       "record known xact top_xid %u child_xid %u %slatestObservedXid %u",
+                                       top_xid, child_xid,
+                                       (unobserved ? "unobserved " : " "),
+                                       latestObservedXid);
+
+       /* 
+        * Now we've updated the proc we can update subtrans, if appropriate.
+        * We must do this step last to avoid race conditions.  See comments
+        * and code for AssignTransactionId().
+        *
+        * Notice that we update pg_subtrans with the top-level xid, rather
+        * than the parent xid. This is a difference between normal 
+        * processing and recovery, yet is still correct in all cases. The
+        * reason is that subtransaction commit is not marked in clog until
+        * commit processing, so all aborted subtransactions have already been
+        * clearly marked in clog. As a result we are able to refer directly
+        * to the top-level transaction's state rather than skipping through
+        * all the intermediate states in the subtransaction tree.
+        */
+       if (mark_subtrans)
+       {
+               elog(trace_recovery(DEBUG2), 
+                               "subtrans setting topxid %u for xid %u", top_xid, child_xid);
+               ExtendSUBTRANS(child_xid);
+               SubTransSetParent(child_xid, top_xid);
+       }
+
+       return true;
+}
+
+/*
+ * LatestRemovedXidAdvances - returns true, and remembers latestXid, if it is
+ *             later than the latestRemovedXid seen so far; else returns false
+ */
+bool
+LatestRemovedXidAdvances(TransactionId latestXid)
+{
+       /*
+        * Skip cleanup records no later than one we have already tested for.
+        * With nothing remembered yet (invalid xid) we always advance.
+        */
+       if (TransactionIdIsValid(latestRemovedXid) &&
+               !TransactionIdPrecedes(latestRemovedXid, latestXid))
+               return false;
+
+       /*
+        * Remember how far we've cleaned to avoid checks in the future.
+        */
+       latestRemovedXid = latestXid;
+
+       return true;
+}
+
 /*
  *     XLOG support routines
  */
 
+/*
+ * Before 8.4 this was a fairly short function, but now it performs many
+ * actions for which the order of execution is critical.
+ */
 static void
-xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid)
+xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid, bool preparedXact)
 {
        TransactionId *sub_xids;
        TransactionId max_xid;
+       PGPROC     *proc;
        int                     i;
 
-       /* Mark the transaction committed in pg_clog */
-       sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
-       TransactionIdCommitTree(xid, xlrec->nsubxacts, sub_xids);
-
        /* Make sure nextXid is beyond any XID mentioned in the record */
        max_xid = xid;
+       sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+
+       /*
+        * Find the highest xid mentioned in the record.
+        */
        for (i = 0; i < xlrec->nsubxacts; i++)
        {
                if (TransactionIdPrecedes(max_xid, sub_xids[i]))
                        max_xid = sub_xids[i];
        }
+
+       /* Mark the transaction committed in pg_clog */
+       TransactionIdCommitTree(xid, xlrec->nsubxacts, sub_xids);
+
+       if (InArchiveRecovery && (proc = BackendXidGetProc(xid)) != NULL)
+       {
+               /*
+                * We must mark clog before we update the ProcArray. Only update
+                * if we have already initialised the state and we have previously
+                * added an xid to the proc. We need no lock to check xid since it 
+                * is controlled by Startup process. It's possible for xids to
+                * appear that haven't been seen before. We don't need to check
+                * UnobservedXids because in the normal case this will already have
+                * happened, but there are cases where they might sneak through.
+                * Leave these for the periodic cleanup by XACT_RUNNING_XACT records.
+                */
+               if (IsRunningXactDataValid() && !preparedXact)
+               {
+                       ProcArrayRemove(proc, InvalidTransactionId, xlrec->nsubxacts, sub_xids);
+                       FreeRecoveryProcess(proc);
+               }
+
+               /*
+                * If requested, update the flat files for DB and Auth Files by
+                * reading the catalog tables. Needs to be the first action taken
+                * after marking transaction complete to minimise race conditions.
+                * This is the opposite way round to the original actions, which
+                * update the files and then mark committed, so there is a race
+                * condition in both places.
+                */
+               if (XactCompletionUpdateDBFile(xlrec) || XactCompletionUpdateAuthFile(xlrec))
+               {
+                       if (XactCompletionUpdateAuthFile(xlrec))
+                               BuildFlatFiles(false);
+                       else
+                               BuildFlatFiles(true);
+               }
+
+               /*
+                * Send any cache invalidations attached to the commit. We must
+                * maintain the same order of invalidation then release locks
+                * as occurs in RecordTransactionCommit.
+                */
+               if (xlrec->nmsgs > 0)
+               {
+                       int     offset = OffsetSharedInvalInXactCommit();
+                       SharedInvalidationMessage *msgs = (SharedInvalidationMessage *)
+                                                       (((char *) xlrec) + offset);
+
+                       SendSharedInvalidMessages(msgs, xlrec->nmsgs);
+               }
+
+               /*
+                * Release locks, if any.
+                */
+               RelationReleaseRecoveryLockTree(xid, xlrec->nsubxacts, sub_xids);
+       }
+
+       /* Make sure nextXid is beyond any XID mentioned in the record */
        if (TransactionIdFollowsOrEquals(max_xid,
                                                                         ShmemVariableCache->nextXid))
        {
                ShmemVariableCache->nextXid = max_xid;
+               ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
                TransactionIdAdvance(ShmemVariableCache->nextXid);
        }
 
@@ -4263,28 +4783,65 @@ xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid)
        }
 }
 
+/*
+ * Be careful with the order of execution, as with xact_redo_commit().
+ * The two functions are similar but differ in key places.
+ */
 static void
-xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
+xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid, bool preparedXact)
 {
+       PGPROC          *proc = NULL;
        TransactionId *sub_xids;
        TransactionId max_xid;
        int                     i;
 
-       /* Mark the transaction aborted in pg_clog */
-       sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
-       TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids);
-
        /* Make sure nextXid is beyond any XID mentioned in the record */
        max_xid = xid;
+       sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+
+       /*
+        * Find the highest xid mentioned in the record.
+        */
        for (i = 0; i < xlrec->nsubxacts; i++)
        {
                if (TransactionIdPrecedes(max_xid, sub_xids[i]))
                        max_xid = sub_xids[i];
        }
+
+       /* Mark the transaction aborted in pg_clog */
+       TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids);
+
+       if (InArchiveRecovery && (proc = BackendXidGetProc(xid)) != NULL)
+       {
+               /*
+                * We must mark clog before we update the ProcArray. Only update
+                * if we have already initialised the state and we have previously
+                * added an xid to the proc. We need no lock to check xid since it 
+                * is controlled by Startup process. It's possible for xids to
+                * appear that haven't been seen before. We don't need to check
+                * UnobservedXids because in the normal case this will already have
+                * happened, but there are cases where they might sneak through.
+                * Leave these for the periodic cleanup by XACT_RUNNING_XACT records.
+                */
+               if (IsRunningXactDataValid() && 
+                       TransactionIdIsValid(proc->xid) && !preparedXact)
+               {
+                       ProcArrayRemove(proc, InvalidTransactionId, xlrec->nsubxacts, sub_xids);
+                       FreeRecoveryProcess(proc);
+               }
+
+               /*
+                * Release locks, if any. There are no invalidations to send.
+                */
+               RelationReleaseRecoveryLockTree(xid, xlrec->nsubxacts, sub_xids);
+       }
+
+       /* Make sure nextXid is beyond any XID mentioned in the record */
        if (TransactionIdFollowsOrEquals(max_xid,
                                                                         ShmemVariableCache->nextXid))
        {
                ShmemVariableCache->nextXid = max_xid;
+               ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
                TransactionIdAdvance(ShmemVariableCache->nextXid);
        }
 
@@ -4314,17 +4871,63 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record)
        /* Backup blocks are not used in xact records */
        Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
 
+       if (info == XLOG_XACT_ASSIGNMENT)
+       {
+               xl_xact_assignment      *xlrec = (xl_xact_assignment *) XLogRecGetData(record);
+
+               if (InArchiveRecovery)
+               {
+                       /*
+                        * It's an assignment record, so we need to extract data from
+                        * the body of the record, rather than take header values. This
+                        * is because an assignment record can be issued when
+                        * GetCurrentTransactionIdIfAny() returns InvalidTransactionId.
+                        */
+                       (void) RecordKnownAssignedTransactionIds(lsn, xlrec->xtop, 
+                                                                                                               xlrec->xassign);
+               }
+
+               return;
+       }
+       else if (info == XLOG_XACT_RUNNING_XACTS)
+       {
+               xl_xact_running_xacts *xlrec = (xl_xact_running_xacts *) XLogRecGetData(record);
+
+               /*
+                * If RunningXact data is complete then apply it
+                */
+               if (InArchiveRecovery && TransactionIdIsValid(xlrec->latestRunningXid))
+               {
+                       if (TransactionIdPrecedes(latestObservedXid, xlrec->latestRunningXid))
+                       {
+                               latestObservedXid = xlrec->latestRunningXid;
+                               ShmemVariableCache->latestCompletedXid = xlrec->latestCompletedXid;
+                       }
+                       ProcArrayUpdateRecoveryTransactions(lsn, xlrec);
+               }
+
+               return;
+       }
+
+       if (InArchiveRecovery)
+       {
+               /*
+                * No conflict resolution is required for transaction completion records
+                */
+               (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+       }
+
        if (info == XLOG_XACT_COMMIT)
        {
                xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
 
-               xact_redo_commit(xlrec, record->xl_xid);
+               xact_redo_commit(xlrec, record->xl_xid, false);
        }
        else if (info == XLOG_XACT_ABORT)
        {
                xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
 
-               xact_redo_abort(xlrec, record->xl_xid);
+               xact_redo_abort(xlrec, record->xl_xid, false);
        }
        else if (info == XLOG_XACT_PREPARE)
        {
@@ -4336,14 +4939,14 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record)
        {
                xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) XLogRecGetData(record);
 
-               xact_redo_commit(&xlrec->crec, xlrec->xid);
+               xact_redo_commit(&xlrec->crec, xlrec->xid, true);
                RemoveTwoPhaseFile(xlrec->xid, false);
        }
        else if (info == XLOG_XACT_ABORT_PREPARED)
        {
                xl_xact_abort_prepared *xlrec = (xl_xact_abort_prepared *) XLogRecGetData(record);
 
-               xact_redo_abort(&xlrec->arec, xlrec->xid);
+               xact_redo_abort(&xlrec->arec, xlrec->xid, true);
                RemoveTwoPhaseFile(xlrec->xid, false);
        }
        else
@@ -4355,10 +4958,19 @@ xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
 {
        int                     i;
 
+       if (XactCompletionUpdateDBFile(xlrec))
+               appendStringInfo(buf, "; update db file");
+
+       if (XactCompletionUpdateAuthFile(xlrec))
+               appendStringInfo(buf, "; update auth file");
+
+       if (XactCompletionRelcacheInitFileInval(xlrec))
+               appendStringInfo(buf, "; relcache init file inval");
+
        appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time));
        if (xlrec->nrels > 0)
        {
-               appendStringInfo(buf, "; rels:");
+               appendStringInfo(buf, "; %d rels:", xlrec->nrels);
                for (i = 0; i < xlrec->nrels; i++)
                {
                        char *path = relpath(xlrec->xnodes[i], MAIN_FORKNUM);
@@ -4369,12 +4981,34 @@ xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
        if (xlrec->nsubxacts > 0)
        {
                TransactionId *xacts = (TransactionId *)
-               &xlrec->xnodes[xlrec->nrels];
-
-               appendStringInfo(buf, "; subxacts:");
+                                                                       &xlrec->xnodes[xlrec->nrels];
+               appendStringInfo(buf, "; %d subxacts:", xlrec->nsubxacts);
                for (i = 0; i < xlrec->nsubxacts; i++)
                        appendStringInfo(buf, " %u", xacts[i]);
        }
+       if (xlrec->nmsgs > 0)
+       {
+               /* 
+                * The invalidation messages are the third variable length array
+                * from the start of the record. The record header has everything
+                * we need to calculate where that starts.
+                */
+               int     offset = OffsetSharedInvalInXactCommit();
+               SharedInvalidationMessage *msgs = (SharedInvalidationMessage *)
+                                               (((char *) xlrec) + offset);
+               appendStringInfo(buf, "; %d inval msgs:", xlrec->nmsgs);
+               for (i = 0; i < xlrec->nmsgs; i++)
+               {
+                       SharedInvalidationMessage *msg = msgs + i;
+
+                       if (msg->id >= 0)
+                               appendStringInfo(buf,  "catcache id%d ", msg->id);
+                       else if (msg->id == SHAREDINVALRELCACHE_ID)
+                               appendStringInfo(buf,  "relcache ");
+                       else if (msg->id == SHAREDINVALSMGR_ID)
+                               appendStringInfo(buf,  "smgr ");
+               }
+       }
 }
 
 static void
@@ -4404,6 +5038,43 @@ xact_desc_abort(StringInfo buf, xl_xact_abort *xlrec)
        }
 }
 
+/*
+ * Describe a running-xacts record: header counts and xid bounds, then each
+ * xrun[] entry and its subxids. Xids are printed with %u, as elsewhere here.
+ */
+static void
+xact_desc_running_xacts(StringInfo buf, xl_xact_running_xacts *xlrec)
+{
+       int                             xid_index,
+                                       subxid_index;
+       /* the subxid array is read from just past the last xrun[] entry */
+       TransactionId   *subxip = (TransactionId *) &(xlrec->xrun[xlrec->xcnt]);
+
+       appendStringInfo(buf, "nxids %u nsubxids %u latestRunningXid %u",
+                                        xlrec->xcnt, xlrec->subxcnt, xlrec->latestRunningXid);
+       appendStringInfo(buf, " oldestRunningXid %u latestCompletedXid %u",
+                                        xlrec->oldestRunningXid, xlrec->latestCompletedXid);
+
+       for (xid_index = 0; xid_index < xlrec->xcnt; xid_index++)
+       {
+               RunningXact             *rxact = (RunningXact *) xlrec->xrun;
+
+               appendStringInfo(buf, "; xid %u", rxact[xid_index].xid);
+               if (rxact[xid_index].nsubxids > 0)
+               {
+                       appendStringInfo(buf, " nsubxids %u offset %d ovflow? %s",
+                                                        rxact[xid_index].nsubxids,
+                                                        rxact[xid_index].subx_offset,
+                                                        (rxact[xid_index].overflowed ? "t" : "f"));
+
+                       appendStringInfo(buf, "; subxacts: ");
+                       for (subxid_index = 0; subxid_index < rxact[xid_index].nsubxids; subxid_index++)
+                               appendStringInfo(buf, " %u",
+                                               subxip[subxid_index + rxact[xid_index].subx_offset]);
+               }
+       }
+}
+
 void
 xact_desc(StringInfo buf, uint8 xl_info, char *rec)
 {
@@ -4441,6 +5112,21 @@ xact_desc(StringInfo buf, uint8 xl_info, char *rec)
                appendStringInfo(buf, "abort %u: ", xlrec->xid);
                xact_desc_abort(buf, &xlrec->arec);
        }
+       else if (info == XLOG_XACT_ASSIGNMENT)
+       {
+               xl_xact_assignment *xlrec = (xl_xact_assignment *) rec;
+
+               /* ignore the main xid, it may be Invalid and misleading */
+               appendStringInfo(buf, "assignment: xassign %u xtop %u", 
+                                                       xlrec->xassign, xlrec->xtop);
+       }
+       else if (info == XLOG_XACT_RUNNING_XACTS)
+       {
+               xl_xact_running_xacts *xlrec = (xl_xact_running_xacts *) rec;
+
+               appendStringInfo(buf, "running xacts: ");
+               xact_desc_running_xacts(buf, xlrec);
+       }
        else
                appendStringInfo(buf, "UNKNOWN");
 }
index 7e480e2fb2e1375448b4981dddcdbdc2ee4e8a43..fcf5657a23fe59a87dca38df627e55ab3e65d2a1 100644 (file)
@@ -25,6 +25,7 @@
 
 #include "access/clog.h"
 #include "access/multixact.h"
+#include "access/nbtree.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/tuptoaster.h"
@@ -44,6 +45,7 @@
 #include "storage/ipc.h"
 #include "storage/pmsignal.h"
 #include "storage/procarray.h"
+#include "storage/sinval.h"
 #include "storage/smgr.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
@@ -51,6 +53,7 @@
 #include "utils/ps_status.h"
 #include "pg_trace.h"
 
+#define WAL_DEBUG
 
 /* File path names (all relative to $PGDATA) */
 #define BACKUP_LABEL_FILE              "backup_label"
@@ -58,6 +61,8 @@
 #define RECOVERY_COMMAND_FILE  "recovery.conf"
 #define RECOVERY_COMMAND_DONE  "recovery.done"
 
+/* copied from tcopprot.h rather than include whole file */
+extern int     PostAuthDelay;
 
 /* User-settable parameters */
 int                    CheckPointSegments = 3;
@@ -70,7 +75,9 @@ bool          log_checkpoints = false;
 int            sync_method = DEFAULT_SYNC_METHOD;
 
 #ifdef WAL_DEBUG
-bool           XLOG_DEBUG = false;
+bool           XLOG_DEBUG_FLUSH = false;
+bool           XLOG_DEBUG_BGFLUSH = false;
+bool           XLOG_DEBUG_REDO = true;
 #endif
 
 /*
@@ -124,33 +131,51 @@ TimeLineID        ThisTimeLineID = 0;
 bool           InRecovery = false;
 
 /* Are we recovering using offline XLOG archives? */
-static bool InArchiveRecovery = false;
+bool           InArchiveRecovery = false;
+
+static         XLogRecPtr      LastRec;
 
 /* Local copy of shared RecoveryProcessingMode state */
 static bool LocalRecoveryProcessingMode = true;
 static bool knownProcessingMode = false;
 
+/* is the database proven consistent yet? */
+bool   reachedSafeStartPoint = false;
+
 /* Was the last xlog file restored from archive, or local? */
 static bool restoredFromArchive = false;
 
 /* options taken from recovery.conf */
 static char *recoveryRestoreCommand = NULL;
-static bool recoveryTarget = false;
 static bool recoveryTargetExact = false;
 static bool recoveryTargetInclusive = true;
 static bool recoveryLogRestartpoints = false;
 static TransactionId recoveryTargetXid;
 static TimestampTz recoveryTargetTime;
+static int recoveryTargetAdvance = 0;
+
+/* recovery target modes */
+#define        RECOVERY_TARGET_NONE                            0
+#define RECOVERY_TARGET_PAUSE_ALL                      1
+#define RECOVERY_TARGET_PAUSE_XID                      2
+#define RECOVERY_TARGET_PAUSE_TIME                     3
+#define RECOVERY_TARGET_ADVANCE                                4
+#define RECOVERY_TARGET_STOP_IMMEDIATE         5
+#define RECOVERY_TARGET_STOP_XID                       6
+#define RECOVERY_TARGET_STOP_TIME                      7
+static int recoveryTargetMode = RECOVERY_TARGET_NONE; 
+
+#define DEFAULT_MAX_STANDBY_DELAY      0
+int maxStandbyDelay = DEFAULT_MAX_STANDBY_DELAY;
+
 static TimestampTz recoveryLastXTime = 0;
+static TransactionId recoveryLastXid = InvalidTransactionId;
 
 /* if recoveryStopsHere returns true, it saves actual stop xid/time here */
 static TransactionId recoveryStopXid;
 static TimestampTz recoveryStopTime;
 static bool recoveryStopAfter;
 
-/* is the database proven consistent yet? */
-bool   reachedSafeStartPoint = false;
-
 /*
  * During normal operation, the only timeline we care about is ThisTimeLineID.
  * During recovery, however, things are more complicated.  To simplify life
@@ -272,7 +297,7 @@ static XLogRecPtr RedoRecPtr;
  * the system. Changing PM_STARTUP to PM_RECOVERY only occurs when we can
  * prove the databases are in a consistent state. Changing from PM_RECOVERY
  * to PM_RUN happens whenever recovery ends, which could be forced upon us
- * externally or it can occur becasue of damage or termination of the WAL
+ * externally or it can occur because of damage or termination of the WAL
  * sequence.
  *----------
  */
@@ -371,6 +396,20 @@ typedef struct XLogCtlData
        bool            SharedRecoveryProcessingMode;
        slock_t         mode_lck;
 
+       /*
+        * recovery target control information
+        *
+        * Protected by info_lck
+        */
+       int                             recoveryTargetMode;
+       TransactionId   recoveryTargetXid;
+       TimestampTz             recoveryTargetTime;
+       int                             recoveryTargetAdvance;
+
+       TimestampTz     recoveryLastXTime;
+       TransactionId   recoveryLastXid;
+       XLogRecPtr              recoveryLastRecPtr;
+
        char            InfoLockPadding[XLOGCTL_BUFFER_SPACING];
 
        slock_t         info_lck;               /* locks shared variables shown above */
@@ -545,11 +584,14 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
        bool            updrqst;
        bool            doPageWrites;
        bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
-       bool            isRecoveryEnd = (rmid == RM_XLOG_ID && info == XLOG_RECOVERY_END);
+       bool            isRecoveryEnd = (rmid == RM_XLOG_ID && 
+                                                                       (info == XLOG_RECOVERY_END ||
+                                                                        info == XLOG_CHECKPOINT_ONLINE));
 
        /* cross-check on whether we should be here or not */
        if (IsRecoveryProcessingMode() && !isRecoveryEnd)
-               elog(FATAL, "cannot make new WAL entries during recovery");
+               elog(FATAL, "cannot make new WAL entries during recovery "
+                                       "(RMgrId = %d info = %d)", rmid, info);
 
        /* info's high bits are reserved for use by me */
        if (info & XLR_INFO_MASK)
@@ -888,6 +930,7 @@ begin:;
        record->xl_len = len;           /* doesn't include backup blocks */
        record->xl_info = info;
        record->xl_rmid = rmid;
+       record->xl_topxid = GetTopTransactionIdIfAny();
 
        /* Now we can finish computing the record's CRC */
        COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
@@ -895,25 +938,6 @@ begin:;
        FIN_CRC32(rdata_crc);
        record->xl_crc = rdata_crc;
 
-#ifdef WAL_DEBUG
-       if (XLOG_DEBUG)
-       {
-               StringInfoData buf;
-
-               initStringInfo(&buf);
-               appendStringInfo(&buf, "INSERT @ %X/%X: ",
-                                                RecPtr.xlogid, RecPtr.xrecoff);
-               xlog_outrec(&buf, record);
-               if (rdata->data != NULL)
-               {
-                       appendStringInfo(&buf, " - ");
-                       RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
-               }
-               elog(LOG, "%s", buf.data);
-               pfree(buf.data);
-       }
-#endif
-
        /* Record begin of record in appropriate places */
        ProcLastRecPtr = RecPtr;
        Insert->PrevRecord = RecPtr;
@@ -1804,7 +1828,7 @@ XLogFlush(XLogRecPtr record)
                return;
 
 #ifdef WAL_DEBUG
-       if (XLOG_DEBUG)
+       if (XLOG_DEBUG_FLUSH)
                elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
                         record.xlogid, record.xrecoff,
                         LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
@@ -1954,7 +1978,7 @@ XLogBackgroundFlush(void)
                return;
 
 #ifdef WAL_DEBUG
-       if (XLOG_DEBUG)
+       if (XLOG_DEBUG_BGFLUSH)
                elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
                         WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
                         LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
@@ -3027,6 +3051,9 @@ RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
        char       *blk;
        int                     i;
 
+       if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
+               return;
+
        blk = (char *) XLogRecGetData(record) + record->xl_len;
        for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
        {
@@ -4456,6 +4483,7 @@ BootStrapXLOG(void)
        record->xl_prev.xlogid = 0;
        record->xl_prev.xrecoff = 0;
        record->xl_xid = InvalidTransactionId;
+       record->xl_topxid = InvalidTransactionId;
        record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
        record->xl_len = sizeof(checkPoint);
        record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
@@ -4639,7 +4667,7 @@ readRecoveryCommandFile(void)
                        ereport(LOG,
                                        (errmsg("recovery_target_xid = %u",
                                                        recoveryTargetXid)));
-                       recoveryTarget = true;
+                       recoveryTargetMode = RECOVERY_TARGET_STOP_XID;
                        recoveryTargetExact = true;
                }
                else if (strcmp(tok1, "recovery_target_time") == 0)
@@ -4650,7 +4678,7 @@ readRecoveryCommandFile(void)
                         */
                        if (recoveryTargetExact)
                                continue;
-                       recoveryTarget = true;
+                       recoveryTargetMode = RECOVERY_TARGET_STOP_TIME;
                        recoveryTargetExact = false;
 
                        /*
@@ -4683,12 +4711,32 @@ readRecoveryCommandFile(void)
                         * does nothing if a recovery_target is not also set
                         */
                        if (!parse_bool(tok2, &recoveryLogRestartpoints))
-                               ereport(ERROR,
-                                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                                                       errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
+                                 ereport(ERROR,
+                                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                         errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
+                       ereport(LOG,
+                                       (errmsg("log_restartpoints = %s", tok2)));
+               }
+               else if (strcmp(tok1, "max_standby_delay") == 0)
+               {
+                       char   *endptr;
+                       long    delay;
+                       /* Parse as signed so that -1 (wait forever) is accepted directly */
+                       errno = 0;
+                       delay = strtol(tok2, &endptr, 0);
+                       if (errno != 0 || endptr == tok2 || *endptr != '\0')
+                               ereport(FATAL,
+                                (errmsg("max_standby_delay is not a valid number: \"%s\"",
+                                                tok2)));
+                       /* 2E6 secs is about 23 days; keeps the delay in millisecs within int */
+                       if (delay > 2000000 || delay < -1)
+                               ereport(FATAL,
+                                (errmsg("max_standby_delay must be between -1 (wait forever) and 2 000 000 secs")));
+                       maxStandbyDelay = (int) delay;
                        ereport(LOG,
-                               (errmsg("log_restartpoints = %s", tok2)));
-               }
+                                       (errmsg("max_standby_delay = %d",
+                                                       maxStandbyDelay)));
+               }
                else
                        ereport(FATAL,
                                        (errmsg("unrecognized recovery parameter \"%s\"",
@@ -4836,8 +4884,8 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
 }
 
 /*
- * For point-in-time recovery, this function decides whether we want to
- * stop applying the XLOG at or after the current record.
+ * For archive recovery, this function decides whether we want to
+ * pause or stop applying the XLOG at or after the current record.
  *
  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
  * *includeThis is set TRUE if we should apply this record before stopping.
@@ -4850,72 +4898,275 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
 static bool
 recoveryStopsHere(XLogRecord *record, bool *includeThis)
 {
-       bool            stopsHere;
-       uint8           record_info;
-       TimestampTz recordXtime;
-
+       bool            stopsHere = false;
+       bool            pauseHere = false;
+       static bool     paused = false;
+       uint8           record_info = 0;        /* valid iff (is_xact_completion_record) */
+       TimestampTz recordXtime = 0;
+       bool        is_xact_completion_record = false;
+  
        /* We only consider stopping at COMMIT or ABORT records */
-       if (record->xl_rmid != RM_XACT_ID)
-               return false;
-       record_info = record->xl_info & ~XLR_INFO_MASK;
-       if (record_info == XLOG_XACT_COMMIT)
+       if (record->xl_rmid == RM_XACT_ID)
        {
-               xl_xact_commit *recordXactCommitData;
+               record_info = record->xl_info & ~XLR_INFO_MASK;
+               if (record_info == XLOG_XACT_COMMIT)
+               {
+                       xl_xact_commit *recordXactCommitData;
 
-               recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
-               recordXtime = recordXactCommitData->xact_time;
-       }
-       else if (record_info == XLOG_XACT_ABORT)
-       {
-               xl_xact_abort *recordXactAbortData;
+                       recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
+                       recordXtime = recordXactCommitData->xact_time;
+                       is_xact_completion_record = true;
+               }
+               else if (record_info == XLOG_XACT_ABORT)
+               {
+                       xl_xact_abort *recordXactAbortData;
 
-               recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
-               recordXtime = recordXactAbortData->xact_time;
-       }
-       else
-               return false;
+                       recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
+                       recordXtime = recordXactAbortData->xact_time;
+                       is_xact_completion_record = true;
+               }
 
-       /* Do we have a PITR target at all? */
-       if (!recoveryTarget)
-       {
-               recoveryLastXTime = recordXtime;
-               return false;
+               /* Remember the most recent COMMIT/ABORT time for logging purposes */
+               if (is_xact_completion_record)
+               {
+                       recoveryLastXTime = recordXtime;
+                       recoveryLastXid = record->xl_xid;
+               }
        }
 
-       if (recoveryTargetExact)
+       do
        {
+               int     prevRecoveryTargetMode = recoveryTargetMode;    
+
+               CHECK_FOR_INTERRUPTS();
+
                /*
-                * there can be only one transaction end record with this exact
-                * transactionid
-                *
-                * when testing for an xid, we MUST test for equality only, since
-                * transactions are numbered in the order they start, not the order
-                * they complete. A higher numbered xid will complete before you about
-                * 50% of the time...
+                * Let's see if user has updated our recoveryTargetMode.
                 */
-               stopsHere = (record->xl_xid == recoveryTargetXid);
-               if (stopsHere)
-                       *includeThis = recoveryTargetInclusive;
-       }
-       else
-       {
+               {
+                       /* use volatile pointer to prevent code rearrangement */
+                       volatile XLogCtlData *xlogctl = XLogCtl;
+
+                       SpinLockAcquire(&xlogctl->info_lck);
+                       recoveryTargetMode = xlogctl->recoveryTargetMode;
+                       if (recoveryTargetMode != RECOVERY_TARGET_NONE)
+                       {
+                               recoveryTargetXid = xlogctl->recoveryTargetXid;
+                               recoveryTargetTime = xlogctl->recoveryTargetTime;
+
+                               /* Don't reset counter while we're advancing */
+                               if (recoveryTargetAdvance <= 0)
+                               {
+                                       recoveryTargetAdvance = xlogctl->recoveryTargetAdvance;
+                                       xlogctl->recoveryTargetAdvance = 0;
+                               }
+                       }
+                       if (is_xact_completion_record)
+                       {
+                               xlogctl->recoveryLastXTime = recordXtime;
+                               xlogctl->recoveryLastXid = record->xl_xid;
+                       }
+                       xlogctl->recoveryLastRecPtr = LastRec;
+                       SpinLockRelease(&xlogctl->info_lck);
+               }
+
+               /* Decide how to act on any pause target */
+               switch (recoveryTargetMode) 
+               {
+                       case RECOVERY_TARGET_NONE:
+                                       /* 
+                                        * If we aren't paused and we're not looking to stop,
+                                        * just exit out quickly and get on with recovery.
+                                        */
+                                       if (paused)
+                                       {
+                                               ereport(LOG, 
+                                                               (errmsg("recovery restarting after pause")));
+                                               set_ps_display("recovery continues", false);
+                                               paused = false;
+                                       }
+                                       return false;
+
+                       case RECOVERY_TARGET_PAUSE_ALL:
+                                       pauseHere = true;
+                                       break;
+
+                       case RECOVERY_TARGET_ADVANCE:
+                                       if (paused)
+                                       {
+                                               if (recoveryTargetAdvance-- > 0)
+                                               {
+                                                       elog(LOG, "recovery advancing 1 record");
+                                                       return false;
+                                               }
+                                               else
+                                                       break;
+                                       }
+
+                                       if (recoveryTargetAdvance-- <= 0)
+                                               pauseHere = true;
+                                       break;
+
+                       case RECOVERY_TARGET_STOP_IMMEDIATE:
+                       case RECOVERY_TARGET_STOP_XID:
+                       case RECOVERY_TARGET_STOP_TIME:
+                                       paused = false;
+                                       break;
+
+                       /* XXX unreachable as placed (after a break, before the next case label):
+                        * If we're paused, and mode has changed reset to allow new settings
+                        * to apply and maybe allow us to continue.
+                        */
+                       if (paused && prevRecoveryTargetMode != recoveryTargetMode)
+                               paused = false;
+
+                       case RECOVERY_TARGET_PAUSE_XID:
+                                       /*
+                                        * there can be only one transaction end record with this exact
+                                        * transactionid
+                                        *
+                                        * when testing for an xid, we MUST test for equality only, since
+                                        * transactions are numbered in the order they start, not the order
+                                        * they complete. A higher numbered xid will complete before you about
+                                        * 50% of the time...
+                                        */
+                                       if (is_xact_completion_record)
+                                               pauseHere = (record->xl_xid == recoveryTargetXid);
+                                       break;
+
+                       case RECOVERY_TARGET_PAUSE_TIME:
+                                       /*
+                                        * there can be many transactions that share the same commit time, so
+                                        * we pause after the last one, if we are inclusive, or pause at the
+                                        * first one if we are exclusive
+                                        */
+                                       if (is_xact_completion_record)
+                                       {
+                                               if (recoveryTargetInclusive)
+                                                       pauseHere = (recoveryLastXTime > recoveryTargetTime);
+                                               else
+                                                       pauseHere = (recoveryLastXTime >= recoveryTargetTime);
+                                       }
+                                       break;
+
+                       default:
+                                       ereport(WARNING,
+                                                       (errmsg("unknown recovery mode %d, continuing recovery", 
+                                                                                       recoveryTargetMode)));
+                                       return false;
+               }
+
                /*
-                * there can be many transactions that share the same commit time, so
-                * we stop after the last one, if we are inclusive, or stop at the
-                * first one if we are exclusive
+                * If we just entered pause, issue log messages
                 */
-               if (recoveryTargetInclusive)
-                       stopsHere = (recordXtime > recoveryTargetTime);
-               else
-                       stopsHere = (recordXtime >= recoveryTargetTime);
-               if (stopsHere)
-                       *includeThis = false;
+               if (pauseHere && !paused)
+               {
+                       if (is_xact_completion_record)
+                       {
+                               if (record_info == XLOG_XACT_COMMIT)
+                                       ereport(LOG,
+                                               (errmsg("recovery pausing before commit of transaction %u, log time %s",
+                                                                       record->xl_xid,
+                                                                       timestamptz_to_str(recoveryLastXTime))));
+                               else
+                                       ereport(LOG,
+                                               (errmsg("recovery pausing before abort of transaction %u, log time %s",
+                                                                       record->xl_xid,
+                                                                       timestamptz_to_str(recoveryLastXTime))));
+                       }
+                       else
+                               ereport(LOG,
+                                               (errmsg("recovery pausing; last recovered transaction %u, "
+                                                               "last recovered xact timestamp %s",
+                                                                       recoveryLastXid,
+                                                                       timestamptz_to_str(recoveryLastXTime))));
+
+                       set_ps_display("recovery paused", false);
+
+                       paused = true;
+               }
+
+               /*
+                * Pause for a while before rechecking mode at top of loop.
+                */
+               if (paused)
+               {
+                       recoveryTargetAdvance = 0;
+
+                       /*
+                        * Update the recoveryTargetMode
+                        */
+                       {
+                               /* use volatile pointer to prevent code rearrangement */
+                               volatile XLogCtlData *xlogctl = XLogCtl;
+
+                               SpinLockAcquire(&xlogctl->info_lck);
+                               xlogctl->recoveryTargetMode = RECOVERY_TARGET_PAUSE_ALL;
+                               xlogctl->recoveryTargetAdvance = 0;
+                               SpinLockRelease(&xlogctl->info_lck);
+                       }
+
+                       pg_usleep(200000L);
+               }
+               
+               /*
+                * We leave the loop at the bottom only if our recovery mode is
+                * set (or has been recently reset) to one of the stop options.
+                */
+       } while (paused);
+
+       /* 
+        * Decide how to act if stop target mode set. We run this separately from 
+        * pause to allow user to reset their stop target while paused.
+        */
+       switch (recoveryTargetMode) 
+       {
+               case RECOVERY_TARGET_STOP_IMMEDIATE:
+                               ereport(LOG,
+                                               (errmsg("recovery stopping immediately due to user request")));
+                               return true;
+
+               case RECOVERY_TARGET_STOP_XID:
+                               /*
+                                * there can be only one transaction end record with this exact
+                                * transactionid
+                                *
+                                * when testing for an xid, we MUST test for equality only, since
+                                * transactions are numbered in the order they start, not the order
+                                * they complete. A higher numbered xid will complete before you about
+                                * 50% of the time...
+                                */
+                               if (is_xact_completion_record)
+                               {
+                                       stopsHere = (record->xl_xid == recoveryTargetXid);
+                                       if (stopsHere)
+                                               *includeThis = recoveryTargetInclusive;
+                               }
+                               break;
+
+               case RECOVERY_TARGET_STOP_TIME:
+                               /*
+                                * there can be many transactions that share the same commit time, so
+                                * we stop after the last one, if we are inclusive, or stop at the
+                                * first one if we are exclusive
+                                */
+                               if (is_xact_completion_record)
+                               {
+                                       if (recoveryTargetInclusive)
+                                               stopsHere = (recoveryLastXTime > recoveryTargetTime);
+                                       else
+                                               stopsHere = (recoveryLastXTime >= recoveryTargetTime);
+                                       if (stopsHere)
+                                               *includeThis = false;
+                               }
+                               break;
        }
 
        if (stopsHere)
        {
+               Assert(is_xact_completion_record);
                recoveryStopXid = record->xl_xid;
-               recoveryStopTime = recordXtime;
+               recoveryStopTime = recoveryLastXTime;
                recoveryStopAfter = *includeThis;
 
                if (record_info == XLOG_XACT_COMMIT)
@@ -4944,14 +5195,289 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
                                                                recoveryStopXid,
                                                                timestamptz_to_str(recoveryStopTime))));
                }
+       }
 
-               if (recoveryStopAfter)
-                       recoveryLastXTime = recordXtime;
+       return stopsHere;
+}
+
+/*
+ * Common worker for the SQL-callable recovery control functions below.
+ *
+ * Stores the requested target mode in shared memory (XLogCtl), together
+ * with the one parameter that belongs to that mode: xid for the *_XID
+ * modes, ts for the *_TIME modes, advance for RECOVERY_TARGET_ADVANCE.
+ * The other parameters are ignored. All stores happen under info_lck.
+ *
+ * Raises an ERROR if the caller is not a superuser, or if recovery is
+ * not in progress.
+ */
+static void
+SetRecoveryTargetMode(int mode, TransactionId xid, TimestampTz ts, int advance)
+{
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
+
+       if (!superuser())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                                errmsg("must be superuser to control recovery")));
+
+       if (!IsRecoveryProcessingMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("recovery is not in progress"),
+                                errhint("WAL control functions can only be executed during recovery.")));
+
+       SpinLockAcquire(&xlogctl->info_lck);
+
+       xlogctl->recoveryTargetMode = mode;
+
+       /* only the parameter that goes with the chosen mode is stored */
+       switch (mode)
+       {
+               case RECOVERY_TARGET_STOP_XID:
+               case RECOVERY_TARGET_PAUSE_XID:
+                       xlogctl->recoveryTargetXid = xid;
+                       break;
+               case RECOVERY_TARGET_STOP_TIME:
+               case RECOVERY_TARGET_PAUSE_TIME:
+                       xlogctl->recoveryTargetTime = ts;
+                       break;
+               case RECOVERY_TARGET_ADVANCE:
+                       xlogctl->recoveryTargetAdvance = advance;
+                       break;
+               default:
+                       /* remaining modes carry no parameter */
+                       break;
+       }
+
+       SpinLockRelease(&xlogctl->info_lck);
+}
+
+/*
+ * SQL-callable: sets the recovery target mode back to RECOVERY_TARGET_NONE,
+ * i.e. un-freezes recovery. No xid, timestamp or advance count is passed.
+ * Returns void.
+ */
+Datum
+pg_recovery_continue(PG_FUNCTION_ARGS)
+{
+       SetRecoveryTargetMode(RECOVERY_TARGET_NONE,
+                                                 InvalidTransactionId,
+                                                 (TimestampTz) 0,
+                                                 0);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable: requests RECOVERY_TARGET_PAUSE_ALL, which pauses recovery
+ * immediately. Recovery stays paused until asked to play again.
+ * Returns void.
+ */
+Datum
+pg_recovery_pause(PG_FUNCTION_ARGS)
+{
+       SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_ALL,
+                                                 InvalidTransactionId,
+                                                 (TimestampTz) 0,
+                                                 0);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * Pause recovery at stated xid, if ever seen. Once paused, stays paused
+ * until asked to play again.
+ *
+ * The xid arrives as an int32 argument and is reinterpreted as a
+ * TransactionId. Special (non-normal) transaction ids are rejected with
+ * an invalid-parameter ERROR, since they never identify a real transaction.
+ * Returns void.
+ */
+Datum
+pg_recovery_pause_xid(PG_FUNCTION_ARGS)
+{
+       int                       xidi = PG_GETARG_INT32(0);
+       TransactionId xid = (TransactionId) xidi;
+
+       /* same test as "xid < 3", spelled with the standard macro */
+       if (!TransactionIdIsNormal(xid))
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("cannot specify special values for transaction id")));
+
+       SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_XID, xid, 0, 0);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable: requests RECOVERY_TARGET_PAUSE_TIME with the timestamp
+ * given as the first argument, so that recovery pauses at that timestamp,
+ * if it is ever reached. Once paused, recovery stays paused until asked
+ * to play again.
+ * Returns void.
+ */
+Datum
+pg_recovery_pause_time(PG_FUNCTION_ARGS)
+{
+       SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_TIME,
+                                                 InvalidTransactionId,
+                                                 PG_GETARG_TIMESTAMPTZ(0),
+                                                 0);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * If paused, advance N records.
+ *
+ * N is the int32 first argument and must be at least 1; anything smaller
+ * is rejected with an invalid-parameter ERROR. The count is passed on via
+ * RECOVERY_TARGET_ADVANCE.
+ * Returns void.
+ */
+Datum
+pg_recovery_advance(PG_FUNCTION_ARGS)
+{
+       int adv = PG_GETARG_INT32(0);
+
+       /* user-supplied value, so report it as a user error, not an internal one */
+       if (adv < 1)
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("recovery advance must be greater than or equal to 1")));
+
+       SetRecoveryTargetMode(RECOVERY_TARGET_ADVANCE, InvalidTransactionId, 0, adv);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable: requests RECOVERY_TARGET_STOP_IMMEDIATE, which forces
+ * recovery to stop now if paused, or at end of next record if playing.
+ * Returns void.
+ */
+Datum
+pg_recovery_stop(PG_FUNCTION_ARGS)
+{
+       SetRecoveryTargetMode(RECOVERY_TARGET_STOP_IMMEDIATE,
+                                                 InvalidTransactionId,
+                                                 (TimestampTz) 0,
+                                                 0);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * Returns a text description of the recovery target currently recorded
+ * in shared memory.
+ *
+ * The target mode is copied out of XLogCtl under info_lck into this
+ * process's own variables; for any mode other than RECOVERY_TARGET_NONE
+ * the target xid, time and advance count are copied as well. The message
+ * is then built from those copies, outside the spinlock.
+ */
+Datum
+pg_current_recovery_target(PG_FUNCTION_ARGS)
+{
+       StringInfoData buf;
+
+       initStringInfo(&buf);
+
+       {
+               /* use volatile pointer to prevent code rearrangement */
+               volatile XLogCtlData *xlogctl = XLogCtl;
+
+               SpinLockAcquire(&xlogctl->info_lck);
+
+               recoveryTargetMode = xlogctl->recoveryTargetMode;
+               if (recoveryTargetMode != RECOVERY_TARGET_NONE)
+               {
+                       recoveryTargetXid = xlogctl->recoveryTargetXid;
+                       recoveryTargetTime = xlogctl->recoveryTargetTime;
+                       recoveryTargetAdvance = xlogctl->recoveryTargetAdvance;
+               }
+
+               SpinLockRelease(&xlogctl->info_lck);
+       }
+
+       switch (recoveryTargetMode)
+       {
+               case RECOVERY_TARGET_NONE:
+                               appendStringInfo(&buf, "No recovery target has been set");
+                               break;
+               case RECOVERY_TARGET_PAUSE_ALL:
+                               appendStringInfo(&buf, "Recovery paused");
+                               break;
+               case RECOVERY_TARGET_PAUSE_XID:
+                               appendStringInfo(&buf, "Recovery will pause after commit of transaction %u", recoveryTargetXid);
+                               break;
+               case RECOVERY_TARGET_PAUSE_TIME:
+                               appendStringInfo(&buf, "Recovery will pause after transaction completion timestamp %s",
+                                                                               timestamptz_to_str(recoveryTargetTime));
+                               break;
+               case RECOVERY_TARGET_ADVANCE:
+                               appendStringInfo(&buf, "Recovery will advance");
+                               break;
+               case RECOVERY_TARGET_STOP_IMMEDIATE:
+                               /* a stop was requested, so do not claim that no target is set */
+                               appendStringInfo(&buf, "Recovery will stop immediately");
+                               break;
+               case RECOVERY_TARGET_STOP_XID:
+                               appendStringInfo(&buf, "Recovery will stop after commit of transaction %u", recoveryTargetXid);
+                               break;
+               case RECOVERY_TARGET_STOP_TIME:
+                               appendStringInfo(&buf, "Recovery will stop after transaction completion timestamp %s",
+                                                                               timestamptz_to_str(recoveryTargetTime));
+                               break;
+       }
+
+       PG_RETURN_TEXT_P(cstring_to_text(buf.data));
+}
+
+/*
+ * SQL-callable: returns, as a bool, whatever IsRecoveryProcessingMode()
+ * reports for the current recovery mode (a global state).
+ */
+Datum
+pg_is_in_recovery(PG_FUNCTION_ARGS)
+{
+       bool            in_recovery = IsRecoveryProcessingMode();
+
+       PG_RETURN_BOOL(in_recovery);
+}
+
+/*
+ * Returns timestamp of last completed transaction.
+ *
+ * The value is copied from XLogCtl->recoveryLastXTime, under info_lck,
+ * into this process's recoveryLastXTime variable and returned from there.
+ */
+Datum
+pg_last_recovered_xact_timestamp(PG_FUNCTION_ARGS)
+{
+       {
+               /* use volatile pointer to prevent code rearrangement around the spinlock */
+               volatile XLogCtlData *xlogctl = XLogCtl;
+
+               SpinLockAcquire(&xlogctl->info_lck);
+
+               recoveryLastXTime = xlogctl->recoveryLastXTime;
+
+               SpinLockRelease(&xlogctl->info_lck);
        }
+
+       PG_RETURN_TIMESTAMPTZ(recoveryLastXTime);
+}
+
+/*
+ * SQL-callable: returns the xid of the last completed transaction, as
+ * recorded in XLogCtl->recoveryLastXid.
+ *
+ * NOTE(review): the value is handed back through PG_RETURN_INT32; if
+ * recoveryLastXid is an unsigned TransactionId, xids above INT_MAX would
+ * presumably show up as negative numbers -- confirm against the catalog
+ * declaration of this function.
+ */
+Datum
+pg_last_recovered_xid(PG_FUNCTION_ARGS)
+{
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
+
+       /* refresh this process's copy while holding info_lck */
+       SpinLockAcquire(&xlogctl->info_lck);
+       recoveryLastXid = xlogctl->recoveryLastXid;
+       SpinLockRelease(&xlogctl->info_lck);
+
+       PG_RETURN_INT32(recoveryLastXid);
+}
+
+/*
+ * SQL-callable: returns the xlog location of the last recovered WAL
+ * record as text, formatted "%X/%X" from the xlogid and xrecoff fields.
+ *
+ * The location is copied from XLogCtl->recoveryLastRecPtr into LastRec
+ * under info_lck; the formatting is done after the lock is released.
+ */
+Datum
+pg_last_recovered_xlog_location(PG_FUNCTION_ARGS)
+{
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
+       char            location[MAXFNAMELEN];
+
+       SpinLockAcquire(&xlogctl->info_lck);
+       LastRec = xlogctl->recoveryLastRecPtr;
+       SpinLockRelease(&xlogctl->info_lck);
+
+       snprintf(location, sizeof(location), "%X/%X",
+                        LastRec.xlogid, LastRec.xrecoff);
+
+       PG_RETURN_TEXT_P(cstring_to_text(location));
+}
+
+/*
+ * Returns delay in milliseconds, or -1 if delay too large
+ *
+ * The delay is the difference between this process's recoveryLastXTime
+ * and the current timestamp, as computed by TimestampDifference().
+ */
+int
+GetLatestReplicationDelay(void)
+{
+       long            delay_secs;
+       int                     delay_usecs;
+       int                     delay;
+       TimestampTz currTz = GetCurrentTimestamp();
+
+       TimestampDifference(recoveryLastXTime, currTz,
+                                               &delay_secs, &delay_usecs);
+
+       /*
+        * If delay is very large we probably aren't looking at
+        * a replication situation at all, just a recovery from backup.
+        * So return a special value instead.
+        *
+        * The test must be >=, not >: at delay_secs == INT_MAX / 1000 the
+        * millisecond part added below could still push the sum past INT_MAX.
+        */
+       if (delay_secs >= (long)(INT_MAX / 1000))
+               delay = -1;
        else
-               recoveryLastXTime = recordXtime;
+               delay = (int)(delay_secs * 1000) + (delay_usecs / 1000);
 
-       return stopsHere;
+       return delay;
 }
 
 /*
@@ -4967,7 +5493,6 @@ StartupXLOG(void)
        bool            performedRecovery = false;
        bool            haveBackupLabel = false;
        XLogRecPtr      RecPtr,
-                               LastRec,
                                checkPointLoc,
                                minRecoveryLoc,
                                EndOfLog;
@@ -5043,6 +5568,16 @@ StartupXLOG(void)
         */
        readRecoveryCommandFile();
 
+       /*
+        * PostAuthDelay is a debugging aid for investigating problems in startup
+        * and/or recovery: it can be set in postgresql.conf to allow time to
+        * attach to the newly-forked backend with a debugger. It can also be set
+        * using the postmaster -W switch, which can be specified using the -o
+        * option of pg_ctl, e.g. pg_ctl -D data -o "-W 30"
+        */
+       if (PostAuthDelay > 0)
+               pg_usleep(PostAuthDelay * 1000000L);
+
        /* Now we can determine the list of expected TLIs */
        expectedTLIs = readTimeLineHistory(recoveryTargetTLI);
 
@@ -5264,21 +5799,29 @@ StartupXLOG(void)
                        do
                        {
 #ifdef WAL_DEBUG
-                               if (XLOG_DEBUG)
+                               if (XLOG_DEBUG_REDO)
                                {
-                                       StringInfoData buf;
-
-                                       initStringInfo(&buf);
-                                       appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
-                                                                        ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
-                                                                        EndRecPtr.xlogid, EndRecPtr.xrecoff);
-                                       xlog_outrec(&buf, record);
-                                       appendStringInfo(&buf, " - ");
-                                       RmgrTable[record->xl_rmid].rm_desc(&buf,
-                                                                                                          record->xl_info,
-                                                                                                        XLogRecGetData(record));
-                                       elog(LOG, "%s", buf.data);
-                                       pfree(buf.data);
+                                       int             loglevel = DEBUG3;
+
+                                       if (rmid == RM_XACT_ID)
+                                               loglevel = DEBUG2;
+
+                                       if (loglevel >= trace_recovery_messages)
+                                       {
+                                               StringInfoData buf;
+
+                                               initStringInfo(&buf);
+                                               appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
+                                                                                ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
+                                                                                EndRecPtr.xlogid, EndRecPtr.xrecoff);
+                                               xlog_outrec(&buf, record);
+                                               appendStringInfo(&buf, " - ");
+                                               RmgrTable[record->xl_rmid].rm_desc(&buf,
+                                                                                                                  record->xl_info,
+                                                                                                                XLogRecGetData(record));
+                                               elog(LOG, "%s", buf.data);
+                                               pfree(buf.data);
+                                       }
                                }
 #endif
 
@@ -5309,32 +5852,41 @@ StartupXLOG(void)
 
                                RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
 
+                               Assert(NumLWLocksHeldByMe() == 0);
+
                                /* Pop the error context stack */
                                error_context_stack = errcontext.previous;
 
                                LastRec = ReadRecPtr;
 
                                /*
-                                * Have we reached our safe starting point? If so, we can
-                                * signal Postmaster to enter consistent recovery mode.
-                                *
-                                * There are two point in the log we must pass. The first is
-                                * the minRecoveryPoint, which is the LSN at the time the
-                                * base backup was taken that we are about to rollfoward from.
-                                * If recovery has ever crashed or was stopped there is 
-                                * another point also: minSafeStartPoint, which we know the
-                                * latest LSN that recovery could have reached prior to crash.
-                                */
-                               if (!reachedSafeStartPoint && 
-                                        XLByteLE(ControlFile->minSafeStartPoint, EndRecPtr) && 
-                                        XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
+                               * Can we signal Postmaster to enter consistent recovery mode?
+                               *
+                               * There are two points in the log that we must pass. The first
+                               * is minRecoveryPoint, which is the LSN at the time the
+                               * base backup was taken that we are about to rollforward from.
+                               * If recovery has ever crashed or was stopped there is
+                               * another point also: minSafeStartPoint, which we know is the
+                               * latest LSN that recovery could have reached prior to crash.
+                               *
+                               * We must also have assembled sufficient information about
+                               * transaction state to allow valid snapshots to be taken.
+                               * In some circumstances that may change, but we only call
+                               * this once, not each time we re-enable snapshots.
+                               */
+                               if (!reachedSafeStartPoint &&
+                                       IsRunningXactDataValid() &&
+                                       XLByteLE(ControlFile->minSafeStartPoint, EndRecPtr) && 
+                                       XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
                                {
-                                       reachedSafeStartPoint = true;
+                                       reachedSafeStartPoint = true;  /* so we only do this once */
                                        if (InArchiveRecovery)
                                        {
                                                ereport(LOG,
-                                                       (errmsg("consistent recovery state reached at %X/%X",
-                                                               EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+                                                               (errmsg("database has now reached consistent state at %X/%X",
+                                                                               EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+                                               InitRecoveryTransactionEnvironment();
+                                               StartCleanupDelayStats();
                                                if (IsUnderPostmaster)
                                                        SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
                                        }
@@ -5377,14 +5929,14 @@ StartupXLOG(void)
         * Complain if we did not roll forward far enough to render the backup
         * dump consistent and start safely.
         */
-       if (InRecovery && !reachedSafeStartPoint)
+       if (InArchiveRecovery && !reachedSafeStartPoint)
        {
                if (reachedStopPoint)   /* stopped because of stop request */
                        ereport(FATAL,
                                        (errmsg("requested recovery stop point is before end time of backup dump")));
                else    /* ran off end of WAL */
                        ereport(FATAL,
-                                       (errmsg("WAL ends before end time of backup dump")));
+                                       (errmsg("end of WAL reached before end time of backup dump")));
        }
 
        /*
@@ -5515,6 +6067,10 @@ StartupXLOG(void)
        ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
        TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
 
+       /* Shutdown the recovery environment. Must be in this order */
+       ProcArrayClearRecoveryTransactions();
+       RelationClearRecoveryLocks();
+
        /* Start up the commit log and related stuff, too */
        StartupCLOG();
        StartupSUBTRANS(oldestActiveXID);
@@ -5561,19 +6117,29 @@ StartupXLOG(void)
                redo = GetRedoLocationForCheckpoint();
 
                /* 
-                * Tell the bgwriter
-                */
-               SetRedoLocationForArchiveCheckpoint(redo);
-
-               /*
-                * Okay, we can come up now. Allow others to write WAL.
+                * Set up information for the bgwriter, but if it is not active
+                * for whatever reason, perform the checkpoint ourselves.
                 */
-               XLogCtl->SharedRecoveryProcessingMode = false;
+               if (SetRedoLocationForArchiveCheckpoint(redo))
+               {
+                       /*
+                        * Okay, we can come up now. Allow others to write WAL.
+                        */
+                       XLogCtl->SharedRecoveryProcessingMode = false;
 
-               /*
-                * Now request checkpoint
-                */
-               RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+                       /*
+                        * Now request checkpoint from bgwriter.
+                        */
+                       RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+               }
+               else
+               {
+                       /*
+                        * Startup process performs the checkpoint, but defers
+                        * the change in processing mode until afterwards.
+                        */
+                       CreateCheckPoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+               }
        }
        else
        {
@@ -5585,16 +6151,15 @@ StartupXLOG(void)
                ControlFile->time = (pg_time_t) time(NULL);
                UpdateControlFile();
                LWLockRelease(ControlFileLock);
-
-               /*
-                * Okay, we're officially UP.
-                */
-               XLogCtl->SharedRecoveryProcessingMode = false;
        }
 
+       /*
+        * Okay, we can come up now. Allow others to write WAL.
+        */
+       XLogCtl->SharedRecoveryProcessingMode = false;
+
        /* start the archive_timeout timer running */
        XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
-
 }
 
 /*
@@ -5615,6 +6180,9 @@ IsRecoveryProcessingMode(void)
                /* use volatile pointer to prevent code rearrangement */
                volatile XLogCtlData *xlogctl = XLogCtl;
 
+               if (xlogctl == NULL)
+                       return false;
+
                SpinLockAcquire(&xlogctl->mode_lck);
                LocalRecoveryProcessingMode = XLogCtl->SharedRecoveryProcessingMode;
                SpinLockRelease(&xlogctl->mode_lck);
@@ -5882,7 +6450,7 @@ LogCheckpointStart(int flags)
 {
        if (flags & CHECKPOINT_RESTARTPOINT)
                elog(LOG, "restartpoint starting:%s",
-                        (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "");
+                       (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "");
        else
                elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
                         (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
@@ -6020,51 +6588,51 @@ CreateCheckPoint(int flags)
        checkPoint.ThisTimeLineID = ThisTimeLineID;
        checkPoint.time = (pg_time_t) time(NULL);
 
-       if (leavingArchiveRecovery)
-               checkPoint.redo = GetRedoLocationForArchiveCheckpoint();
-       else
-       {
-               /*
-                * We must hold WALInsertLock while examining insert state to determine
-                * the checkpoint REDO pointer.
-                */
-               LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+       /*
+        * We must hold WALInsertLock while examining insert state to determine
+        * the checkpoint REDO pointer.
+        */
+       LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 
-               /*
-                * If this isn't a shutdown or forced checkpoint, and we have not inserted
-                * any XLOG records since the start of the last checkpoint, skip the
-                * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
-                * when the system is idle. That wastes log space, and more importantly it
-                * exposes us to possible loss of both current and previous checkpoint
-                * records if the machine crashes just as we're writing the update.
-                * (Perhaps it'd make even more sense to checkpoint only when the previous
-                * checkpoint record is in a different xlog page?)
-                *
-                * We have to make two tests to determine that nothing has happened since
-                * the start of the last checkpoint: current insertion point must match
-                * the end of the last checkpoint record, and its redo pointer must point
-                * to itself.
-                */
-               if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
+       /*
+        * If this isn't a shutdown or forced checkpoint, and we have not inserted
+        * any XLOG records since the start of the last checkpoint, skip the
+        * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
+        * when the system is idle. That wastes log space, and more importantly it
+        * exposes us to possible loss of both current and previous checkpoint
+        * records if the machine crashes just as we're writing the update.
+        * (Perhaps it'd make even more sense to checkpoint only when the previous
+        * checkpoint record is in a different xlog page?)
+        *
+        * We have to make two tests to determine that nothing has happened since
+        * the start of the last checkpoint: current insertion point must match
+        * the end of the last checkpoint record, and its redo pointer must point
+        * to itself.
+        */
+       if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
+       {
+               XLogRecPtr      curInsert;
+
+               INSERT_RECPTR(curInsert, Insert, Insert->curridx);
+               if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
+                       curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
+                       MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
+                       ControlFile->checkPoint.xlogid ==
+                       ControlFile->checkPointCopy.redo.xlogid &&
+                       ControlFile->checkPoint.xrecoff ==
+                       ControlFile->checkPointCopy.redo.xrecoff)
                {
-                       XLogRecPtr      curInsert;
-
-                       INSERT_RECPTR(curInsert, Insert, Insert->curridx);
-                       if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
-                               curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
-                               MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
-                               ControlFile->checkPoint.xlogid ==
-                               ControlFile->checkPointCopy.redo.xlogid &&
-                               ControlFile->checkPoint.xrecoff ==
-                               ControlFile->checkPointCopy.redo.xrecoff)
-                       {
-                               LWLockRelease(WALInsertLock);
-                               LWLockRelease(CheckpointLock);
-                               END_CRIT_SECTION();
-                               return;
-                       }
+                       LWLockRelease(WALInsertLock);
+                       LWLockRelease(CheckpointLock);
+                       END_CRIT_SECTION();
+                       return;
                }
+       }
 
+       if (leavingArchiveRecovery)
+               checkPoint.redo = GetRedoLocationForArchiveCheckpoint();
+       else
+       {
                /*
                 * Compute new REDO record ptr = location of next XLOG record.
                 *
@@ -6074,14 +6642,14 @@ CreateCheckPoint(int flags)
                 * checkpoint, even though physically before it.  Got that?
                 */
                checkPoint.redo = GetRedoLocationForCheckpoint();
-
-               /*
-                * Now we can release WAL insert lock, allowing other xacts to proceed
-                * while we are flushing disk buffers.
-                */
-               LWLockRelease(WALInsertLock);
        }
 
+       /*
+        * Now we can release WAL insert lock, allowing other xacts to proceed
+        * while we are flushing disk buffers.
+        */
+       LWLockRelease(WALInsertLock);
+
        /*
         * If enabled, log checkpoint start.  We postpone this until now so as not
         * to log anything if we decided to skip the checkpoint.
@@ -6199,18 +6767,15 @@ CreateCheckPoint(int flags)
         * that this is executed by bgwriter after the death of Startup process.
         */
        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-
        if (shutdown)
                ControlFile->state = DB_SHUTDOWNED;
        else
                ControlFile->state = DB_IN_PRODUCTION;
-
        ControlFile->prevCheckPoint = ControlFile->checkPoint;
        ControlFile->checkPoint = ProcLastRecPtr;
        ControlFile->checkPointCopy = checkPoint;
        ControlFile->time = (pg_time_t) time(NULL);
        UpdateControlFile();
-
        LWLockRelease(ControlFileLock);
 
        if (leavingArchiveRecovery)
@@ -6223,9 +6788,9 @@ CreateCheckPoint(int flags)
                unlink(RECOVERY_COMMAND_DONE);
                if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
                        ereport(ERROR,
-                                       (errcode_for_file_access(),
+                                   (errcode_for_file_access(),
                                         errmsg("could not rename file \"%s\" to \"%s\": %m",
-                                                       RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
+                                                               RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
        }
 
        /* Update shared-memory copy of checkpoint XID/epoch */
@@ -6271,10 +6836,10 @@ CreateCheckPoint(int flags)
         * Truncate pg_subtrans if possible.  We can throw away all data before
         * the oldest XMIN of any running transaction.  No future transaction will
         * attempt to reference any pg_subtrans entry older than that (see Asserts
-        * in subtrans.c).      During recovery, though, we mustn't do this because
-        * StartupSUBTRANS hasn't been called yet.
+        * in subtrans.c).      
         */
-       TruncateSUBTRANS(GetOldestXmin(true, false));
+       if (!shutdown)
+               TruncateSUBTRANS(GetOldestXmin(true, false));
 
        /* All real work is done, but log before releasing lock. */
        if (log_checkpoints)
@@ -6286,8 +6851,19 @@ CreateCheckPoint(int flags)
                                 CheckpointStats.ckpt_segs_recycled);
 
        LWLockRelease(CheckpointLock);
-}
 
+       /*
+        * Take a snapshot of running transactions and write this to WAL.
+        * This allows us to reconstruct the state of running transactions 
+        * during archive recovery, if required.
+        * 
+        * If we are shutting down, or Startup process is completing crash
+        * recovery we don't need to write running xact data.
+        */
+       if (!shutdown && !IsRecoveryProcessingMode())
+               LogCurrentRunningXacts();
+}
 /* 
  * GetRedoLocationForCheckpoint()
  *
@@ -6298,15 +6874,15 @@ static XLogRecPtr
 GetRedoLocationForCheckpoint()
 {
        XLogCtlInsert  *Insert = &XLogCtl->Insert;
-       uint32                  freespace;
-       XLogRecPtr              redo;
+       uint32                  freespace;
+       XLogRecPtr              redo;
 
        freespace = INSERT_FREESPACE(Insert);
        if (freespace < SizeOfXLogRecord)
        {
-               (void) AdvanceXLInsertBuffer(false);
-               /* OK to ignore update return flag, since we will do flush anyway */
-               freespace = INSERT_FREESPACE(Insert);
+               (void) AdvanceXLInsertBuffer(false);
+               /* OK to ignore update return flag, since we will do flush anyway */
+               freespace = INSERT_FREESPACE(Insert);
        }
        INSERT_RECPTR(redo, Insert, Insert->curridx);
 
@@ -6322,12 +6898,12 @@ GetRedoLocationForCheckpoint()
         * their buffer changes are not included in the checkpoint.
         */
        {
-               /* use volatile pointer to prevent code rearrangement */
-               volatile XLogCtlData *xlogctl = XLogCtl;
+               /* use volatile pointer to prevent code rearrangement */
+               volatile XLogCtlData *xlogctl = XLogCtl;
 
-               SpinLockAcquire(&xlogctl->info_lck);
-               RedoRecPtr = xlogctl->Insert.RedoRecPtr = redo;
-               SpinLockRelease(&xlogctl->info_lck);
+        SpinLockAcquire(&xlogctl->info_lck);
+        RedoRecPtr = xlogctl->Insert.RedoRecPtr = redo;
+        SpinLockRelease(&xlogctl->info_lck);
        }
 
        return redo;
@@ -6389,7 +6965,7 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
                if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
                        if (!(RmgrTable[rmid].rm_safe_restartpoint()))
                        {
-                               elog(DEBUG2, "RM %d not safe to record restart point at %X/%X",
+                               elog(trace_recovery(DEBUG2), "RM %d not safe to record restart point at %X/%X",
                                         rmid,
                                         checkPoint->redo.xlogid,
                                         checkPoint->redo.xrecoff);
@@ -6401,30 +6977,30 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
 }
 
 /*
- * As of 8.4, RestartPoints are always created by the bgwriter
- * once we have reachedSafeStartPoint. We use bgwriter's shared memory
- * area wherever we call it from, to keep better code structure.
- */
+* As of 8.4, RestartPoints are always created by the bgwriter
+* once we have reachedSafeStartPoint. We use bgwriter's shared memory
+* area wherever we call it from, to keep better code structure.
+*/
 void
 CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, int flags)
 {
-       if (recoveryLogRestartpoints)
+       if (recoveryLogRestartpoints || log_checkpoints)
        {
-               /*
+               /*
                 * Prepare to accumulate statistics.
-                */
+                */
 
                MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
                CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
 
                LogCheckpointStart(CHECKPOINT_RESTARTPOINT | flags);
        }
-
-       /*
+  
+       /*
         * Acquire CheckpointLock to ensure only one restartpoint happens at a time.
         * We rely on this lock to ensure that the startup process doesn't exit
         * Recovery while we are half way through a restartpoint.
-        */
+        */
        LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
 
        CheckPointGuts(restartPoint->redo, CHECKPOINT_RESTARTPOINT | flags);
@@ -6433,11 +7009,11 @@ CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, int
         * Update pg_control, using current time
         */
        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-       ControlFile->prevCheckPoint = ControlFile->checkPoint;
+       ControlFile->prevCheckPoint = ControlFile->checkPoint;
        ControlFile->checkPoint = ReadPtr;
        ControlFile->checkPointCopy = *restartPoint;
-       ControlFile->time = (pg_time_t) time(NULL);
-       UpdateControlFile();
+       ControlFile->time = (pg_time_t) time(NULL);
+       UpdateControlFile();
        LWLockRelease(ControlFileLock);
 
        /*
@@ -6447,21 +7023,23 @@ CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, int
         */
 
        /* All real work is done, but log before releasing lock. */
-       if (recoveryLogRestartpoints)
+       if (recoveryLogRestartpoints || log_checkpoints)
                LogCheckpointEnd(CHECKPOINT_RESTARTPOINT);
-
+  
        ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
-                       (errmsg("recovery restart point at %X/%X",
+                       (errmsg("recovery restart point at %X/%X",
                                        restartPoint->redo.xlogid, restartPoint->redo.xrecoff)));
 
-       if (recoveryLastXTime)
-               ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
+       ReportCleanupDelayStats();
+
+       if (recoveryLastXTime)
+               ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
                        (errmsg("last completed transaction was at log time %s",
                                        timestamptz_to_str(recoveryLastXTime))));
 
        LWLockRelease(CheckpointLock);
 }
-
+  
 /*
  * Write a NEXTOID log record
  */
@@ -6554,7 +7132,7 @@ exitRecovery(void)
        else
        {
                RequestRestartPointCompletion();
-               ereport(LOG,
+               ereport(trace_recovery(DEBUG1),
                        (errmsg("startup process waiting for restartpoint to complete")));
                LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
                LWLockRelease(CheckpointLock);
@@ -6594,6 +7172,9 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
        {
                Oid                     nextOid;
 
+               if (InArchiveRecovery)
+                       (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
                memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
                if (ShmemVariableCache->nextOid < nextOid)
                {
@@ -6613,11 +7194,15 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                MultiXactSetNextMXact(checkPoint.nextMulti,
                                                          checkPoint.nextMultiOffset);
 
+               /* We know nothing was running on the master at this point */
+               ProcArrayClearRecoveryTransactions();
+               RelationClearRecoveryLocks();
+
                /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
                ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
                ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
 
-               /*
+               /*
                 * TLI no longer changes at shutdown checkpoint, since as of 8.4,
                 * shutdown checkpoints only occur at shutdown. Much less confusing.
                 */
@@ -6630,6 +7215,10 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 
                memcpy(&tli, XLogRecGetData(record), sizeof(TimeLineID));
 
+               /* We know nothing was running on the master at this point */
+               ProcArrayClearRecoveryTransactions();
+               RelationClearRecoveryLocks();
+
                /*
                 * TLI may change when recovery ends, but it shouldn't decrease.
                 *
@@ -6640,17 +7229,17 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                 * new timelineID which is recorded using this record type.
                 */
                if (tli != ThisTimeLineID)
-               {
+               {
                        if (tli < ThisTimeLineID ||
-                               !list_member_int(expectedTLIs,
+                               !list_member_int(expectedTLIs,
                                                                 (int) tli))
-                               ereport(PANIC,
+                               ereport(PANIC,
                                                (errmsg("unexpected timeline ID %u (after %u) at recovery end record",
                                                                tli, ThisTimeLineID)));
-                       /* Following WAL records should be run with new TLI */
+                       /* Following WAL records should be run with new TLI */
                        ThisTimeLineID = tli;
-               }
-       }
+               }
+       }
        else if (info == XLOG_CHECKPOINT_ONLINE)
        {
                CheckPoint      checkPoint;
@@ -6740,6 +7329,10 @@ xlog_outrec(StringInfo buf, XLogRecord *record)
                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
                                         record->xl_xid);
 
+       appendStringInfo(buf, "; pxid %u len %u",
+                                        record->xl_topxid, 
+                                        record->xl_len);
+
        for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
        {
                if (record->xl_info & XLR_SET_BKP_BLOCK(i))
@@ -6895,6 +7488,12 @@ pg_start_backup(PG_FUNCTION_ARGS)
                                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                                 errmsg("must be superuser to run a backup")));
 
+       if (IsRecoveryProcessingMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("recovery is in progress"),
+                                errhint("WAL control functions cannot be executed during recovery.")));
+
        if (!XLogArchivingActive())
                ereport(ERROR,
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
@@ -7067,6 +7666,12 @@ pg_stop_backup(PG_FUNCTION_ARGS)
                                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                                 (errmsg("must be superuser to run a backup"))));
 
+       if (IsRecoveryProcessingMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("recovery is in progress"),
+                                errhint("WAL control functions cannot be executed during recovery.")));
+
        if (!XLogArchivingActive())
                ereport(ERROR,
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
@@ -7228,6 +7833,12 @@ pg_switch_xlog(PG_FUNCTION_ARGS)
                                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                         (errmsg("must be superuser to switch transaction log files"))));
 
+       if (IsRecoveryProcessingMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("recovery is in progress"),
+                                errhint("WAL control functions cannot be executed during recovery.")));
+
        switchpoint = RequestXLogSwitch();
 
        /*
@@ -7250,6 +7861,12 @@ pg_current_xlog_location(PG_FUNCTION_ARGS)
 {
        char            location[MAXFNAMELEN];
 
+       if (IsRecoveryProcessingMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("recovery is in progress"),
+                                errhint("WAL control functions cannot be executed during recovery.")));
+
        /* Make sure we have an up-to-date local LogwrtResult */
        {
                /* use volatile pointer to prevent code rearrangement */
@@ -7277,6 +7894,12 @@ pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
        XLogRecPtr      current_recptr;
        char            location[MAXFNAMELEN];
 
+       if (IsRecoveryProcessingMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("recovery is in progress"),
+                                errhint("WAL control functions cannot be executed during recovery.")));
+
        /*
         * Get the current end-of-WAL position ... shared lock is sufficient
         */
index 309fa469adf7fcfdbee9c904b930435e25f313a0..cfbd9d3c469cf414b5fb04fc8b44e4ba0bf75a87 100644 (file)
@@ -404,6 +404,9 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record)
        /* Backup blocks are not used in smgr records */
        Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
 
+       if (InArchiveRecovery)
+               (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
        if (info == XLOG_SMGR_CREATE)
        {
                xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
index 7e065762a84856679e5e39609950ebed18251b44..54786e8a711db4720cc2003b5fda9e80f8fe6e39 100644 (file)
@@ -26,6 +26,7 @@
 
 #include "access/genam.h"
 #include "access/heapam.h"
+#include "access/transam.h"
 #include "access/xact.h"
 #include "access/xlogutils.h"
 #include "catalog/catalog.h"
@@ -52,6 +53,7 @@
 #include "utils/flatfiles.h"
 #include "utils/fmgroids.h"
 #include "utils/guc.h"
+#include "utils/inval.h"
 #include "utils/lsyscache.h"
 #include "utils/pg_locale.h"
 #include "utils/snapmgr.h"
@@ -1954,6 +1956,14 @@ dbase_redo(XLogRecPtr lsn, XLogRecord *record)
                src_path = GetDatabasePath(xlrec->src_db_id, xlrec->src_tablespace_id);
                dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id);
 
+               if (InArchiveRecovery)
+               {
+                       /*
+                        * No conflict resolution is required for a create database record
+                        */
+                       (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+               }
+
                /*
                 * Our theory for replaying a CREATE is to forcibly drop the target
                 * subdirectory if present, then re-copy the source data. This may be
@@ -1987,6 +1997,28 @@ dbase_redo(XLogRecPtr lsn, XLogRecord *record)
 
                dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id);
 
+               if (InArchiveRecovery && 
+                       RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid))
+               {
+                       VirtualTransactionId *database_users;
+
+                       /*
+                        * Find all users connected to this database and ask them
+                        * politely to kill themselves before processing the 
+                        * drop database record, after the usual grace period.
+                        * We don't wait for commit because drop database is
+                        * non-transactional.
+                        */
+                   database_users = GetConflictingVirtualXIDs(InvalidTransactionId, 
+                                                                                                               xlrec->db_id,
+                                                                               InvalidTransactionId);
+
+                       ResolveRecoveryConflictWithVirtualXIDs(database_users,
+                                                                                                       "drop database",
+                                                                                                       FATAL,
+                                                                                                       InvalidXLogRecPtr);
+               }
+
                /* Drop pages for this database that are in the shared buffer cache */
                DropDatabaseBuffers(xlrec->db_id);
 
index 348e6e033f7ada92fd2506720a6fa09d5b10cb7f..9623a6bd77a7125e7a7754b011abaa704ad51681 100644 (file)
@@ -65,7 +65,8 @@ DiscardAll(bool isTopLevel)
        ResetAllOptions();
        DropAllPreparedStatements();
        PortalHashTableDeleteAll();
-       Async_UnlistenAll();
+       if (!IsRecoveryProcessingMode())
+               Async_UnlistenAll();
        LockReleaseAll(USER_LOCKMETHOD, true);
        ResetPlanCache();
        ResetTempTableNamespace();
index 9f7cbc8dbd8cdcd8c5ffe04ba86ce3a0d827952a..2ac9806a0ee2ae85a80ef545ee68dbf55bc46ce1 100644 (file)
@@ -648,7 +648,7 @@ DefineIndex(RangeVar *heapRelation,
         * Also, GetCurrentVirtualXIDs never reports our own vxid, so we need not
         * check for that.
         */
-       old_snapshots = GetCurrentVirtualXIDs(snapshot->xmax, false,
+       old_snapshots = GetCurrentVirtualXIDs(snapshot->xmax, MyDatabaseId,
                                                                                  PROC_IS_AUTOVACUUM | PROC_IN_VACUUM);
 
        while (VirtualTransactionIdIsValid(*old_snapshots))
index e32b1848522e73ba178eb785cf316267e25e2bd2..fe1e5186947f7a57236e157623f48c4066572b46 100644 (file)
@@ -48,6 +48,16 @@ LockTableCommand(LockStmt *lockstmt)
 
                reloid = RangeVarGetRelid(relation, false);
 
+               /*
+                * During recovery we only accept these variations:
+                *
+                * LOCK TABLE foo       -- parser translates as AccessExclusiveLock request
+                * LOCK TABLE foo IN ACCESS SHARE MODE
+                * LOCK TABLE foo IN ACCESS EXCLUSIVE MODE
+                */
+               if (!(lockstmt->mode == AccessShareLock || lockstmt->mode == AccessExclusiveLock))
+                       PreventCommandDuringRecovery();
+  
                if (recurse)
                        children_and_self = find_all_inheritors(reloid);
                else
index 46d76833771d7ff64c2774adadd14081998157ff..134b7fb1398930b6a8ec1c4dfd5bfff65909189c 100644 (file)
@@ -457,6 +457,8 @@ nextval_internal(Oid relid)
                                rescnt = 0;
        bool            logit = false;
 
+       PreventCommandDuringRecovery();
+
        /* open and AccessShareLock sequence */
        init_sequence(relid, &elm, &seqrel);
 
@@ -1342,6 +1344,11 @@ seq_redo(XLogRecPtr lsn, XLogRecord *record)
        /* Backup blocks are not used in seq records */
        Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
 
+       if (InArchiveRecovery)
+               (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
+       RestoreBkpBlocks(lsn, record, false);
+
        if (info != XLOG_SEQ_LOG)
                elog(PANIC, "seq_redo: unknown op code %u", info);
 
index 75f772f0e4ead50062cc22730cfb1fa0f867a3c4..ad5581ab06f34b1578b909c4a9b9e26f50952029 100644 (file)
@@ -51,6 +51,7 @@
 #include "access/heapam.h"
 #include "access/sysattr.h"
 #include "access/xact.h"
+#include "access/transam.h"
 #include "catalog/catalog.h"
 #include "catalog/dependency.h"
 #include "catalog/indexing.h"
 #include "miscadmin.h"
 #include "postmaster/bgwriter.h"
 #include "storage/fd.h"
+#include "storage/procarray.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
 #include "utils/fmgroids.h"
 #include "utils/guc.h"
+#include "utils/inval.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
@@ -1285,6 +1288,15 @@ tblspc_redo(XLogRecPtr lsn, XLogRecord *record)
                char       *location = xlrec->ts_path;
                char       *linkloc;
 
+               if (InArchiveRecovery)
+               {
+                       /*
+                        * No conflict resolution is required for a create tablespace record
+                        */
+                       (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, 
+                                                                                                                       record->xl_xid);
+               }
+
                /*
                 * Attempt to coerce target directory to safe permissions.      If this
                 * fails, it doesn't exist or has the wrong owner.
@@ -1316,12 +1328,71 @@ tblspc_redo(XLogRecPtr lsn, XLogRecord *record)
        else if (info == XLOG_TBLSPC_DROP)
        {
                xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);
+               bool                            process_conflicts = false;
 
+               /*
+                * Process recovery transaction information
+                */
+               if (InArchiveRecovery)
+                       process_conflicts = RecordKnownAssignedTransactionIds(lsn, 
+                                                                                                       record->xl_topxid, 
+                                                                                                       record->xl_xid);
+               /*
+                * If we issued a WAL record for a drop tablespace it is
+                * because there were no files in it at all. That means that
+                * no permanent objects can exist in it at this point.
+                *
+                * It is possible for standby users to be using this tablespace
+                * as a location for their temporary files, so if we fail to
+                * remove all files then do conflict processing and try again,
+                * if currently enabled.
+                */
                if (!remove_tablespace_directories(xlrec->ts_id, true))
-                       ereport(ERROR,
+               {
+                       if (process_conflicts)
+                       {
+                               VirtualTransactionId *temp_file_users;
+
+                               /*
+                                * Standby users may currently be using this tablespace
+                                * for their temporary files. We only care about current
+                                * users because the temp_tablespaces parameter will just
+                                * ignore tablespaces that no longer exist.
+                                * 
+                                * We can work out the pids of currently active backends using
+                                * this tablespace by examining the temp filenames in the 
+                                * directory. We then convert the pids into VirtualXIDs before 
+                                * attempting to cancel them.
+                                *
+                                * We don't wait for commit because drop tablespace is
+                                * non-transactional.
+                                *
+                                * XXXHS: that's the theory, but right now we choose to nuke the
+                                * entire site from orbit, cos it's the only way to be sure,
+                                * after the usual grace period.
+                                */
+                               temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
+                                                                                                                       InvalidOid, 
+                                                                                                                       InvalidOid);
+
+                               ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
+                                                                                                               "drop tablespace",
+                                                                                                               ERROR,
+                                                                                                               InvalidXLogRecPtr);
+                       }
+
+                       /*
+                        * If we did recovery processing then hopefully the
+                        * backends who wrote temp files should have cleaned up and
+                        * exited by now. So let's recheck before we throw an error.
+                        * If !process_conflicts then this will just fail again.
+                        */             
+                       if (!remove_tablespace_directories(xlrec->ts_id, true))
+                               ereport(ERROR,
                                        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                                         errmsg("tablespace %u is not empty",
-                                                       xlrec->ts_id)));
+                                                                       xlrec->ts_id)));
+               }
        }
        else
                elog(PANIC, "tblspc_redo: unknown op code %u", info);
index 783de0246a3b76f2badb7fe778a8fb4cc685fddd..664a135390b2689817b2d23467a7e9d5d12ce530 100644 (file)
@@ -1491,3 +1491,4 @@ DelRoleMems(const char *rolename, Oid roleid,
         */
        heap_close(pg_authmem_rel, NoLock);
 }
+
index 9b46c858f0e585847c7e90cef17d72302e4bbbb4..1599506375bf8ae1e014becdfdd54fb9e3a3f993 100644 (file)
@@ -141,6 +141,7 @@ typedef struct VRelStats
        /* vtlinks array for tuple chain following - sorted by new_tid */
        int                     num_vtlinks;
        VTupleLink      vtlinks;
+       TransactionId   latestRemovedXid;
 } VRelStats;
 
 /*----------------------------------------------------------------------
@@ -224,7 +225,7 @@ static void scan_heap(VRelStats *vacrelstats, Relation onerel,
 static void repair_frag(VRelStats *vacrelstats, Relation onerel,
                        VacPageList vacuum_pages, VacPageList fraged_pages,
                        int nindexes, Relation *Irel);
-static void move_chain_tuple(Relation rel,
+static void move_chain_tuple(VRelStats *vacrelstats, Relation rel,
                                 Buffer old_buf, Page old_page, HeapTuple old_tup,
                                 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
                                 ExecContext ec, ItemPointer ctid, bool cleanVpd);
@@ -237,7 +238,7 @@ static void update_hint_bits(Relation rel, VacPageList fraged_pages,
                                 int num_moved);
 static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
                        VacPageList vacpagelist);
-static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
+static void vacuum_page(VRelStats *vacrelstats, Relation onerel, Buffer buffer, VacPage vacpage);
 static void vacuum_index(VacPageList vacpagelist, Relation indrel,
                         double num_tuples, int keep_tuples);
 static void scan_index(Relation indrel, double num_tuples);
@@ -1271,6 +1272,7 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
        vacrelstats->rel_tuples = 0;
        vacrelstats->rel_indexed_tuples = 0;
        vacrelstats->hasindex = false;
+       vacrelstats->latestRemovedXid = InvalidTransactionId;
 
        /* scan the heap */
        vacuum_pages.num_pages = fraged_pages.num_pages = 0;
@@ -1674,6 +1676,9 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
                        {
                                ItemId          lpp;
 
+                               HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, 
+                                                                                       &vacrelstats->latestRemovedXid);
+
                                /*
                                 * Here we are building a temporary copy of the page with dead
                                 * tuples removed.      Below we will apply
@@ -1987,7 +1992,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                /* there are dead tuples on this page - clean them */
                                Assert(!isempty);
                                LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
-                               vacuum_page(onerel, buf, last_vacuum_page);
+                               vacuum_page(vacrelstats, onerel, buf, last_vacuum_page);
                                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
                        }
                        else
@@ -2476,7 +2481,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                        tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
                                        tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
 
-                                       move_chain_tuple(onerel, Cbuf, Cpage, &tuple,
+                                       move_chain_tuple(vacrelstats, onerel, Cbuf, Cpage, &tuple,
                                                                         dst_buffer, dst_page, destvacpage,
                                                                         &ec, &Ctid, vtmove[ti].cleanVpd);
 
@@ -2562,7 +2567,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                dst_page = BufferGetPage(dst_buffer);
                                /* if this page was not used before - clean it */
                                if (!PageIsEmpty(dst_page) && dst_vacpage->offsets_used == 0)
-                                       vacuum_page(onerel, dst_buffer, dst_vacpage);
+                                       vacuum_page(vacrelstats, onerel, dst_buffer, dst_vacpage);
                        }
                        else
                                LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
@@ -2739,7 +2744,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
                        page = BufferGetPage(buf);
                        if (!PageIsEmpty(page))
-                               vacuum_page(onerel, buf, *curpage);
+                               vacuum_page(vacrelstats, onerel, buf, *curpage);
                        UnlockReleaseBuffer(buf);
                }
        }
@@ -2875,7 +2880,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                recptr = log_heap_clean(onerel, buf,
                                                                                NULL, 0, NULL, 0,
                                                                                unused, uncnt,
-                                                                               false);
+                                                                               vacrelstats->latestRemovedXid, false);
                                PageSetLSN(page, recptr);
                                PageSetTLI(page, ThisTimeLineID);
                        }
@@ -2925,7 +2930,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
  *             already too long and almost unreadable.
  */
 static void
-move_chain_tuple(Relation rel,
+move_chain_tuple(VRelStats *vacrelstats, Relation rel,
                                 Buffer old_buf, Page old_page, HeapTuple old_tup,
                                 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
                                 ExecContext ec, ItemPointer ctid, bool cleanVpd)
@@ -2981,7 +2986,7 @@ move_chain_tuple(Relation rel,
                int                     sv_offsets_used = dst_vacpage->offsets_used;
 
                dst_vacpage->offsets_used = 0;
-               vacuum_page(rel, dst_buf, dst_vacpage);
+               vacuum_page(vacrelstats, rel, dst_buf, dst_vacpage);
                dst_vacpage->offsets_used = sv_offsets_used;
        }
 
@@ -3305,7 +3310,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
                        buf = ReadBufferExtended(onerel, MAIN_FORKNUM, (*vacpage)->blkno,
                                                                         RBM_NORMAL, vac_strategy);
                        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
-                       vacuum_page(onerel, buf, *vacpage);
+                       vacuum_page(vacrelstats, onerel, buf, *vacpage);
                        UnlockReleaseBuffer(buf);
                }
        }
@@ -3335,7 +3340,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
  * Caller must hold pin and lock on buffer.
  */
 static void
-vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
+vacuum_page(VRelStats *vacrelstats, Relation onerel, Buffer buffer, VacPage vacpage)
 {
        Page            page = BufferGetPage(buffer);
        int                     i;
@@ -3364,7 +3369,7 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
                recptr = log_heap_clean(onerel, buffer,
                                                                NULL, 0, NULL, 0,
                                                                vacpage->offsets, vacpage->offsets_free,
-                                                               false);
+                                                               vacrelstats->latestRemovedXid, false);
                PageSetLSN(page, recptr);
                PageSetTLI(page, ThisTimeLineID);
        }
index 59c02e20835ee6a511ebd1c028ce441bbcb5683e..e2bedf3bd1416fc6eadd8da833cb46f4955e2889 100644 (file)
@@ -97,6 +97,7 @@ typedef struct LVRelStats
        ItemPointer dead_tuples;        /* array of ItemPointerData */
        int                     num_index_scans;
        bool            scanned_all;    /* have we scanned all pages (this far)? */
+       TransactionId latestRemovedXid;
 } LVRelStats;
 
 
@@ -246,6 +247,36 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
                *scanned_all = vacrelstats->scanned_all;
 }
 
+/*
+ *     vacuum_log_cleanup_info() -- log latestRemovedXid before index cleanup
+ *
+ * For Hot Standby we need to know the highest transaction id that will be
+ * removed by any change. VACUUM works in three phases. Phase one runs
+ * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
+ * progresses, each carrying its own latestRemovedXid; in many cases this
+ * removes all of the tuples to be removed. Phase one also records the
+ * remaining dead tuples, but they are not removed from the heap until
+ * phase three. Their index entries, however, are removed in phase two,
+ * and index blocks carry no MVCC information. So before removing *any*
+ * index tuples we issue a WAL record stating what latestRemovedXid will
+ * be at the end of phase three. This lets Hot Standby queries block at
+ * the correct place, i.e. before phase two, rather than during phase
+ * three as more XLOG_HEAP2_CLEAN records arrive. If memory constraints
+ * force several rounds of phases two and three, one record is logged
+ * before each round.
+ */
+static void
+vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
+{
+       /*
+        * Temp tables need no logging: they do not contain data that is
+        * visible on the standby server.
+        */
+       if (rel->rd_istemp)
+               return;
+
+       (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
+}
 
 /*
  *     lazy_scan_heap() -- scan an open heap relation
@@ -296,6 +327,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
        nblocks = RelationGetNumberOfBlocks(onerel);
        vacrelstats->rel_pages = nblocks;
        vacrelstats->nonempty_pages = 0;
+       vacrelstats->latestRemovedXid = InvalidTransactionId;
 
        lazy_space_alloc(vacrelstats, nblocks);
 
@@ -354,6 +386,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
                        vacrelstats->num_dead_tuples > 0)
                {
+                       /* Log cleanup info before we touch indexes */
+                       vacuum_log_cleanup_info(onerel, vacrelstats);
+
                        /* Remove index entries */
                        for (i = 0; i < nindexes; i++)
                                lazy_vacuum_index(Irel[i],
@@ -593,6 +628,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                        if (tupgone)
                        {
                                lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
+                               HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, 
+                                                                                               &vacrelstats->latestRemovedXid);
                                tups_vacuumed += 1;
                        }
                        else
@@ -703,6 +740,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
        /* XXX put a threshold on min number of tuples here? */
        if (vacrelstats->num_dead_tuples > 0)
        {
+               /* Log cleanup info before we touch indexes */
+               vacuum_log_cleanup_info(onerel, vacrelstats);
+
                /* Remove index entries */
                for (i = 0; i < nindexes; i++)
                        lazy_vacuum_index(Irel[i],
@@ -847,7 +887,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
                recptr = log_heap_clean(onerel, buffer,
                                                                NULL, 0, NULL, 0,
                                                                unused, uncnt,
-                                                               false);
+                                                               vacrelstats->latestRemovedXid, false);
                PageSetLSN(page, recptr);
                PageSetTLI(page, ThisTimeLineID);
        }
index 3163fd3c1bc7e4c39edbaea7db1f915987d2526b..fb479ffe187802bc491846e8190f73768842a6e6 100644 (file)
@@ -210,6 +210,12 @@ BackgroundWriterMain(void)
        BgWriterShmem->bgwriter_pid = MyProcPid;
        am_bg_writer = true;
 
+       BgWriterRecoveryMode = IsRecoveryProcessingMode();
+
+       if (BgWriterRecoveryMode)
+               elog(DEBUG1, "bgwriter starting during recovery, pid = %u", 
+                       BgWriterShmem->bgwriter_pid);
+
        /*
         * If possible, make this process a group leader, so that the postmaster
         * can signal any child processes too.  (bgwriter probably never has any
@@ -364,12 +370,6 @@ BackgroundWriterMain(void)
         */
        PG_SETMASK(&UnBlockSig);
 
-       BgWriterRecoveryMode = IsRecoveryProcessingMode();
-
-       if (BgWriterRecoveryMode)
-               elog(DEBUG1, "bgwriter starting during recovery, pid = %u", 
-                       BgWriterShmem->bgwriter_pid);
-
        /*
         * Loop forever
         */
@@ -382,101 +382,101 @@ BackgroundWriterMain(void)
                if (!PostmasterIsAlive(true))
                        exit(1);
 
-               /*
-                * Process any requests or signals received recently.
-                */
-               AbsorbFsyncRequests();
-
                if (got_SIGHUP)
                {
                        got_SIGHUP = false;
                        ProcessConfigFile(PGC_SIGHUP);
                }
 
-               if (BgWriterRecoveryMode)
-               {
-                       if (shutdown_requested)
-                       {
-                               /*
-                                * From here on, elog(ERROR) should end with exit(1), not send
-                                * control back to the sigsetjmp block above
-                                */
-                               ExitOnAnyError = true;
-                               /* Normal exit from the bgwriter is here */
-                               proc_exit(0);           /* done */
-                       }
-                       if (!IsRecoveryProcessingMode())
-                       {
-                               elog(DEBUG2, "bgwriter changing from recovery to normal mode");
-                               InitXLOGAccess();
-                               BgWriterRecoveryMode = false;
-                               /*
-                                * Start time-driven events from now
-                                */
-                               last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
-                               /* 
-                                * Notice that we do *not* act on a checkpoint_requested
-                                * state at this point. We have changed mode, so we wish to
-                                * perform a checkpoint not a restartpoint.
-                                */
-                               continue;
-                       }
-                       if (checkpoint_requested)
-                       {
-                               XLogRecPtr              ReadPtr;
-                               CheckPoint              restartPoint;
-                               checkpoint_requested = false;
-                               /*
-                                * Initialize bgwriter-private variables used during checkpoint.
-                                */
-                               ckpt_active = true;
-                               ckpt_start_time = (pg_time_t) time(NULL);
-                               ckpt_cached_elapsed = 0;
-                               /*
-                                * Get the requested values from shared memory that the 
-                                * Startup process has put there for us.
-                                */
-                               SpinLockAcquire(&BgWriterShmem->ckpt_lck);
-                               ReadPtr = BgWriterShmem->ReadPtr;
-                               memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint));
-                               SpinLockRelease(&BgWriterShmem->ckpt_lck);
-                               /* Use smoothed writes, until interrupted if ever */
-                               CreateRestartPoint(ReadPtr, &restartPoint, 0);
-                               /*
-                                * After any checkpoint, close all smgr files.  This is so we
-                                * won't hang onto smgr references to deleted files indefinitely.
-                                */
-                               smgrcloseall();
-                               ckpt_active = false;
-                               checkpoint_requested = false;
-                       }
-                       else
-                       {
-                               /* Clean buffers dirtied by recovery */
-                               BgBufferSync();
-                               /* Nap for the configured time. */
-                               BgWriterNap();
-                       }
-               }
+               if (BgWriterRecoveryMode)
+               {
+                       if (shutdown_requested)
+                       {
+                               /*
+                                * From here on, elog(ERROR) should end with exit(1), not send
+                                * control back to the sigsetjmp block above
+                                */
+                               ExitOnAnyError = true;
+                               /* Normal exit from the bgwriter is here */
+                               proc_exit(0);           /* done */
+                       }
+
+                       if (!IsRecoveryProcessingMode())
+                       {
+                               elog(DEBUG2, "bgwriter changing from recovery to normal mode");
+         
+                               InitXLOGAccess();
+                               BgWriterRecoveryMode = false;
+
+                               /*
+                                * Start time-driven events from now
+                                */
+                               last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
+
+                               /* 
+                                * Notice that we do *not* act on a checkpoint_requested
+                                * state at this point. We have changed mode, so we wish to
+                                * perform a checkpoint not a restartpoint.
+                                */
+                               continue;
+                       }
+
+                       if (checkpoint_requested)
+                       {
+                               XLogRecPtr              ReadPtr;
+                               CheckPoint              restartPoint;
+
+                               checkpoint_requested = false;
+
+                               /*
+                                * Initialize bgwriter-private variables used during checkpoint.
+                                */
+                               ckpt_active = true;
+                               ckpt_start_time = (pg_time_t) time(NULL);
+                               ckpt_cached_elapsed = 0;
+
+                               /*
+                                * Get the requested values from shared memory that the 
+                                * Startup process has put there for us.
+                                */
+                               SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+                               ReadPtr = BgWriterShmem->ReadPtr;
+                               memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint));
+                               SpinLockRelease(&BgWriterShmem->ckpt_lck);
+
+                               /* Use smoothed writes, until interrupted if ever */
+                               CreateRestartPoint(ReadPtr, &restartPoint, 0);
+
+                               /*
+                                * After any checkpoint, close all smgr files.  This is so we
+                                * won't hang onto smgr references to deleted files indefinitely.
+                                */
+                               smgrcloseall();
+
+                               ckpt_active = false;
+                               checkpoint_requested = false;
+                       }
+                       else
+                       {
+                               /* Clean buffers dirtied by recovery */
+                               BgBufferSync();
+
+                               /* Nap for the configured time. */
+                               BgWriterNap();
+                       }
+               }
                else    /* Normal processing */
-               {
+               {
                        bool            do_checkpoint = false;
                        int                     flags = 0;
                        pg_time_t       now;
                        int                     elapsed_secs;
 
+                       /*
+                        * Process any requests or signals received recently.
+                        */
+                       AbsorbFsyncRequests();
+
                        if (checkpoint_requested)
                        {
                                checkpoint_requested = false;
@@ -1122,14 +1122,6 @@ RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bo
        if (BgWriterShmem->bgwriter_pid == 0)
                elog(LOG, "could not request restartpoint because bgwriter not running");
 
-#ifdef NOT_USED
-       elog(LOG, "tli = %u nextXidEpoch = %u nextXid = %u nextOid = %u",
-               restartPoint->ThisTimeLineID,
-               restartPoint->nextXidEpoch,
-               restartPoint->nextXid,
-               restartPoint->nextOid);
-#endif
-
        SpinLockAcquire(&BgWriterShmem->ckpt_lck);
        BgWriterShmem->ReadPtr = ReadPtr;
        memcpy(&BgWriterShmem->restartPoint, restartPoint, sizeof(CheckPoint));
@@ -1164,12 +1156,22 @@ GetRedoLocationForArchiveCheckpoint(void)
        return redo;
 }
 
-void
+/*
+ * Store the redo location in shared memory for a checkpoint at the end of
+ * recovery.  The value is stored unconditionally; we then return false if
+ * no bgwriter pid is registered or IsPostmasterEnvironment is not set.
+ */
+bool
 SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo)
 {
        SpinLockAcquire(&BgWriterShmem->ckpt_lck);
        BgWriterShmem->ReadPtr = redo;
        SpinLockRelease(&BgWriterShmem->ckpt_lck);
+
+       if (BgWriterShmem->bgwriter_pid == 0 || !IsPostmasterEnvironment)
+               return false;
+
+       return true;
 }
 
 /*
index 5cb84be4b80a1097fca1f7d0377c013759f5c7fc..9c026313c3eb33b52933135d1e6ed5c35e322275 100644 (file)
@@ -230,8 +230,10 @@ static bool FatalError = false; /* T if recovering from backend crash */
  * We use a simple state machine to control startup, shutdown, and
  * crash recovery (which is rather like shutdown followed by startup).
  *
- * Normal child backends can only be launched when we are in PM_RUN state.
- * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
+ * Normal child backends can only be launched when we are in PM_RUN or
+ * PM_RECOVERY state. Any transaction started in PM_RECOVERY state will
+ * be read-only for the whole of its life.  (We also allow launch of normal
+ * child backends in PM_WAIT_BACKUP state, but only for superusers.)
  * In other states we handle connection requests by launching "dead_end"
  * child processes, which will simply send the client an error message and
  * quit.  (We track these in the BackendList so that we can know when they
@@ -1656,11 +1658,6 @@ retry1:
                                        (errcode(ERRCODE_CANNOT_CONNECT_NOW),
                                         errmsg("the database system is shutting down")));
                        break;
-               case CAC_RECOVERY:
-                       ereport(FATAL,
-                                       (errcode(ERRCODE_CANNOT_CONNECT_NOW),
-                                        errmsg("the database system is in recovery mode")));
-                       break;
                case CAC_TOOMANY:
                        ereport(FATAL,
                                        (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
@@ -1669,6 +1666,7 @@ retry1:
                case CAC_WAITBACKUP:
                        /* OK for now, will check in InitPostgres */
                        break;
+               case CAC_RECOVERY:
                case CAC_OK:
                        break;
        }
@@ -1987,10 +1985,11 @@ pmdie(SIGNAL_ARGS)
                        ereport(LOG,
                                        (errmsg("received smart shutdown request")));
 
-                       if (pmState == PM_RUN)
+                       if (pmState == PM_RUN || pmState == PM_RECOVERY)
                        {
                                /* autovacuum workers are told to shut down immediately */
-                               SignalAutovacWorkers(SIGTERM);
+                               if (pmState == PM_RUN)
+                                       SignalAutovacWorkers(SIGTERM);
                                /* and the autovac launcher too */
                                if (AutoVacPID != 0)
                                        signal_child(AutoVacPID, SIGTERM);
@@ -2024,7 +2023,7 @@ pmdie(SIGNAL_ARGS)
 
                        if (StartupPID != 0)
                                signal_child(StartupPID, SIGTERM);
-                       if (pmState == PM_RUN || pmState == PM_WAIT_BACKUP)
+                       if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_WAIT_BACKUP)
                        {
                                ereport(LOG,
                                                (errmsg("aborting any active transactions")));
@@ -2120,8 +2119,11 @@ reaper(SIGNAL_ARGS)
                 */
                if (pid == StartupPID)
                {
+                       bool    leavingRecovery = (pmState == PM_RECOVERY);
+
                        StartupPID = 0;
-                       Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY);
+                       Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY ||
+                                  pmState == PM_WAIT_BACKUP || pmState == PM_WAIT_BACKENDS);
 
                        /* FATAL exit of startup is treated as catastrophic */
                        if (!EXIT_STATUS_0(exitstatus))
@@ -2129,7 +2131,7 @@ reaper(SIGNAL_ARGS)
                                LogChildExit(LOG, _("startup process"),
                                                         pid, exitstatus);
                                ereport(LOG,
-                               (errmsg("aborting startup due to startup process failure")));
+                                               (errmsg("aborting startup due to startup process failure")));
                                ExitPostmaster(1);
                        }
 
@@ -2166,7 +2168,7 @@ reaper(SIGNAL_ARGS)
                         * already running.
                         */
                        if (BgWriterPID == 0)
-                               BgWriterPID = StartBackgroundWriter();
+                       BgWriterPID = StartBackgroundWriter();
 
                        /*
                         * Likewise, start other special children as needed.  In a restart
@@ -2182,8 +2184,12 @@ reaper(SIGNAL_ARGS)
                                PgStatPID = pgstat_start();
 
                        /* at this point we are really open for business */
-                       ereport(LOG,
-                                (errmsg("database system is ready to accept connections")));
+                       if (leavingRecovery)
+                               ereport(LOG,
+                                        (errmsg("database can now be accessed with read and write transactions")));
+                       else
+                               ereport(LOG,
+                                        (errmsg("database system is ready to accept connections")));
 
                        continue;
                }
@@ -2903,7 +2909,8 @@ BackendStartup(Port *port)
        bn->pid = pid;
        bn->cancel_key = MyCancelKey;
        bn->is_autovacuum = false;
-       bn->dead_end = (port->canAcceptConnections != CAC_OK &&
+       bn->dead_end = (!(port->canAcceptConnections == CAC_RECOVERY || 
+                                         port->canAcceptConnections == CAC_OK) &&
                                        port->canAcceptConnections != CAC_WAITBACKUP);
        DLAddHead(BackendList, DLNewElem(bn));
 #ifdef EXEC_BACKEND
@@ -3854,8 +3861,6 @@ sigusr1_handler(SIGNAL_ARGS)
 
        if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START))
        {
-               Assert(pmState == PM_STARTUP);
-
                /*
                 * Go to shutdown mode if a shutdown request was pending.
                 */
@@ -3864,11 +3869,12 @@ sigusr1_handler(SIGNAL_ARGS)
                        pmState = PM_WAIT_BACKENDS;
                        /* PostmasterStateMachine logic does the rest */
                }
-               else
+               else if (pmState == PM_STARTUP)
                {
                        /*
                         * Startup process has entered recovery
                         */
+                       
                        pmState = PM_RECOVERY;
 
                        /*
@@ -3891,9 +3897,11 @@ sigusr1_handler(SIGNAL_ARGS)
                        Assert(PgStatPID == 0);
                        PgStatPID = pgstat_start();
 
-                       /* XXX at this point we could accept read-only connections */
-                       ereport(DEBUG1,
-                                (errmsg("database system is in consistent recovery mode")));
+                       /* We can now accept read-only connections */
+                       ereport(LOG,
+                                (errmsg("database system is ready to accept connections")));
+                       ereport(LOG,
+                                (errmsg("database can now be accessed with read only transactions")));
                }
        }
 
index a7b81e37a710e65fba277ac465752c126d69886c..7fb1621c88ee5301e875ec3604cbbd7fa1f5ce1c 100644 (file)
@@ -270,10 +270,11 @@ We might miss a hint-bit update or two but that isn't a problem, for the same
 reasons mentioned under buffer access rules.
 
 As of 8.4, background writer starts during recovery mode when there is
-some form of potentially extended recovery to perform. It performs an
-identical service to normal processing, except that checkpoints it
-writes are technically restartpoints. Flushing outstanding WAL for dirty
-buffers is also skipped, though there shouldn't ever be new WAL entries
-at that time in any case. We could choose to start background writer
-immediately but we hold off until we can prove the database is in a 
-consistent state so that postmaster has a single, clean state change.
+some form of potentially extended recovery to perform. It cleans dirty
+blocks and performs restartpoints when requested by the startup
+process. Most other bgwriter functions are skipped, such as flushing
+outstanding WAL for dirty buffers, since no new WAL has been written.
+We could choose to start background writer immediately but we wait until we 
+can prove the database is in a consistent state. This allows the postmaster 
+to have a single, clean state change between the initial stages of recovery 
+and the main recovery mode. 
index bd053d503de04f98d202fb2e386529be251bc9fc..4108e2578ba5ce2d50d8dc1f64faa9d3429dd2c0 100644 (file)
@@ -33,6 +33,7 @@
 #include <sys/file.h>
 #include <unistd.h>
 
+#include "access/xlogdefs.h"
 #include "catalog/catalog.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
@@ -78,7 +79,13 @@ static bool IsForInput;
 
 /* local state for LockBufferForCleanup */
 static volatile BufferDesc *PinCountWaitBuf = NULL;
+static long            CleanupWaitSecs = 0;    /* accumulated cleanup wait: seconds part */
+static int             CleanupWaitUSecs = 0;   /* accumulated cleanup wait: microseconds part */
+static bool            CleanupWaitStats = false;       /* is wait timing switched on? */
 
+/* local state for recovery conflict processing */
+static bool                    BufferRecoveryConflictPending = false;  /* is a conflict LSN set? */
+static XLogRecPtr      BufferRecoveryConflictLSN;      /* compared with page LSNs while pending */
 
 static Buffer ReadBuffer_common(SMgrRelation reln, bool isLocalBuf,
                                        ForkNumber forkNum, BlockNumber blockNum,
@@ -100,7 +107,8 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
                        bool *foundPtr);
 static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
 static void AtProcExit_Buffers(int code, Datum arg);
-
+static void BufferProcessRecoveryConflictsIfAny(volatile  BufferDesc *bufHdr);
+       
 
 /*
  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -306,6 +314,8 @@ ReadBuffer_common(SMgrRelation smgr, bool isLocalBuf, ForkNumber forkNum,
                        /* Just need to update stats before we exit */
                        *hit = true;
 
+                       BufferProcessRecoveryConflictsIfAny(bufHdr);
+
                        if (VacuumCostActive)
                                VacuumCostBalance += VacuumCostPageHit;
 
@@ -419,6 +429,8 @@ ReadBuffer_common(SMgrRelation smgr, bool isLocalBuf, ForkNumber forkNum,
                                                                        blockNum,
                                                                        relpath(smgr->smgr_rnode, forkNum))));
                        }
+
+                       BufferProcessRecoveryConflictsIfAny(bufHdr);
                }
        }
 
@@ -1580,6 +1592,38 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
        return result | BUF_WRITTEN;
 }
 
+static void            /* called from ReadBuffer_common; may cancel the statement */
+BufferProcessRecoveryConflictsIfAny(volatile BufferDesc *bufHdr)
+{
+       if (!BufferRecoveryConflictPending)
+               return;         /* no conflict LSN has been set */
+       else
+       {
+               XLogRecPtr      bufLSN = BufferGetLSN(bufHdr);  /* LSN taken from this buffer */
+
+               if (XLByteLE(bufLSN, BufferRecoveryConflictLSN))        /* buffer LSN <= conflict LSN */
+                       ereport(ERROR,  /* NOTE(review): text says "recent" changes; confirm test direction */
+                               (errcode(ERRCODE_QUERY_CANCELED),
+                                errmsg("canceling statement due to recent buffer changes during recovery")));
+       }
+}
+
+bool                   /* result: is a conflict LSN now pending? */
+SetBufferRecoveryConflictLSN(XLogRecPtr conflict_LSN)
+{
+       if (XLogRecPtrIsValid(conflict_LSN))
+       {
+               BufferRecoveryConflictPending = true;   /* arm the check made on buffer reads */
+               BufferRecoveryConflictLSN = conflict_LSN;
+       }
+       else
+       {
+               BufferRecoveryConflictPending = false;  /* invalid LSN: disarm the check */
+               BufferRecoveryConflictLSN = InvalidXLogRecPtr;
+       }
+
+       return BufferRecoveryConflictPending;
+}
 
 /*
  * Return a palloc'd string containing buffer usage statistics.
@@ -2364,6 +2408,53 @@ ConditionalLockBuffer(Buffer buffer)
        return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
 }
 
+/*
+ * On standby servers only the Startup process applies Cleanup. As a result
+ * a single buffer pin can be enough to effectively halt recovery for short
+ * periods. We need special instrumentation to monitor this so we can judge
+ * whether additional measures are required to control the negative effects.
+ */
+void
+StartCleanupDelayStats(void)
+{
+       CleanupWaitSecs = 0;            /* reset the accumulated wait */
+       CleanupWaitUSecs = 0;
+       CleanupWaitStats = true;        /* LockBufferForCleanup now times its waits */
+}
+
+void
+EndCleanupDelayStats(void)
+{
+       CleanupWaitStats = false;       /* stop timing; accumulated totals are left intact */
+}
+
+/*
+ * Log total cleanup wait. Called by Startup process whenever we request restartpoint
+ */
+void
+ReportCleanupDelayStats(void)
+{
+       elog(trace_recovery(DEBUG2), "cleanup wait total=%ld.%03d s",
+                               CleanupWaitSecs, CleanupWaitUSecs / 1000);      /* usecs shown as milliseconds */
+}
+
+static void            /* add one wait interval to the running totals */
+CleanupDelayStats(TimestampTz start_ts, TimestampTz end_ts)
+{
+       long                    wait_secs;
+       int                             wait_usecs;
+
+       TimestampDifference(start_ts, end_ts, &wait_secs, &wait_usecs);
+
+       CleanupWaitSecs +=wait_secs;
+       CleanupWaitUSecs +=wait_usecs;
+       if (CleanupWaitUSecs > 999999)  /* carry one second out of the microsecond total */
+       {
+               CleanupWaitSecs += 1;
+               CleanupWaitUSecs -= 1000000;
+       }
+}
+
 /*
  * LockBufferForCleanup - lock a buffer in preparation for deleting items
  *
@@ -2407,6 +2498,8 @@ LockBufferForCleanup(Buffer buffer)
 
        for (;;)
        {
+               TimestampTz     start_ts = 0;
+
                /* Try to acquire lock */
                LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
                LockBufHdr(bufHdr);
@@ -2429,9 +2522,14 @@ LockBufferForCleanup(Buffer buffer)
                PinCountWaitBuf = bufHdr;
                UnlockBufHdr(bufHdr);
                LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+               if (CleanupWaitStats)
+                       start_ts = GetCurrentTimestamp();
                /* Wait to be signaled by UnpinBuffer() */
                ProcWaitForSignal();
                PinCountWaitBuf = NULL;
+               if (CleanupWaitStats)
+                       CleanupDelayStats(start_ts, GetCurrentTimestamp());
+               
                /* Loop back and try again */
        }
 }
index 06f8ad8f4a1a519b70e4138fae8264af62466810..92150c10b16baf03752a6c831c63c0699de88918 100644 (file)
  * as are the myProcLocks lists.  They can be distinguished from regular
  * backend PGPROCs at need by checking for pid == 0.
  *
+ * The process array now also includes PGPROC structures representing
+ * transactions being recovered. The xid and subxids fields of these are valid,
+ * though few other fields are.  They can be distinguished from regular backend
+ * PGPROCs by checking for pid == 0.  The proc array also has a
+ * secondary array of UnobservedXids representing transactions that are
+ * known to be running on the master but for which we do not yet have
+ * a recovery proc. We infer the existence of UnobservedXids by watching 
+ * the sequence of arriving xids. This is very important because if we leave 
+ * those xids out of the snapshot then they will appear to be already complete. 
+ * Later, when they have actually completed this could lead to confusion as to 
+ * whether those xids are visible or not, blowing a huge hole in MVCC. 
+ * We need 'em.
+ * 
+ * Although we have max_connections procs during recovery, they will only
+ * be used when the master is running a write transaction. Read only
+ * transactions never show up in WAL at all and it is valid to ignore them.
+ * So we would only ever use all max_connections procs if we were running
+ * a write transaction on every session at once. As a result, we may be
+ * able to continue running normally even if max_connections is set lower
+ * on the standby than on the master.
+ *
+ * It is theoretically possible for a FATAL error to explode before writing
+ * an abort record. This would then tie up a recovery proc until the next
+ * WAL record containing a valid list of running xids arrives. This is
+ * relatively unlikely, so considered both a minor and an acceptable flaw
+ * in the emulation of transactions during recovery.
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
 
 #include "access/subtrans.h"
 #include "access/transam.h"
-#include "access/xact.h"
+#include "access/xlog.h"
 #include "access/twophase.h"
 #include "miscadmin.h"
+#include "storage/proc.h"
 #include "storage/procarray.h"
 #include "utils/snapmgr.h"
 
+static RunningXactsData        CurrentRunningXactsData;
+
+void ProcArrayDisplay(int trace_level);
+
 
 /* Our shared memory area */
 typedef struct ProcArrayStruct
 {
        int                     numProcs;               /* number of valid procs entries */
-       int                     maxProcs;               /* allocated size of procs array */
+       int                     maxProcs;                       /* allocated size of procs array, including recovery procs */
+
+       int                     numUnobservedXids;      /* number of valid unobserved xids */
+       int                     maxUnobservedXids;      /* allocated size of unobserved array */
+
+       bool            allowStandbySnapshots;  /* can queries take snapshots? */
 
        /*
         * We declare procs[] as 1 entry because C wants a fixed-size array, but
         * actually it is maxProcs entries long.
         */
        PGPROC     *procs[1];           /* VARIABLE LENGTH ARRAY */
+
+       /* UnobservedXids array (maxUnobservedXids TransactionIds) follows procs[] */
 } ProcArrayStruct;
 
 static ProcArrayStruct *procArray;
@@ -100,8 +138,18 @@ ProcArrayShmemSize(void)
        Size            size;
 
        size = offsetof(ProcArrayStruct, procs);
-       size = add_size(size, mul_size(sizeof(PGPROC *),
-                                                                add_size(MaxBackends, max_prepared_xacts)));
+
+       /* Normal processing */
+       /* MyProc slots */
+       size = add_size(size, mul_size(sizeof(PGPROC *), MaxBackends));
+       size = add_size(size, mul_size(sizeof(PGPROC *), max_prepared_xacts));
+
+       /* Recovery processing */
+
+       /* Recovery Procs */
+       size = add_size(size, mul_size(sizeof(PGPROC *), MaxBackends));
+       /* UnobservedXids */
+       size = add_size(size, mul_size(sizeof(TransactionId), 2 * MaxBackends));
 
        return size;
 }
@@ -123,8 +171,27 @@ CreateSharedProcArray(void)
                /*
                 * We're the first - initialize.
                 */
+               /* Normal processing */
                procArray->numProcs = 0;
                procArray->maxProcs = MaxBackends + max_prepared_xacts;
+
+               /* Recovery processing */
+               procArray->maxProcs += MaxBackends;
+
+               procArray->allowStandbySnapshots = false;
+
+               /*
+                * The max number of UnobservedXids is theoretically unbounded
+                * because of a very slim chance of FATAL errors that fail to
+                * write abort records. However, in normal running each
+                * session will have at most 2 xids assigned without having 
+                * written a WAL record, so we set a reasonable limit accordingly.
+                * UnobservedXids typically has length 0 or 1, though can be
+                * longer if there is high contention for data blocks.
+                * If you change this, also change ProcArrayShmemSize()
+                */
+               procArray->maxUnobservedXids = 2 * MaxBackends;
+               procArray->numUnobservedXids = 0;
        }
 }
 
@@ -132,11 +199,12 @@ CreateSharedProcArray(void)
  * Add the specified PGPROC to the shared array.
  */
 void
-ProcArrayAdd(PGPROC *proc)
+ProcArrayAdd(PGPROC *proc, bool need_lock)
 {
        ProcArrayStruct *arrayP = procArray;
 
-       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+       if (need_lock)
+               LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
 
        if (arrayP->numProcs >= arrayP->maxProcs)
        {
@@ -154,13 +222,15 @@ ProcArrayAdd(PGPROC *proc)
        arrayP->procs[arrayP->numProcs] = proc;
        arrayP->numProcs++;
 
-       LWLockRelease(ProcArrayLock);
+       if (need_lock)
+               LWLockRelease(ProcArrayLock);
 }
 
 /*
  * Remove the specified PGPROC from the shared array.
  *
- * When latestXid is a valid XID, we are removing a live 2PC gxact from the
+ * When latestXid is a valid XID, we are removing either an emulated
+ * transaction during recovery or a live 2PC gxact from the
  * array, and thus causing it to appear as "not running" anymore.  In this
  * case we must advance latestCompletedXid.  (This is essentially the same
  * as ProcArrayEndTransaction followed by removal of the PGPROC, but we take
@@ -168,7 +238,8 @@ ProcArrayAdd(PGPROC *proc)
  * twophase.c depends on the latter.)
  */
 void
-ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
+ProcArrayRemove(PGPROC *proc, TransactionId latestXid,
+                               int nsubxids, TransactionId *subxids)
 {
        ProcArrayStruct *arrayP = procArray;
        int                     index;
@@ -181,6 +252,15 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
 
        LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
 
+       /*
+        * Remove any UnobservedXids remaining
+        */
+       if (IsRecoveryProcessingMode())
+       {
+               for (index = 0; index < nsubxids; index++)
+                       UnobservedTransactionsRemoveXid(subxids[index], false);
+       }
+
        if (TransactionIdIsValid(latestXid))
        {
                Assert(TransactionIdIsValid(proc->xid));
@@ -193,7 +273,7 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
        else
        {
                /* Shouldn't be trying to remove a live transaction here */
-               Assert(!TransactionIdIsValid(proc->xid));
+               Assert(IsRecoveryProcessingMode() || !TransactionIdIsValid(proc->xid));
        }
 
        for (index = 0; index < arrayP->numProcs; index++)
@@ -213,6 +293,15 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
        elog(LOG, "failed to find proc %p in ProcArray", proc);
 }
 
+/*
+ * Initialisation when we switch into PM_RECOVERY mode; the only step taken
+ * here is PublishStartupProcessInformation(). Expected caller is InitRecoveryTransactionEnvironment()
+ */
+void
+ProcArrayInitRecoveryEnvironment(void)
+{
+       PublishStartupProcessInformation();
+}
 
 /*
  * ProcArrayEndTransaction -- mark a transaction as no longer running
@@ -301,6 +390,7 @@ ProcArrayClearTransaction(PGPROC *proc)
        proc->xid = InvalidTransactionId;
        proc->lxid = InvalidLocalTransactionId;
        proc->xmin = InvalidTransactionId;
+       proc->lsn = InvalidXLogRecPtr;
 
        /* redundant, but just in case */
        proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
@@ -311,6 +401,309 @@ ProcArrayClearTransaction(PGPROC *proc)
        proc->subxids.overflowed = false;
 }
 
+/*
+ * ProcArrayClearRecoveryTransactions -- forget all recovery procs and xids
+ *
+ * Called during recovery on a Shutdown checkpoint or EndRecovery record, or
+ * at the end of recovery. Takes ProcArrayLock itself: don't call with it held.
+ */
+void
+ProcArrayClearRecoveryTransactions(void)
+{
+       ProcArrayStruct *arrayP = procArray;
+       int                     index;
+
+       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+       /* Reset recovery procs, which is any proc that has a valid xid and pid 0.
+        * Walk backwards: the entry swapped into a freed slot was already checked.
+        */
+       for (index = arrayP->numProcs - 1; index >= 0; index--)
+       {
+               volatile PGPROC *proc = arrayP->procs[index];
+
+               if (TransactionIdIsValid(proc->xid) && proc->pid == 0)
+               {
+                       arrayP->procs[index] = arrayP->procs[arrayP->numProcs - 1];
+                       arrayP->numProcs--;
+               }
+       }
+
+       /*
+        * Clear the UnobservedXids also
+        */
+       UnobservedTransactionsClearXids();
+
+       LWLockRelease(ProcArrayLock);
+}
+
+/* debug support functions for recovery processing */
+bool
+XidInRecoveryProcs(TransactionId xid)
+{
+       ProcArrayStruct *arrayP = procArray;
+       bool            found = false;
+       int                     i;
+
+       /*
+        * An invalid xid never matches. Otherwise scan the proc array entries
+        * for that xid; note that ProcArrayLock is not taken here.
+        */
+       if (TransactionIdIsValid(xid))
+       {
+               for (i = 0; i < arrayP->numProcs && !found; i++)
+                       found = (arrayP->procs[i]->xid == xid);
+       }
+       return found;
+}
+
+void
+ProcArrayDisplay(int trace_level)
+{
+       ProcArrayStruct *arrayP = procArray;
+       int                     index;
+
+       LWLockAcquire(ProcArrayLock, LW_SHARED);
+       /* Debug aid: log each proc holding an xid; xids are unsigned, so use %u */
+       for (index = 0; index < arrayP->numProcs; index++)
+       {
+               PGPROC  *RecoveryProc = arrayP->procs[index];
+
+               if (TransactionIdIsValid(RecoveryProc->xid))
+                       elog(trace_level,
+                                       "proc %d proc->xid %u proc->lsn %X/%X", index, RecoveryProc->xid,
+                                                               RecoveryProc->lsn.xlogid, RecoveryProc->lsn.xrecoff);
+       }
+
+       UnobservedTransactionsDisplay(trace_level);
+
+       LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * ProcArrayUpdateRecoveryTransactions -- initialise the proc array in recovery
+ *
+ * Use the data about running transactions on master to either create the
+ * initial state of the recovery procs, or maintain correctness of their
+ * state. In a sense this is almost the opposite of GetSnapshotData(),
+ * since we are updating the proc array based upon the snapshot. We do this
+ * as a cross-check that the proc array is correctly maintained, because
+ * we know it is possible that some transactions with FATAL errors do not
+ * write abort records and also to create the initial state of the procarray.
+ *
+ * Only used during recovery. Notice the signature is very similar to a
+ * _redo function: lsn is presumably the location of the record holding xlrec.
+ */
+void
+ProcArrayUpdateRecoveryTransactions(XLogRecPtr lsn, xl_xact_running_xacts *xlrec)
+{
+       ProcArrayStruct *arrayP = procArray;
+       int                             xid_index;      /* main loop */
+       int                     index;
+
+       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+       /*
+        * Scan the proc array for stale recovery PGPROC entries, and remove
+        * them. This shouldn't happen, except when FATAL error caused us to
+        * skip the abort record, but we don't want to stop recovery because of
+        * this. Be careful not to confuse super-fresh with stale, because of
+        * race conditions as noted above. Stale entries go first to free their
+        * procs; scan backwards so a swapped-in entry is never skipped.
+        */
+       for (index = arrayP->numProcs - 1; index >= 0; index--)
+       {
+               PGPROC  *p = arrayP->procs[index];
+
+               if (TransactionIdPrecedes(p->xid, xlrec->oldestRunningXid) && p->pid == 0)
+               {
+                       elog(LOG, "removing stale proc array entry for transaction %u", p->xid);
+
+                       arrayP->procs[index] = arrayP->procs[arrayP->numProcs - 1];
+                       arrayP->numProcs--;
+                       FreeRecoveryProcess(p);
+               }
+       }
+
+       /*
+        * Left prune the UnobservedXids array up to latestRunningXid.
+        * This is correct because at the time we take this snapshot, all
+        * completed transactions prior to latestRunningXid will be marked in
+        * WAL or they are explicitly present here.
+        *
+        * We can't clear the array completely because race conditions allow
+        * things to slip through sometimes.
+        */
+       UnobservedTransactionsPruneXids(xlrec->latestRunningXid);
+
+       /*
+        * Scan through the incoming array of RunningXacts and update the
+        * proc array entries so that they match as much as possible.
+        */
+       for (xid_index = 0; xid_index < xlrec->xcnt; xid_index++)
+       {
+               RunningXact             *rxact = (RunningXact *) xlrec->xrun;
+               PGPROC                  *proc = NULL;
+               TransactionId   xid = rxact[xid_index].xid;
+               bool    unobserved = false;
+
+               /*
+                * Look up the incoming xids in the existing proc array.
+                *
+                * XXXHS: This gives O(N^2) behaviour. We could sort the list of
+                * procs first to improve performance if both lists are long.
+                */
+               for (index = 0; index < arrayP->numProcs; index++)
+               {
+                       PGPROC  *p = arrayP->procs[index];
+
+                       if (p->xid == xid)
+                       {
+                               proc = p;
+                               break;
+                       }
+               }
+
+               /*
+                * Create procs for any missing xids, with warning if appropriate.
+                * We do this differently from RecordKnownAssignedTransactionIds()
+                * because here we have a better and possibly full knowledge of
+                * subtransactions.
+                */
+               if (proc == NULL)
+               {
+                       unobserved = XidInUnobservedTransactions(xid);
+
+                       if (!procArray->allowStandbySnapshots || unobserved ||
+                               (!TransactionIdDidCommit(xid) && !TransactionIdDidAbort(xid)))
+                       {
+                               proc = CreateRecoveryProcessForTransactionId(xid);
+
+                               if (proc == NULL)
+                               {
+                                       /*
+                                        * Out of recovery procs: stop here, no more snapshots for a
+                                        * while. Drop our lock first; the callee takes ProcArrayLock.
+                                        */
+                                       LWLockRelease(ProcArrayLock);
+                                       ProcArrayClearRecoveryTransactions();
+                                       return;
+                               }
+
+                               if (unobserved)
+                                       UnobservedTransactionsRemoveXid(xid, true);
+                       }
+                       else
+                       {
+                               /*
+                                * It's possible for a commit or abort to have arrived in WAL
+                                * between us doing GetRunningTransactionData() and grabbing
+                                * the WALInsertLock. Issue a debug message, but that's all.
+                                */
+                               elog(DEBUG2, "proc array entry was missing for transaction %u", xid);
+                               continue;
+                       }
+               }
+
+               /*
+                * If our state information is later for this proc, then
+                * overwrite it. It's possible for a commit and possibly
+                * a new transaction record to have arrived in WAL in between
+                * us doing GetRunningTransactionData() and grabbing the
+                * WALInsertLock, so we mustn't assume we always know best.
+                */
+               if (XLByteLT(proc->lsn, lsn))
+               {
+                       TransactionId   *subxip = (TransactionId *) &(xlrec->xrun[xlrec->xcnt]);
+
+                       proc->lsn = lsn;
+                       /* proc-> pid stays 0 for Recovery Procs */
+
+                       proc->subxids.nxids = rxact[xid_index].nsubxids;
+                       proc->subxids.overflowed = rxact[xid_index].overflowed;
+                       /* this xact's subxids begin subx_offset entries into subxip */
+                       memcpy(proc->subxids.xids, subxip + rxact[xid_index].subx_offset,
+                                               rxact[xid_index].nsubxids * sizeof(TransactionId));
+
+                       /* Remove subtransactions from UnobservedXids also */
+                       if (unobserved)
+                       {
+                               for (index = 0; index < rxact[xid_index].nsubxids; index++)
+                                       UnobservedTransactionsRemoveXid(subxip[index + rxact[xid_index].subx_offset], false);
+                       }
+               }
+
+               elog(trace_recovery(DEBUG5),
+                       "running xact proc->lsn %X/%X lsn %X/%X proc->xid %u xid %u",
+                               proc->lsn.xlogid, proc->lsn.xrecoff,
+                               lsn.xlogid, lsn.xrecoff, proc->xid, rxact[xid_index].xid);
+       }
+
+       /* Advance global latestCompletedXid while holding the lock */
+       if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
+                                                         xlrec->latestCompletedXid))
+               ShmemVariableCache->latestCompletedXid = xlrec->latestCompletedXid;
+
+       /*
+        * If we fully applied the RunningXact data then we can (re)open
+        * for business.
+        */
+       procArray->allowStandbySnapshots = true;
+       SetRunningXactData(true);
+
+       LWLockRelease(ProcArrayLock);
+
+       ProcArrayDisplay(trace_recovery(DEBUG5));
+}
+
+/*
+ * CreateRecoveryProcessForTransactionId
+ *
+ * Obtain a recovery proc for xid and enter it into the proc array. If none
+ * can be had, warn, switch standby snapshots off and return NULL.
+ *
+ * Caller must hold ProcArrayLock; it is still held on return.
+ */
+PGPROC *
+CreateRecoveryProcessForTransactionId(TransactionId xid)
+{
+       PGPROC                  *recoveryProc = InitRecoveryProcess(xid);
+
+       /* Common case: a recovery proc was free, so just register it */
+       if (recoveryProc != NULL)
+       {
+               ProcArrayAdd(recoveryProc, false);
+               return recoveryProc;
+       }
+
+       /*
+        * No recovery proc was free, so punt. It might be possible to wedge
+        * stuff into UnobservedXids, but the code to do this would be complex
+        * and difficult to test.
+        */
+       ereport(WARNING,
+               (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+                errmsg("insufficient recovery procs - standby snapshots disabled"),
+                errdetail("Recovery will continue but standby queries will"
+                       " consistently fail until either more resources are"
+                       " allocated or the transaction load reduces"
+                       " on the master server - not this standby server."),
+                errhint("Increase the \"max_connections\" parameter"
+                   " and restart the server.")));
+
+       /*
+        * Set allowStandbySnapshots = false. From here on we refuse further
+        * snapshots until at least the next RunningXact WAL record arrives,
+        * though we wait until the data all fits in our recovery procs.
+        * This may be a very long time: minutes/hours/days+, but the
+        * important thing is that recovery continues.
+        * Tell the rest of the system via SetRunningXactData() as well.
+        */
+       procArray->allowStandbySnapshots = false;
+       SetRunningXactData(false);
+
+       return NULL;
+}
 
 /*
  * TransactionIdIsInProgress -- is given transaction running in some backend
@@ -589,6 +982,9 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
        TransactionId result;
        int                     index;
 
+       /* Cannot look for individual databases during recovery */
+       Assert(allDbs || !IsRecoveryProcessingMode());
+
        LWLockAcquire(ProcArrayLock, LW_SHARED);
 
        /*
@@ -655,7 +1051,7 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
  * but since PGPROC has only a limited cache area for subxact XIDs, full
  * information may not be available.  If we find any overflowed subxid arrays,
  * we have to mark the snapshot's subxid data as overflowed, and extra work
- * will need to be done to determine what's running (see XidInMVCCSnapshot()
+ * *may* need to be done to determine what's running (see XidInMVCCSnapshot()
  * in tqual.c).
  *
  * We also update the following backend-global variables:
@@ -680,6 +1076,7 @@ GetSnapshotData(Snapshot snapshot)
        int                     index;
        int                     count = 0;
        int                     subcount = 0;
+       bool            suboverflowed = false;
 
        Assert(snapshot != NULL);
 
@@ -707,7 +1104,7 @@ GetSnapshotData(Snapshot snapshot)
                                         errmsg("out of memory")));
                Assert(snapshot->subxip == NULL);
                snapshot->subxip = (TransactionId *)
-                       malloc(arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
+                       malloc((arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS) * sizeof(TransactionId));
                if (snapshot->subxip == NULL)
                        ereport(ERROR,
                                        (errcode(ERRCODE_OUT_OF_MEMORY),
@@ -720,6 +1117,16 @@ GetSnapshotData(Snapshot snapshot)
         */
        LWLockAcquire(ProcArrayLock, LW_SHARED);
 
+       if (IsRecoveryProcessingMode() && !arrayP->allowStandbySnapshots)
+       {
+               LWLockRelease(ProcArrayLock);
+               ereport(ERROR,
+                       (errcode(ERRCODE_QUERY_CANCELED),
+                        errmsg("canceling statement because standby snapshots are currently disabled"),
+                        errdetail("Valid MVCC snapshot cannot be taken at this time."),
+                        errhint("Contact your administrator if this error recurs frequently")));
+       }
+
        /* xmax is always latestCompletedXid + 1 */
        xmax = ShmemVariableCache->latestCompletedXid;
        Assert(TransactionIdIsNormal(xmax));
@@ -771,11 +1178,11 @@ GetSnapshotData(Snapshot snapshot)
                }
 
                /*
-                * Save subtransaction XIDs if possible (if we've already overflowed,
-                * there's no point).  Note that the subxact XIDs must be later than
-                * their parent, so no need to check them against xmin.  We could
-                * filter against xmax, but it seems better not to do that much work
-                * while holding the ProcArrayLock.
+                * Save subtransaction XIDs, whether or not we have overflowed. 
+                * Note that the subxact XIDs must be later than their parent, so no
+                * need to check them against xmin.  We could filter against xmax, 
+                * but it seems better not to do that much work while holding the 
+                * ProcArrayLock.
                 *
                 * The other backend can add more subxids concurrently, but cannot
                 * remove any.  Hence it's important to fetch nxids just once. Should
@@ -784,23 +1191,69 @@ GetSnapshotData(Snapshot snapshot)
                 *
                 * Again, our own XIDs are not included in the snapshot.
                 */
-               if (subcount >= 0 && proc != MyProc)
-               {
-                       if (proc->subxids.overflowed)
-                               subcount = -1;  /* overflowed */
-                       else
+               if (proc != MyProc)
                        {
                                int                     nxids = proc->subxids.nxids;
 
                                if (nxids > 0)
                                {
+                                       if (proc->subxids.overflowed)
+                                               suboverflowed = true;
+
                                        memcpy(snapshot->subxip + subcount,
                                                   (void *) proc->subxids.xids,
                                                   nxids * sizeof(TransactionId));
                                        subcount += nxids;
                                }
+
                        }
                }
+
+       /*
+        * Also check for unobserved xids. There is no need for us to specify
+        * only if IsRecoveryProcessingMode(), since the list will always be
+        * empty when normal processing begins and the test will be optimised
+        * to nearly nothing very quickly.
+        */
+       for (index = 0; index < arrayP->numUnobservedXids; index++)
+       {
+               volatile TransactionId  *UnobservedXids;
+               TransactionId   xid;
+
+               UnobservedXids = (TransactionId *) &(arrayP->procs[arrayP->maxProcs]);
+
+               /* Fetch xid just once - see GetNewTransactionId */
+               xid = UnobservedXids[index];
+
+               /*
+                * If there are no more visible xids, we're done. This works
+                * because UnobservedXids is maintained in strict ascending order.
+                */
+               if (!TransactionIdIsNormal(xid) || TransactionIdPrecedes(xid, xmax))
+                       break;
+
+               /*
+                * Typically, there will be space in the snapshot. We know that the
+                * unobserved xids are being run by one of the procs marked with
+                * an xid of InvalidTransactionId, so we will have ignored that above,
+                * and the xidcache for that proc will have been empty also.
+                *
+                * We put the unobserved xids into the subxid cache. The xid might
+                * be a top-level or it might be a subtransaction, but it won't
+                * change the answer to XidInMVCCSnapshot() whichever it is. That's
+                * just as well, since we don't know which it is, by definition.
+                * The subxid cache gets searched first, so put it there.
+                */
+               snapshot->subxip[subcount++] = xid;
+
+               /*
+                * We don't really need xmin during recovery, but let's derive
+                * it anyway for consistency. It is possible that an unobserved
+                * xid could be xmin if there is contention between long-lived 
+                * transactions.
+                */
+               if (TransactionIdPrecedes(xid, xmin))
+                       xmin = xid;
        }
 
        if (!TransactionIdIsValid(MyProc->xmin))
@@ -824,6 +1277,7 @@ GetSnapshotData(Snapshot snapshot)
        snapshot->xmax = xmax;
        snapshot->xcnt = count;
        snapshot->subxcnt = subcount;
+       snapshot->suboverflowed = suboverflowed;
 
        snapshot->curcid = GetCurrentCommandId(false);
 
@@ -839,32 +1293,223 @@ GetSnapshotData(Snapshot snapshot)
 }
 
 /*
- * GetTransactionsInCommit -- Get the XIDs of transactions that are committing
+ * GetRunningTransactionData -- returns information about running transactions.
  *
- * Constructs an array of XIDs of transactions that are currently in commit
- * critical sections, as shown by having inCommit set in their PGPROC entries.
+ * Similar to GetSnapshotData but returning more information. We include
+ * all PGPROCs with an assigned TransactionId, even VACUUM processes. We
+ * also keep track of which subtransactions go with each PGPROC. All of this
+ * looks very similar to GetSnapshotData, but we have more procs and more info
+ * about each proc.
  *
- * *xids_p is set to a palloc'd array that should be freed by the caller.
- * The return value is the number of valid entries.
+ * This is never executed during recovery so there is no need to look at
+ * UnobservedXids.
  *
- * Note that because backends set or clear inCommit without holding any lock,
- * the result is somewhat indeterminate, but we don't really care.  Even in
- * a multiprocessor with delayed writes to shared memory, it should be certain
- * that setting of inCommit will propagate to shared memory when the backend
- * takes the WALInsertLock, so we cannot fail to see an xact as inCommit if
- * it's already inserted its commit record.  Whether it takes a little while
- * for clearing of inCommit to propagate is unimportant for correctness.
+ * We don't worry about updating other counters, we want to keep this as
+ * simple as possible and leave GetSnapshotData() as the primary code for
+ * that bookkeeping.
  */
-int
-GetTransactionsInCommit(TransactionId **xids_p)
+RunningTransactions
+GetRunningTransactionData(void)
 {
        ProcArrayStruct *arrayP = procArray;
-       TransactionId *xids;
-       int                     nxids;
+       static RunningTransactions CurrentRunningXacts = (RunningTransactions) &CurrentRunningXactsData;
+       RunningXact     *rxact;
+       TransactionId *subxip;
+       TransactionId latestRunningXid = InvalidTransactionId;
+       TransactionId latestCompletedXid;
+       TransactionId oldestRunningXid = InvalidTransactionId;
        int                     index;
+       int                     count = 0;
+       int                     subcount = 0;
+       bool            suboverflowed = false;
 
-       xids = (TransactionId *) palloc(arrayP->maxProcs * sizeof(TransactionId));
-       nxids = 0;
+       /*
+        * Allocating space for maxProcs xids is usually overkill; numProcs would
+        * be sufficient.  But it seems better to do the malloc while not holding
+        * the lock, so we can't look at numProcs.  Likewise, we allocate much
+        * more subxip storage than is probably needed.
+        *
+        * Should only be allocated for bgwriter, since only ever executed
+        * during checkpoints.
+        */
+       if (CurrentRunningXacts->xrun == NULL)
+       {
+               /*
+                * First call
+                */
+               CurrentRunningXacts->xrun = (RunningXact *)
+                       malloc(arrayP->maxProcs * sizeof(RunningXact));
+               if (CurrentRunningXacts->xrun == NULL)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_OUT_OF_MEMORY),
+                                        errmsg("out of memory")));
+               Assert(CurrentRunningXacts->subxip == NULL);
+               CurrentRunningXacts->subxip = (TransactionId *)
+                       malloc((arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS) * sizeof(TransactionId));
+               if (CurrentRunningXacts->subxip == NULL)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_OUT_OF_MEMORY),
+                                        errmsg("out of memory")));
+       }
+
+       rxact = CurrentRunningXacts->xrun;
+       subxip = CurrentRunningXacts->subxip;
+
+       count = 0;
+       subcount = 0;
+       suboverflowed = false;
+
+       LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+       latestCompletedXid = ShmemVariableCache->latestCompletedXid;
+
+       /*
+        * Spin over procArray checking xid, and subxids. Shared lock is enough
+        * because new transactions don't use locks at all, so LW_EXCLUSIVE
+        * wouldn't be enough to prevent them, so don't bother.
+        */
+       for (index = 0; index < arrayP->numProcs; index++)
+       {
+               volatile PGPROC *proc = arrayP->procs[index];
+               TransactionId xid;
+               int                     nxids;
+
+               /* Fetch xid just once - see GetNewTransactionId */
+               xid = proc->xid;
+
+               /*
+                * We store all xids, even XIDs >= xmax and our own XID, if any.
+                * But we don't store transactions that don't have a TransactionId
+                * yet because they will not show as running on a standby server.
+                */
+               if (!TransactionIdIsValid(xid))
+                       continue;
+
+               rxact[count].xid = xid;
+
+               if (TransactionIdPrecedes(latestRunningXid, xid))
+                       latestRunningXid = xid;
+
+               if (!TransactionIdIsValid(oldestRunningXid) || 
+                       TransactionIdPrecedes(xid, oldestRunningXid))
+                       oldestRunningXid = xid;
+
+               /*
+                * Save subtransaction XIDs. 
+                *
+                * The other backend can add more subxids concurrently, but cannot
+                * remove any.  Hence it's important to fetch nxids just once. Should
+                * be safe to use memcpy, though.  (We needn't worry about missing any
+                * xids added concurrently, because they must postdate xmax.)
+                *
+                * Again, our own XIDs *are* included in the snapshot.
+                */
+               nxids = proc->subxids.nxids;
+
+               if (nxids > 0)
+               {
+                       TransactionId *subxids = (TransactionId *) proc->subxids.xids;
+
+                       rxact[count].subx_offset = subcount;
+
+                       memcpy(subxip + subcount,
+                                  (void *) proc->subxids.xids,
+                                  nxids * sizeof(TransactionId));
+                       subcount += nxids;
+
+                       if (proc->subxids.overflowed)
+                       {
+                               rxact[count].overflowed = true;
+                               suboverflowed = true;
+                       }
+                       
+                       if (TransactionIdPrecedes(latestRunningXid, subxids[nxids - 1]))
+                               latestRunningXid = subxids[nxids - 1];
+               }
+               else
+               {
+                       rxact[count].subx_offset = 0;
+                       rxact[count].overflowed = false;
+               }
+
+               rxact[count].nsubxids = nxids;
+               count++;
+       }
+
+       LWLockRelease(ProcArrayLock);
+
+       /*
+        * When there are no transactions running, just use the value
+        * of the last completed transaction. No need to check
+        * ReadNewTransactionId().
+        */
+       if (count == 0)
+               latestRunningXid = latestCompletedXid;
+
+       CurrentRunningXacts->xcnt = count;
+       CurrentRunningXacts->subxcnt = subcount;
+       CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
+       CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
+       if (suboverflowed)
+               CurrentRunningXacts->latestRunningXid = InvalidTransactionId;
+       else
+               CurrentRunningXacts->latestRunningXid = latestRunningXid;
+
+#ifdef RUNNING_XACT_DEBUG
+       elog(trace_recovery(DEBUG3), 
+                                       "logging running xacts xcnt %d subxcnt %d latestCompletedXid %d latestRunningXid %d",
+                                       CurrentRunningXacts->xcnt,
+                                       CurrentRunningXacts->subxcnt,
+                                       CurrentRunningXacts->latestCompletedXid,
+                                       CurrentRunningXacts->latestRunningXid);
+
+       for (index = 0; index < CurrentRunningXacts->xcnt; index++)
+       {
+               int j;
+               elog(trace_recovery(DEBUG3), 
+                                       "xid %d nsubxids %d offset %d, ovflow %s",
+                                       CurrentRunningXacts->xrun[index].xid,
+                                       CurrentRunningXacts->xrun[index].nsubxids,
+                                       CurrentRunningXacts->xrun[index].subx_offset,
+                                       CurrentRunningXacts->xrun[index].overflowed ? "t" : "f");
+               for (j = 0; j < CurrentRunningXacts->xrun[index].nsubxids; j++)
+                       elog(trace_recovery(DEBUG3), 
+                                       "subxid offset %d j %d xid %d", 
+                                       CurrentRunningXacts->xrun[index].subx_offset, j,
+                                       CurrentRunningXacts->subxip[j + CurrentRunningXacts->xrun[index].subx_offset]);
+       }
+#endif
+
+       return CurrentRunningXacts;
+}
+
+/*
+ * GetTransactionsInCommit -- Get the XIDs of transactions that are committing
+ *
+ * Constructs an array of XIDs of transactions that are currently in commit
+ * critical sections, as shown by having inCommit set in their PGPROC entries.
+ *
+ * *xids_p is set to a palloc'd array that should be freed by the caller.
+ * The return value is the number of valid entries.
+ *
+ * Note that because backends set or clear inCommit without holding any lock,
+ * the result is somewhat indeterminate, but we don't really care.  Even in
+ * a multiprocessor with delayed writes to shared memory, it should be certain
+ * that setting of inCommit will propagate to shared memory when the backend
+ * takes the WALInsertLock, so we cannot fail to see an xact as inCommit if
+ * it's already inserted its commit record.  Whether it takes a little while
+ * for clearing of inCommit to propagate is unimportant for correctness.
+ */
+int
+GetTransactionsInCommit(TransactionId **xids_p)
+{
+       ProcArrayStruct *arrayP = procArray;
+       TransactionId *xids;
+       int                     nxids;
+       int                     index;
+
+       xids = (TransactionId *) palloc(arrayP->maxProcs * sizeof(TransactionId));
+       nxids = 0;
 
        LWLockAcquire(ProcArrayLock, LW_SHARED);
 
@@ -967,6 +1612,41 @@ BackendPidGetProc(int pid)
        return result;
 }
 
+/*
+ * BackendXidGetProc -- get a backend's PGPROC given its XID
+ *
+ * Returns NULL if xid is invalid or no ProcArray entry carries it.  The
+ * ProcArrayLock is released before returning, so it is up to the caller to
+ * be sure the answer remains meaningful for long enough to be used.
+ */
+PGPROC *
+BackendXidGetProc(TransactionId xid)
+{
+       PGPROC     *result = NULL;
+       ProcArrayStruct *arrayP = procArray;
+       int                     index;
+
+       if (xid == InvalidTransactionId)        /* never match invalid xid */
+               return NULL;
+
+       LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+       for (index = 0; index < arrayP->numProcs; index++)
+       {
+               PGPROC     *proc = arrayP->procs[index];
+
+               if (proc->xid == xid)
+               {
+                       result = proc;
+                       break;
+               }
+       }
+
+       LWLockRelease(ProcArrayLock);
+
+       return result;
+}
+
 /*
  * BackendXidGetPid -- get a backend's pid given its XID
  *
@@ -1024,13 +1704,14 @@ IsBackendPid(int pid)
  * The array is palloc'd and is terminated with an invalid VXID.
  *
  * If limitXmin is not InvalidTransactionId, we skip any backends
- * with xmin >= limitXmin.     If allDbs is false, we skip backends attached
+ * with xmin >= limitXmin.     If dbOid is valid we skip backends attached
  * to other databases.  If excludeVacuum isn't zero, we skip processes for
  * which (excludeVacuum & vacuumFlags) is not zero.  Also, our own process
  * is always skipped.
+ * 
  */
 VirtualTransactionId *
-GetCurrentVirtualXIDs(TransactionId limitXmin, bool allDbs, int excludeVacuum)
+GetCurrentVirtualXIDs(TransactionId limitXmin, Oid dbOid, int excludeVacuum)
 {
        VirtualTransactionId *vxids;
        ProcArrayStruct *arrayP = procArray;
@@ -1047,13 +1728,13 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool allDbs, int excludeVacuum)
        {
                volatile PGPROC *proc = arrayP->procs[index];
 
-               if (proc == MyProc)
+               if (proc == MyProc || proc->pid == 0)
                        continue;
 
                if (excludeVacuum & proc->vacuumFlags)
                        continue;
 
-               if (allDbs || proc->databaseId == MyDatabaseId)
+               if (!OidIsValid(dbOid) || proc->databaseId == dbOid)
                {
                        /* Fetch xmin just once - might change on us? */
                        TransactionId pxmin = proc->xmin;
@@ -1083,6 +1764,117 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool allDbs, int excludeVacuum)
        return vxids;
 }
 
+/*
+ * GetConflictingVirtualXIDs -- returns an array of currently active VXIDs.
+ *
+ * The array is malloc'd (not palloc'd) and is terminated with an invalid VXID.
+ * If limitXmin is not InvalidTransactionId, we skip any backends with
+ * xmin >= limitXmin or with no xmin set.  If dbOid is valid we skip backends
+ * attached to other databases.  If roleId is valid we skip backends attached
+ * as other roles.  NOTE(review): with a valid roleId the test below compares
+ * only roleId, never databaseId -- confirm that is intended.
+ *
+ * Be careful to *not* pfree the result from this function. We reuse
+ * this array sufficiently often that we use malloc for the result.
+ * The same static buffer is returned, and overwritten, by every call.
+ */
+VirtualTransactionId *
+GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid, Oid roleId)
+{
+       static VirtualTransactionId *vxids;
+       ProcArrayStruct *arrayP = procArray;
+       int                     count = 0;
+       int                     index;
+
+       /*
+        * If first time through, get workspace to hold the result VXIDs in. We
+        * malloc it permanently to avoid repeated palloc/pfree overhead.
+        * Allow result space, remembering room for a terminator.
+        */
+       if (vxids == NULL)
+       {
+               vxids = (VirtualTransactionId *)
+                       malloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1));
+               if (vxids == NULL)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_OUT_OF_MEMORY),
+                                        errmsg("out of memory")));
+       }
+
+       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+       for (index = 0; index < arrayP->numProcs; index++)
+       {
+               volatile PGPROC *proc = arrayP->procs[index];
+
+               /* Exclude recovery procs and prepared transactions */
+               if (proc->pid == 0)
+                       continue;
+
+               if ((!OidIsValid(dbOid) && !OidIsValid(roleId)) || 
+                       (proc->databaseId == dbOid && !OidIsValid(roleId)) ||
+                       (OidIsValid(dbOid) && proc->roleId == roleId))
+               {
+                       /* Fetch xmin just once - can't change on us, but good coding */
+                       TransactionId pxmin = proc->xmin;
+
+                       /*
+                        * If limitXmin is set we explicitly choose to ignore an invalid
+                        * pxmin because this means that backend has no snapshot and
+                        * cannot get another one while we hold exclusive lock.
+                        */
+                       if (!TransactionIdIsValid(limitXmin) ||
+                               (TransactionIdPrecedes(pxmin, limitXmin) && TransactionIdIsValid(pxmin)))
+                       {
+                               VirtualTransactionId vxid;
+
+                               GET_VXID_FROM_PGPROC(vxid, *proc);
+                               if (VirtualTransactionIdIsValid(vxid))
+                                       vxids[count++] = vxid;
+                       }
+               }
+       }
+
+       LWLockRelease(ProcArrayLock);
+
+       /* add the terminator */
+       vxids[count].backendId = InvalidBackendId;
+       vxids[count].localTransactionId = InvalidLocalTransactionId;
+
+       return vxids;
+}
+
+/* VirtualTransactionIdGetProc -- find the PGPROC whose VXID equals vxid, or NULL */
+PGPROC *
+VirtualTransactionIdGetProc(VirtualTransactionId vxid)
+{
+       ProcArrayStruct *arrayP = procArray;
+       PGPROC          *match = NULL;
+       int                     i = 0;
+
+       /* An invalid VXID is never looked up */
+       if (!VirtualTransactionIdIsValid(vxid))
+               return NULL;
+
+       LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+       /* Scan the array under shared lock; stop at the first match */
+       while (match == NULL && i < arrayP->numProcs)
+       {
+               PGPROC     *candidate = arrayP->procs[i++];
+               VirtualTransactionId candvxid;
+
+               GET_VXID_FROM_PGPROC(candvxid, *candidate);
+
+               if (candvxid.backendId == vxid.backendId &&
+                       candvxid.localTransactionId == vxid.localTransactionId)
+                       match = candidate;
+       }
+
+       LWLockRelease(ProcArrayLock);
+
+       return match;
+}
 
 /*
  * CountActiveBackends --- count backends (other than myself) that are in
@@ -1111,7 +1903,7 @@ CountActiveBackends(void)
                if (proc == MyProc)
                        continue;                       /* do not count myself */
                if (proc->pid == 0)
-                       continue;                       /* do not count prepared xacts */
+                       continue;                       /* do not count prepared xacts or recovery procs */
                if (proc->xid == InvalidTransactionId)
                        continue;                       /* do not count if no XID assigned */
                if (proc->waitLock != NULL)
@@ -1139,7 +1931,7 @@ CountDBBackends(Oid databaseid)
                volatile PGPROC *proc = arrayP->procs[index];
 
                if (proc->pid == 0)
-                       continue;                       /* do not count prepared xacts */
+                       continue;                       /* do not count prepared xacts or recovery procs */
                if (proc->databaseId == databaseid)
                        count++;
        }
@@ -1166,7 +1958,7 @@ CountUserBackends(Oid roleid)
                volatile PGPROC *proc = arrayP->procs[index];
 
                if (proc->pid == 0)
-                       continue;                       /* do not count prepared xacts */
+                       continue;                       /* do not count prepared xacts or recovery procs */
                if (proc->roleId == roleid)
                        count++;
        }
@@ -1207,6 +1999,9 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
        int                     autovac_pids[MAXAUTOVACPIDS];
        int                     tries;
 
+       /* Gives wrong answer in recovery, so make sure we don't use it */
+       Assert(!IsRecoveryProcessingMode());
+
        /* 50 tries with 100ms sleep between tries makes 5 sec total wait */
        for (tries = 0; tries < 50; tries++)
        {
@@ -1367,3 +2162,243 @@ DisplayXidCache(void)
 }
 
 #endif   /* XIDCACHE_DEBUG */
+
+/* ----------------------------------------------
+ *             UnobservedTransactions sub-module
+ * ----------------------------------------------
+ *
+ * All functions must be called holding ProcArrayLock.
+ */
+
+/*
+ * Add unobserved xids firstXid .. lastXid - 1 to end of UnobservedXids array
+ */
+void
+UnobservedTransactionsAddXids(TransactionId firstXid, TransactionId lastXid)
+{
+       TransactionId   ixid = firstXid;
+       int                     index = procArray->numUnobservedXids;
+       TransactionId *UnobservedXids;
+
+       UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+       Assert(TransactionIdIsNormal(firstXid));
+       Assert(TransactionIdIsNormal(lastXid));
+       Assert(TransactionIdPrecedes(firstXid, lastXid));
+
+       /*
+        * UnobservedXids is maintained as an ascending list of xids, with no gaps.
+        * Incoming xids are always higher than previous entries, so we just add
+        * them directly to the end of the array.  lastXid itself is not added.
+        */
+       while (ixid != lastXid)
+       {
+               /*
+                * check to see if we have space to store more UnobservedXids
+                */
+               if (index >= procArray->maxUnobservedXids)
+               {
+                       UnobservedTransactionsDisplay(WARNING);
+                       elog(FATAL, "no more room in UnobservedXids array");
+               }
+
+               /*
+                * append ixid to UnobservedXids, sanity checking the slot first
+                */
+#ifdef USE_ASSERT_CHECKING
+               if (TransactionIdIsValid(UnobservedXids[index]))
+               {
+                       UnobservedTransactionsDisplay(LOG);
+                       elog(FATAL, "unobservedxids leak: adding xid %u onto existing entry %u",
+                                                                       ixid, UnobservedXids[index]);
+               }
+
+               if ((index > 0 && TransactionIdPrecedes(ixid, UnobservedXids[index - 1])))
+               {
+                       UnobservedTransactionsDisplay(LOG);
+                       elog(FATAL, "UnobservedXids leak: adding xid %u out of order at index %d",
+                                                                       ixid, index);
+               }
+#endif
+               /* index is the live entry count, so the last entry is at index - 1 */
+               elog(trace_recovery(DEBUG4), "adding unobservedxid %u (numxids %d min %u max %u)",
+                                                                               ixid, index,
+                                                                               UnobservedXids[0],
+                                                                               index > 0 ? UnobservedXids[index - 1] : InvalidTransactionId);
+               UnobservedXids[index] = ixid;
+               index++;
+
+               TransactionIdAdvance(ixid);
+       }
+
+       procArray->numUnobservedXids = index;
+}
+
+/*
+ * Remove one unobserved xid from anywhere on UnobservedXids array.  If xid is
+ * absent, raise ERROR when missing_is_error; xids already pruned are ignored.
+ */
+void
+UnobservedTransactionsRemoveXid(TransactionId xid, bool missing_is_error)
+{
+       int                     index;
+       bool                    found = false;
+       TransactionId   *UnobservedXids;
+
+       UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+       /*
+        * If the array is empty, or xid precedes its first entry (so it has
+        * already been pruned), there is nothing to remove.  Past this point
+        * the array is non-empty and its last entry is at numUnobservedXids - 1.
+        */
+       if (procArray->numUnobservedXids == 0 ||
+               (procArray->numUnobservedXids > 0 &&
+               TransactionIdPrecedes(xid, UnobservedXids[0])))
+               return;
+
+       elog(trace_recovery(DEBUG4), "remove unobservedxid %u (numxids %d min %u max %u)",
+                                                                               xid, procArray->numUnobservedXids,
+                                                                               UnobservedXids[0],
+                                                                               UnobservedXids[procArray->numUnobservedXids - 1]);
+
+       /*
+        * Locate our xid, and if found shunt others sideways to close the gap.
+        */
+       for (index = 0; index < procArray->numUnobservedXids; index++)
+       {
+               if (!found)
+               {
+                       if (UnobservedXids[index] == xid)
+                               found = true;
+               }
+               else
+               {
+                       UnobservedXids[index - 1] = UnobservedXids[index];
+               }
+       }
+
+       if (found)
+       {
+               UnobservedXids[--procArray->numUnobservedXids] = InvalidTransactionId;
+       }
+
+       if (!found && missing_is_error)
+       {
+               UnobservedTransactionsDisplay(LOG);
+               elog(ERROR, "could not remove unobserved xid = %u", xid);
+       }
+}
+
+/*
+ * Prune xids <= limitXid from the front of the array. This frequently means
+ * clearing the whole array, so check for that first.
+ */
+void
+UnobservedTransactionsPruneXids(TransactionId limitXid)
+{
+       int                     index;
+       int                     numXids = procArray->numUnobservedXids;
+       int                     pruneUpToThisIndex = 0;
+       TransactionId   *UnobservedXids;
+
+       UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+       /* The highest xid is the last valid entry, at numXids - 1 */
+       if (numXids == 0 ||
+               TransactionIdFollowsOrEquals(limitXid, UnobservedXids[numXids - 1]))
+       {
+               UnobservedTransactionsClearXids();
+               return;
+       }
+
+       elog(trace_recovery(DEBUG4), "prune unobservedxids up to %u (numxids %d min %u max %u)",
+                                                                               limitXid, numXids,
+                                                                               UnobservedXids[0], UnobservedXids[numXids - 1]);
+
+       for (index = 0; index < numXids; index++)
+       {
+               if (TransactionIdFollowsOrEquals(limitXid, UnobservedXids[index]))
+                       pruneUpToThisIndex = index + 1;
+               else
+               {
+                       /* Anything to delete? */
+                       if (pruneUpToThisIndex == 0)
+                               return;
+
+                       /* Move unpruned values to start of array */
+                       UnobservedXids[index - pruneUpToThisIndex] = UnobservedXids[index];
+               }
+       }
+
+       /* Zero every vacated slot, including pruned ones nothing was moved onto */
+       for (index = numXids - pruneUpToThisIndex; index < numXids; index++)
+               UnobservedXids[index] = InvalidTransactionId;
+
+       procArray->numUnobservedXids = numXids - pruneUpToThisIndex;
+}
+
+/*
+ * Reset UnobservedXids to empty, zeroing every slot that was in use.
+ */
+void
+UnobservedTransactionsClearXids(void)
+{
+       TransactionId   *slot;
+       TransactionId   *end;
+
+       elog(trace_recovery(DEBUG4), "clear UnobservedXids");
+       UnobservedTransactionsDisplay(DEBUG4);
+
+       /* The xid array starts right after the last procs[] entry */
+       slot = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+       end = slot + procArray->numUnobservedXids;
+
+       /*
+        * UnobservedTransactionsAddXids() checks, when asserts are enabled, that
+        * the slot it appends to is empty, so used slots must be zeroed each
+        * time.  That path needs to be fast and accurate, this can be slowish.
+        */
+       while (slot < end)
+               *slot++ = 0;
+
+       procArray->numUnobservedXids = 0;
+}
+
+/* Emit the valid entries of UnobservedXids as one elog line at trace_level */
+void
+UnobservedTransactionsDisplay(int trace_level)
+{
+       TransactionId   *xids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+       StringInfoData buf;
+       int                     i;
+
+       initStringInfo(&buf);
+
+       /* Slots holding an invalid xid are left out of the listing */
+       for (i = 0; i < procArray->numUnobservedXids; i++)
+       {
+               if (!TransactionIdIsValid(xids[i]))
+                       continue;
+               appendStringInfo(&buf, "%u ", xids[i]);
+       }
+
+       elog(trace_level, "%d unobserved xids %s", procArray->numUnobservedXids, buf.data);
+       pfree(buf.data);
+}
+
+/* Report whether xid currently appears in the UnobservedXids array */
+bool
+XidInUnobservedTransactions(TransactionId xid)
+{
+       TransactionId   *xids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+       int                     remaining = procArray->numUnobservedXids;
+
+       /* Linear scan over the in-use slots */
+       while (remaining-- > 0)
+       {
+               if (*xids++ == xid)
+                       return true;
+       }
+       return false;
+}
index cb4e0a942bfcdfda7196cb6d247f2908eb18581e..8e0a60f12055d914661b423d5f47972e561cc593 100644 (file)
@@ -142,6 +142,7 @@ typedef struct ProcState
        int                     nextMsgNum;             /* next message number to read */
        bool            resetState;             /* backend needs to reset its state */
        bool            signaled;               /* backend has been sent catchup signal */
+       bool            sendOnly;               /* backend only sends, never receives */
 
        /*
         * Next LocalTransactionId to use for each idle backend slot.  We keep
@@ -248,7 +249,7 @@ CreateSharedInvalidationState(void)
  *             Initialize a new backend to operate on the sinval buffer
  */
 void
-SharedInvalBackendInit(void)
+SharedInvalBackendInit(bool sendOnly)
 {
        int                     index;
        ProcState  *stateP = NULL;
@@ -307,6 +308,7 @@ SharedInvalBackendInit(void)
        stateP->nextMsgNum = segP->maxMsgNum;
        stateP->resetState = false;
        stateP->signaled = false;
+       stateP->sendOnly = sendOnly;
 
        LWLockRelease(SInvalWriteLock);
 
@@ -578,7 +580,9 @@ SICleanupQueue(bool callerHasWriteLock, int minFree)
        /*
         * Recompute minMsgNum = minimum of all backends' nextMsgNum, identify
         * the furthest-back backend that needs signaling (if any), and reset
-        * any backends that are too far back.
+        * any backends that are too far back. Note that because we ignore
+        * sendOnly backends here it is possible for them to keep sending
+        * messages without a problem even when they are the only active backend.
         */
        min = segP->maxMsgNum;
        minsig = min - SIG_THRESHOLD;
@@ -590,7 +594,7 @@ SICleanupQueue(bool callerHasWriteLock, int minFree)
                int             n = stateP->nextMsgNum;
 
                /* Ignore if inactive or already in reset state */
-               if (stateP->procPid == 0 || stateP->resetState)
+               if (stateP->procPid == 0 || stateP->resetState || stateP->sendOnly)
                        continue;
 
                /*
index 7c8b1f5aace81a2a73604363bf7e9a8961c172c4..b9cd501f6c1e5491641b9a34c2b3cfb82a67f13b 100644 (file)
 #include "access/transam.h"
 #include "access/twophase.h"
 #include "access/twophase_rmgr.h"
+#include "access/xact.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
 #include "pgstat.h"
+#include "storage/sinval.h"
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
 #include "utils/resowner.h"
@@ -490,6 +492,15 @@ LockAcquire(const LOCKTAG *locktag,
        if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
                elog(ERROR, "unrecognized lock mode: %d", lockmode);
 
+       if (IsRecoveryProcessingMode() && 
+               locktag->locktag_type == LOCKTAG_OBJECT &&
+               lockmode > AccessShareLock)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("cannot acquire lockmode %s on database objects while recovery is in progress", 
+                                                                       lockMethodTable->lockModeNames[lockmode]),
+                                errhint("Only AccessShareLock can be acquired on database objects during recovery.")));
+
 #ifdef LOCK_DEBUG
        if (LOCK_DEBUG_ENABLED(locktag))
                elog(LOG, "LockAcquire: lock [%u,%u] %s",
@@ -817,6 +828,54 @@ LockAcquire(const LOCKTAG *locktag,
 
        LWLockRelease(partitionLock);
 
+       /*
+        * We made it all the way here. We've got the lock and we've got
+        * it for the first time in this transaction. So now it's time
+        * to send a WAL message so that standby servers can see this event,
+        * if it's an AccessExclusiveLock on a relation.
+        */
+       if (!IsRecoveryProcessingMode() && lockmode >= AccessExclusiveLock && 
+               locktag->locktag_type == LOCKTAG_RELATION)
+       {
+               XLogRecData             rdata;
+               xl_rel_lock             xlrec;
+               TransactionId   xid;
+
+               /*
+                * First thing we do is ensure that a TransactionId has been
+                * assigned to this transaction. We don't actually need the xid
+                * but if we don't do this then RecordTransactionCommit() and
+                * RecordTransactionAbort() will optimise away the transaction
+                * completion record which recovery relies upon to release locks.
+                * It's a hack, but for a corner case not worth adding code for 
+                * into the main commit path.
+                */
+               xid = GetTopTransactionId();
+               Assert(TransactionIdIsValid(xid));
+
+               Assert(OidIsValid(locktag->locktag_field2));
+
+               START_CRIT_SECTION();
+
+               /* 
+                * Decode the locktag back to the original values, to avoid
+                * sending lots of empty bytes with every message.  See
+                * lock.h to check how a locktag is defined  for LOCKTAG_RELATION
+                */
+               xlrec.xid = xid;
+               xlrec.dbOid = locktag->locktag_field1;
+               xlrec.relOid = locktag->locktag_field2;
+
+               rdata.data = (char *) (&xlrec);
+               rdata.len = sizeof(xl_rel_lock);
+               rdata.buffer = InvalidBuffer;
+               rdata.next = NULL;
+
+               (void) XLogInsert(RM_RELATION_ID, XLOG_RELATION_LOCK, &rdata);
+
+               END_CRIT_SECTION();
+       }
+
        return LOCKACQUIRE_OK;
 }
 
index f2ccbe14e7cc8b4ec767b93830440d04ab243625..ea55be4f4866841562833213fe69a7af16139396 100644 (file)
@@ -693,3 +693,18 @@ LWLockHeldByMe(LWLockId lockid)
        }
        return false;
 }
+
+void
+PrintLWLocksHeldByMe(void)
+{
+       int                     slot = 0;       /* position within held_lwlocks[] */
+
+       for (; slot < num_held_lwlocks; slot++) /* one LOG line per entry */
+               elog(LOG, "leak held_lwlocks[%d] = %d", slot, held_lwlocks[slot]);
+}
+
+int
+NumLWLocksHeldByMe(void)
+{
+       return num_held_lwlocks;        /* number of entries in held_lwlocks[] */
+}
index 9e871eff92a0e6d8d4418b9fe0fd61a6930d7ab6..489c9a07d52e5b3b08f110e942d83c60601a5a02 100644 (file)
  *
  * ProcKill -- destroys the shared memory state (and locks)
  * associated with the process.
+ *
+ * In 8.4 we introduce the idea of recovery procs which hold state
+ * information for transactions currently being replayed. Many of the
+ * functions here apply only to real procs representing connected users.
  */
 #include "postgres.h"
 
@@ -103,6 +107,8 @@ ProcGlobalShmemSize(void)
        size = add_size(size, mul_size(NUM_AUXILIARY_PROCS, sizeof(PGPROC)));
        /* MyProcs, including autovacuum */
        size = add_size(size, mul_size(MaxBackends, sizeof(PGPROC)));
+       /* RecoveryProcs, including recovery actions by autovacuum */
+       size = add_size(size, mul_size(MaxBackends, sizeof(PGPROC)));
        /* ProcStructLock */
        size = add_size(size, sizeof(slock_t));
 
@@ -172,6 +178,7 @@ InitProcGlobal(void)
         */
        ProcGlobal->freeProcs = NULL;
        ProcGlobal->autovacFreeProcs = NULL;
+       ProcGlobal->freeRecoveryProcs = NULL;
 
        ProcGlobal->spins_per_delay = DEFAULT_SPINS_PER_DELAY;
 
@@ -204,6 +211,35 @@ InitProcGlobal(void)
                ProcGlobal->autovacFreeProcs = &procs[i];
        }
 
+       /* 
+        * Create enough recovery procs so there is a shadow proc for every
+        * proc on the master, including both normal procs, autovac procs
+        * and anything else that might run transactions and write WAL.
+        * Bgwriter writes WAL but does not have a TransactionId, so ignore.
+        * We use the same procs for prepared transactions whether we are
+        * in recovery or not, so no space required for them either.
+        * 
+        * Recovery procs are just ghosts which store just enough information 
+        * to make them look real to anyone requesting a snapshot from the 
+        * procarray. So recovery procs don't need semaphores because they 
+        * aren't actually performing any work.
+        *
+        * Although the recovery procs tie up some shared memory they will
+        * not be part of the ProcArray once the database has fully started
+        * up, so there is little performance effect during normal running.
+        */
+       procs = (PGPROC *) ShmemAlloc((MaxBackends) * sizeof(PGPROC));
+       if (!procs)
+               ereport(FATAL,
+                               (errcode(ERRCODE_OUT_OF_MEMORY),
+                                errmsg("out of shared memory")));
+       MemSet(procs, 0, MaxBackends * sizeof(PGPROC));
+       for (i = 0; i < MaxBackends; i++)
+       {
+               procs[i].links.next = (SHM_QUEUE *) ProcGlobal->freeRecoveryProcs;
+               ProcGlobal->freeRecoveryProcs = &procs[i];
+       }
+
        MemSet(AuxiliaryProcs, 0, NUM_AUXILIARY_PROCS * sizeof(PGPROC));
        for (i = 0; i < NUM_AUXILIARY_PROCS; i++)
        {
@@ -342,7 +378,7 @@ InitProcessPhase2(void)
        /*
         * Add our PGPROC to the PGPROC array in shared memory.
         */
-       ProcArrayAdd(MyProc);
+       ProcArrayAdd(MyProc, true);
 
        /*
         * Arrange to clean that up at backend exit.
@@ -363,6 +399,11 @@ InitProcessPhase2(void)
  * to the ProcArray or the sinval messaging mechanism, either. They also
  * don't get a VXID assigned, since this is only useful when we actually
  * hold lockmgr locks.
+ *
+ * Startup process however uses locks but never waits for them in the
+ * normal backend sense. Startup process also takes part in sinval messaging
+ * as a sendOnly process, so never reads messages from sinval queue. So
+ * Startup process does have a VXID and does show up in pg_locks.
  */
 void
 InitAuxiliaryProcess(void)
@@ -451,6 +492,153 @@ InitAuxiliaryProcess(void)
        on_shmem_exit(AuxiliaryProcKill, Int32GetDatum(proctype));
 }
 
+/*
+ * InitRecoveryProcess -- take a PGPROC off the recovery free list and set it
+ *                                                     up to emulate the given transaction in recovery
+ *
+ * The returned proc carries xid and otherwise cleared fields; it has not
+ * been added to the proc array.
+ * Note: returns NULL if no proc was available - this is not an error, it
+ * will just force a change of state in the proc array.
+ */
+PGPROC *
+InitRecoveryProcess(TransactionId xid)
+{
+       /* use volatile pointer to prevent code rearrangement */
+       volatile PROC_HDR *hdr = ProcGlobal;
+       PGPROC          *recproc;
+
+       /*
+        * ProcGlobal should be set up already (if we are a backend, we inherit
+        * this by fork() or EXEC_BACKEND mechanism from the postmaster).
+        */
+       if (hdr == NULL)
+               elog(PANIC, "proc header uninitialized");
+
+       /*
+        * Pop the head of the recovery free list, if it has one.  The spinlock
+        * is dropped on both outcomes before we look at what we got.
+        */
+       SpinLockAcquire(ProcStructLock);
+
+       recproc = hdr->freeRecoveryProcs;
+       if (recproc != NULL)
+               hdr->freeRecoveryProcs = (PGPROC *) recproc->links.next;
+
+       SpinLockRelease(ProcStructLock);
+
+       /*
+        * An empty list means we are out of recovery PGPROC structures.  If we
+        * did throw an ERROR, it would be here; instead the caller gets NULL.
+        */
+       if (recproc == NULL)
+               return NULL;
+
+       /*
+        * We haven't added it to proc array yet, so no locking required here.
+        */
+       recproc->xid = xid;
+
+       /*
+        * The following are not used for recovery procs
+        */
+       recproc->backendId = InvalidBackendId;
+       recproc->pid = 0;
+       recproc->lxid = InvalidLocalTransactionId;
+       recproc->xmin = InvalidTransactionId;
+       recproc->databaseId = InvalidOid;
+       recproc->roleId = InvalidOid;
+       recproc->inCommit = false;
+       recproc->vacuumFlags = 0;
+
+       /* ... and the wait-related fields all start out clear */
+       recproc->waitStatus = STATUS_OK;
+       recproc->lwWaiting = false;
+       recproc->lwExclusive = false;
+       recproc->lwWaitLink = NULL;
+       recproc->waitLock = NULL;
+       recproc->waitProcLock = NULL;
+
+       /*
+        * There is little else to do. The recovery proc is never used to
+        * acquire buffers, nor will we ever acquire LWlocks using the proc.
+        * Deadlock checker is not active during recovery.
+        */
+       return recproc;
+}
+
+/* FreeRecoveryProcess -- give a recovery PGPROC back to its free list */
+void
+FreeRecoveryProcess(PGPROC *proc)
+{
+       volatile PROC_HDR *hdr = ProcGlobal;
+
+       SpinLockAcquire(ProcStructLock);
+       /* Link proc in as the new head of the list */
+       proc->links.next = (SHM_QUEUE *) hdr->freeRecoveryProcs;
+       hdr->freeRecoveryProcs = proc;
+
+       SpinLockRelease(ProcStructLock);
+}
+
+/*
+ * Record the Startup process's PGPROC and pid in ProcGlobal
+ */
+void
+PublishStartupProcessInformation(void)
+{
+       /* use volatile pointer to prevent code rearrangement */
+       volatile PROC_HDR *hdr = ProcGlobal;
+
+       /*
+        * ProcSendSignal() uses this information to find the Startup process;
+        * see comments there.  Both fields are written holding ProcStructLock.
+        */
+       SpinLockAcquire(ProcStructLock);
+
+       hdr->startupProc = MyProc;
+       hdr->startupProcPid = MyProcPid;
+
+       SpinLockRelease(ProcStructLock);
+}
+
+/*
+ * Store a conflict LSN and cancel mode in the given proc, under ProcStructLock.
+ */
+void
+ProcSetRecoveryConflict(PGPROC *proc, XLogRecPtr conflict_LSN, int cancel_mode)
+{
+       /* go through a volatile pointer to prevent code rearrangement */
+       volatile PGPROC *target = proc;
+
+       SpinLockAcquire(ProcStructLock);
+
+       target->recoveryConflictLSN = conflict_LSN;
+       target->recoveryConflictCancelMode = cancel_mode;
+
+       SpinLockRelease(ProcStructLock);
+}
+
+/*
+ * Fetch MyProc's recovery conflict LSN (returned) and cancel mode (stored
+ * into *cancel_mode), reading both while holding ProcStructLock.
+ */
+XLogRecPtr
+ProcGetRecoveryConflict(int *cancel_mode)
+{
+       volatile PGPROC *me = MyProc;
+       XLogRecPtr      lsn;
+
+       SpinLockAcquire(ProcStructLock);
+
+       lsn = me->recoveryConflictLSN;
+       *cancel_mode = me->recoveryConflictCancelMode;
+
+       SpinLockRelease(ProcStructLock);
+
+       return lsn;
+}
+
 /*
  * Check whether there are at least N free PGPROC objects.
  *
@@ -565,17 +753,21 @@ ProcReleaseLocks(bool isCommit)
 
 /*
  * RemoveProcFromArray() -- Remove this process from the shared ProcArray.
+ *
+ * Only intended for use with real procs, not recovery procs.
  */
 static void
 RemoveProcFromArray(int code, Datum arg)
 {
        Assert(MyProc != NULL);
-       ProcArrayRemove(MyProc, InvalidTransactionId);
+       ProcArrayRemove(MyProc, InvalidTransactionId, 0, NULL);
 }
 
 /*
  * ProcKill() -- Destroy the per-proc data structure for
  *             this process. Release any of its held LW locks.
+ *
+ * Only intended for use with real procs, not recovery procs.
  */
 static void
 ProcKill(int code, Datum arg)
@@ -1271,7 +1463,31 @@ ProcWaitForSignal(void)
 void
 ProcSendSignal(int pid)
 {
-       PGPROC     *proc = BackendPidGetProc(pid);
+       PGPROC     *proc = NULL;
+
+       if (IsRecoveryProcessingMode())
+       {
+               /* use volatile pointer to prevent code rearrangement */
+               volatile PROC_HDR *procglobal = ProcGlobal;
+
+               SpinLockAcquire(ProcStructLock);
+
+               /*
+                * Check to see whether it is the Startup process we wish to signal.
+                * This call is made by the buffer manager when it wishes to wake
+                * up a process that has been waiting for a pin so that it can obtain
+                * a cleanup lock using LockBufferForCleanup(). Startup is not a normal
+                * backend, so BackendPidGetProc() will not find a PGPROC for its pid.
+                * So we remember the information for this special case.
+                */
+               if (pid == procglobal->startupProcPid)
+                       proc = procglobal->startupProc;
+
+               SpinLockRelease(ProcStructLock);
+       }
+
+       if (proc == NULL) 
+               proc = BackendPidGetProc(pid);
 
        if (proc != NULL)
                PGSemaphoreUnlock(&proc->sem);
index 3781b55be899c6d65c786ec0e727b14417a1ad4a..de666acedf70635f47063cf40d80b2808f05f0ce 100644 (file)
@@ -2579,8 +2579,8 @@ StatementCancelHandler(SIGNAL_ARGS)
                 * the interrupt immediately.  No point in interrupting if we're
                 * waiting for input, however.
                 */
-               if (ImmediateInterruptOK && InterruptHoldoffCount == 0 &&
-                       CritSectionCount == 0 && !DoingCommandRead)
+               if (InterruptHoldoffCount == 0 && CritSectionCount == 0 && 
+                       (DoingCommandRead || ImmediateInterruptOK))
                {
                        /* bump holdoff count to make ProcessInterrupts() a no-op */
                        /* until we are done getting ready for it */
@@ -2660,10 +2660,37 @@ ProcessInterrupts(void)
                        ereport(ERROR,
                                        (errcode(ERRCODE_QUERY_CANCELED),
                                         errmsg("canceling autovacuum task")));
-               else
+               else 
+               {
+                       if (IsRecoveryProcessingMode())
+                       {
+                               int             cancel_mode = 0;
+                               XLogRecPtr      conflict_LSN = ProcGetRecoveryConflict(&cancel_mode);
+
+                               switch (cancel_mode)
+                               {
+                                       case FATAL:
+                                                       ereport(FATAL,
+                                                               (errcode(ERRCODE_QUERY_CANCELED),
+                                                                errmsg("canceling session due to conflict with recovery")));
+                                       case ERROR:
+                                                       if (XLogRecPtrIsValid(conflict_LSN))
+                                                               SetBufferRecoveryConflictLSN(conflict_LSN);
+                                                       else
+                                                               ereport(ERROR,
+                                                                       (errcode(ERRCODE_QUERY_CANCELED),
+                                                                        errmsg("canceling statement due to conflict with recovery")));
+                                                       return; 
+                                       default:
+                                                       /* No conflict pending, so fall through */
+                                                       break;
+                               }
+                       }
+
                        ereport(ERROR,
                                        (errcode(ERRCODE_QUERY_CANCELED),
                                         errmsg("canceling statement due to user request")));
+               }
        }
        /* If we get here, do nothing (probably, QueryCancelPending was reset) */
 }
index 751d7deaa5a1061aff4509dce90b556107e4cbcd..e7ad3faaca108c2db70be81d16aea4a533ee0c76 100644 (file)
@@ -287,10 +287,22 @@ ProcessUtility(Node *parsetree,
                                                                        SetPGVariable("transaction_isolation",
                                                                                                  list_make1(item->arg),
                                                                                                  true);
+                                                               
                                                                else if (strcmp(item->defname, "transaction_read_only") == 0)
+                                                               {
+                                                                       A_Const    *con;
+
+                                                                       Assert(IsA(item->arg, A_Const));
+                                                                       con = (A_Const *) item->arg;
+                                                                       Assert(nodeTag(&con->val) == T_Integer);
+
+                                                                       if (!intVal(&con->val))
+                                                                               PreventCommandDuringRecovery();
+
                                                                        SetPGVariable("transaction_read_only",
                                                                                                  list_make1(item->arg),
                                                                                                  true);
+                                                               }
                                                        }
                                                }
                                                break;
@@ -305,6 +317,7 @@ ProcessUtility(Node *parsetree,
                                                break;
 
                                        case TRANS_STMT_PREPARE:
+                                               PreventCommandDuringRecovery();
                                                if (!PrepareTransactionBlock(stmt->gid))
                                                {
                                                        /* report unsuccessful commit in completionTag */
@@ -314,11 +327,13 @@ ProcessUtility(Node *parsetree,
                                                break;
 
                                        case TRANS_STMT_COMMIT_PREPARED:
+                                               PreventCommandDuringRecovery();
                                                PreventTransactionChain(isTopLevel, "COMMIT PREPARED");
                                                FinishPreparedTransaction(stmt->gid, true);
                                                break;
 
                                        case TRANS_STMT_ROLLBACK_PREPARED:
+                                               PreventCommandDuringRecovery();
                                                PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED");
                                                FinishPreparedTransaction(stmt->gid, false);
                                                break;
@@ -676,6 +691,7 @@ ProcessUtility(Node *parsetree,
                        break;
 
                case T_GrantStmt:
+                       PreventCommandDuringRecovery();
                        ExecuteGrantStmt((GrantStmt *) parsetree);
                        break;
 
@@ -846,6 +862,7 @@ ProcessUtility(Node *parsetree,
                case T_NotifyStmt:
                        {
                                NotifyStmt *stmt = (NotifyStmt *) parsetree;
+                               PreventCommandDuringRecovery();
 
                                Async_Notify(stmt->conditionname);
                        }
@@ -854,6 +871,7 @@ ProcessUtility(Node *parsetree,
                case T_ListenStmt:
                        {
                                ListenStmt *stmt = (ListenStmt *) parsetree;
+                               PreventCommandDuringRecovery();
 
                                Async_Listen(stmt->conditionname);
                        }
@@ -862,6 +880,7 @@ ProcessUtility(Node *parsetree,
                case T_UnlistenStmt:
                        {
                                UnlistenStmt *stmt = (UnlistenStmt *) parsetree;
+                               PreventCommandDuringRecovery();
 
                                if (stmt->conditionname)
                                        Async_Unlisten(stmt->conditionname);
@@ -881,10 +900,12 @@ ProcessUtility(Node *parsetree,
                        break;
 
                case T_ClusterStmt:
+                       PreventCommandDuringRecovery();
                        cluster((ClusterStmt *) parsetree, isTopLevel);
                        break;
 
                case T_VacuumStmt:
+                       PreventCommandDuringRecovery();
                        vacuum((VacuumStmt *) parsetree, InvalidOid, true, NULL, false,
                                   isTopLevel);
                        break;
@@ -1000,12 +1021,14 @@ ProcessUtility(Node *parsetree,
                                ereport(ERROR,
                                                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                                                 errmsg("must be superuser to do CHECKPOINT")));
+                       PreventCommandDuringRecovery();
                        RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
                        break;
 
                case T_ReindexStmt:
                        {
                                ReindexStmt *stmt = (ReindexStmt *) parsetree;
+                               PreventCommandDuringRecovery();
 
                                switch (stmt->kind)
                                {
@@ -2490,3 +2513,12 @@ GetCommandLogLevel(Node *parsetree)
 
        return lev;
 }
+
+/*
+ * PreventCommandDuringRecovery
+ *
+ * Raise an ERROR with ERRCODE_READ_ONLY_SQL_TRANSACTION if the server is
+ * in recovery processing mode; otherwise return without doing anything.
+ */
+void
+PreventCommandDuringRecovery(void)
+{
+       /* Nothing to forbid once recovery is over */
+       if (!IsRecoveryProcessingMode())
+               return;
+
+       ereport(ERROR,
+                       (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION),
+                        errmsg("cannot be run until recovery completes")));
+}
index 7e51f9e2ad2fc375e3744203bac01fe44a3dbcae..81814e6e2b77dffa2e60b13e737addc14c73dc44 100644 (file)
@@ -338,6 +338,12 @@ txid_current(PG_FUNCTION_ARGS)
        txid            val;
        TxidEpoch       state;
 
+       if (IsRecoveryProcessingMode())
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("cannot assign txid while recovery is in progress"),
+                                errhint("only read only queries can execute during recovery")));
+
        load_xid_epoch(&state);
 
        val = convert_xid(GetTopTransactionId(), &state);
index 9738fa1c31a8e417d7cfbda8357d12462b2d6b85..03a8ba372a3810eea3fabf2d50ffcaa868fbe8a9 100644 (file)
  */
 #include "postgres.h"
 
+#include <signal.h>
+
+#include "access/transam.h"
 #include "access/twophase_rmgr.h"
 #include "access/xact.h"
 #include "catalog/catalog.h"
 #include "miscadmin.h"
+#include "storage/lmgr.h"
+#include "storage/procarray.h"
+#include "storage/proc.h"
 #include "storage/sinval.h"
 #include "storage/smgr.h"
 #include "utils/inval.h"
@@ -155,6 +161,14 @@ typedef struct TransInvalidationInfo
 
 static TransInvalidationInfo *transInvalInfo = NULL;
 
+static SharedInvalidationMessage *SharedInvalidMessagesArray;
+static int                                     numSharedInvalidMessagesArray;
+static int                                     maxSharedInvalidMessagesArray;
+
+static List *RecoveryLockList;
+static MemoryContext   RelationLockContext;
+
+
 /*
  * Dynamically-registered callback functions.  Current implementation
  * assumes there won't be very many of these at once; could improve if needed.
@@ -741,6 +755,8 @@ AtStart_Inval(void)
                MemoryContextAllocZero(TopTransactionContext,
                                                           sizeof(TransInvalidationInfo));
        transInvalInfo->my_level = GetCurrentTransactionNestLevel();
+       SharedInvalidMessagesArray = NULL;
+       numSharedInvalidMessagesArray = 0;
 }
 
 /*
@@ -851,6 +867,126 @@ inval_twophase_postcommit(TransactionId xid, uint16 info,
        }
 }
 
+/*
+ * MakeSharedInvalidMessagesArray
+ *
+ * Append the n messages at msgs to SharedInvalidMessagesArray. The array
+ * is allocated on first use (FIRSTCHUNKSIZE entries) and its capacity is
+ * doubled as often as needed for the new messages to fit.
+ */
+static void
+MakeSharedInvalidMessagesArray(const SharedInvalidationMessage *msgs, int n)
+{
+       int             needed;
+
+       if (SharedInvalidMessagesArray == NULL)
+       {
+               /*
+                * First call since the array pointer was last reset. Although this
+                * is palloc'd we never free it directly: we're so close to EOXact
+                * that we know we're going to lose it anyhow.
+                */
+               numSharedInvalidMessagesArray = 0;
+               maxSharedInvalidMessagesArray = FIRSTCHUNKSIZE;
+               SharedInvalidMessagesArray =
+                       palloc(maxSharedInvalidMessagesArray *
+                                  sizeof(SharedInvalidationMessage));
+       }
+
+       /* Grow by doubling until the existing plus the new messages fit */
+       needed = numSharedInvalidMessagesArray + n;
+       if (needed > maxSharedInvalidMessagesArray)
+       {
+               do
+               {
+                       maxSharedInvalidMessagesArray *= 2;
+               } while (needed > maxSharedInvalidMessagesArray);
+
+               SharedInvalidMessagesArray =
+                       repalloc(SharedInvalidMessagesArray,
+                                        maxSharedInvalidMessagesArray *
+                                        sizeof(SharedInvalidationMessage));
+       }
+
+       /* Copy the new chunk in directly after the existing entries */
+       memcpy(&SharedInvalidMessagesArray[numSharedInvalidMessagesArray],
+                  msgs, n * sizeof(SharedInvalidationMessage));
+       numSharedInvalidMessagesArray = needed;
+}
+
+/*
+ * xactGetCommittedInvalidationMessages
+ *
+ * Executed by RecordTransactionCommit() to obtain the invalidation
+ * messages that are added onto the commit record. This applies only to
+ * commit records, never to abort records. Must always run before
+ * AtEOXact_Inval(), since that removes the data we need to see.
+ *
+ * Remember that this runs before we have officially committed, so we
+ * must not do anything here to change what might occur *if* we should
+ * fail between here and the actual commit.
+ *
+ * Note that transactional invalidation does *not* write an invalidation
+ * WAL message using XLOG_RELATION_INVAL messages. Those are only used
+ * by non-transactional invalidation. See comments in
+ * EndNonTransactionalInvalidation().
+ *
+ * On return, *msgs points at the collected array (it may be NULL when the
+ * result is zero) and *RelcacheInitFileInval is set from the transaction's
+ * RelcacheInitFileInval flag. The result is the number of messages.
+ *
+ * See also xact_redo_commit() and xact_desc_commit().
+ */
+int
+xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs, 
+                                                                               bool *RelcacheInitFileInval)
+{
+       MemoryContext   savedContext;
+
+       /* Must be at top of stack */
+       Assert(transInvalInfo != NULL && transInvalInfo->parent == NULL);
+
+       /* Hand the relcache init file flag back as a plain true/false */
+       *RelcacheInitFileInval = transInvalInfo->RelcacheInitFileInval ? true : false;
+
+       /*
+        * Flatten the pending messages held in TransInvalidationInfo into one
+        * contiguous array, which is what a WAL record needs so it can be
+        * copied in directly. The current-command list is collected first,
+        * then the prior-command list; the array is built while
+        * CurTransactionContext is the current memory context.
+        */
+       savedContext = MemoryContextSwitchTo(CurTransactionContext);
+       ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs,
+                                                                        MakeSharedInvalidMessagesArray);
+       ProcessInvalidationMessagesMulti(&transInvalInfo->PriorCmdInvalidMsgs,
+                                                                        MakeSharedInvalidMessagesArray);
+       MemoryContextSwitchTo(savedContext);
+
+#ifdef STANDBY_INVAL_DEBUG
+       if (numSharedInvalidMessagesArray > 0)
+       {
+               int             idx;
+
+               elog(LOG, "numSharedInvalidMessagesArray = %d", numSharedInvalidMessagesArray);
+
+               Assert(SharedInvalidMessagesArray != NULL);
+
+               /* Dump one line per message, classified by its id */
+               for (idx = 0; idx < numSharedInvalidMessagesArray; idx++)
+               {
+                       SharedInvalidationMessage *cur = &SharedInvalidMessagesArray[idx];
+
+                       if (cur->id >= 0)
+                               elog(LOG, "catcache id %d", cur->id);
+                       else if (cur->id == SHAREDINVALRELCACHE_ID)
+                               elog(LOG, "relcache id %d", cur->id);
+                       else if (cur->id == SHAREDINVALSMGR_ID)
+                               elog(LOG, "smgr cache id %d", cur->id);
+               }
+       }
+#endif
+
+       /* A non-empty result must be backed by an allocated array */
+       Assert(numSharedInvalidMessagesArray <= 0 ||
+                  SharedInvalidMessagesArray != NULL);
+
+       *msgs = SharedInvalidMessagesArray;
+       return numSharedInvalidMessagesArray;
+}
 
 /*
  * AtEOXact_Inval
@@ -1041,6 +1177,42 @@ BeginNonTransactionalInvalidation(void)
        Assert(transInvalInfo->CurrentCmdInvalidMsgs.cclist == NULL);
        Assert(transInvalInfo->CurrentCmdInvalidMsgs.rclist == NULL);
        Assert(transInvalInfo->RelcacheInitFileInval == false);
+
+       SharedInvalidMessagesArray = NULL;
+       numSharedInvalidMessagesArray = 0;
+}
+
+/*
+ * LogSharedInvalidMessagesArray
+ *
+ * Write the contents of SharedInvalidMessagesArray to WAL as a single
+ * XLOG_RELATION_INVAL record; does nothing when the array is empty.
+ * The only current caller is EndNonTransactionalInvalidation(), but
+ * that may change.
+ */
+static void
+LogSharedInvalidMessagesArray(void)
+{
+       xl_rel_inval    xlrec;
+       XLogRecData             rdata[2];
+
+       if (numSharedInvalidMessagesArray == 0)
+               return;
+
+       START_CRIT_SECTION();
+
+       /* First chain element: the fixed-size part holding the message count */
+       xlrec.nmsgs = numSharedInvalidMessagesArray;
+       rdata[0].buffer = InvalidBuffer;
+       rdata[0].data = (char *) (&xlrec);
+       rdata[0].len = MinSizeOfRelationInval;
+       rdata[0].next = &(rdata[1]);
+
+       /* Second and last chain element: the messages themselves */
+       rdata[1].buffer = InvalidBuffer;
+       rdata[1].data = (char *) SharedInvalidMessagesArray;
+       rdata[1].len = numSharedInvalidMessagesArray *
+                                       sizeof(SharedInvalidationMessage);
+       rdata[1].next = NULL;
+
+       (void) XLogInsert(RM_RELATION_ID, XLOG_RELATION_INVAL, rdata);
+
+       END_CRIT_SECTION();
 }
 
 /*
@@ -1081,7 +1253,25 @@ EndNonTransactionalInvalidation(void)
        ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs,
                                                                         SendSharedInvalidMessages);
 
+       /* 
+        * Write invalidation messages to WAL. This is not required for
+        * recovery, it is only required for standby servers. It's fairly
+        * low overhead so don't worry. This allows us to trigger inval
+        * messages on the standby as soon as we see these records.
+        * see relation_redo_inval()
+        * 
+        * Note that transactional invalidation uses an array attached to
+        * a WAL commit record, so these messages are rare.
+        */
+       ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs,
+                                                                        MakeSharedInvalidMessagesArray);
+       LogSharedInvalidMessagesArray();
+
        /* Clean up and release memory */
+
+       /* XXXHS: Think some more on memory allocation and freeing.
+        */
+
        for (chunk = transInvalInfo->CurrentCmdInvalidMsgs.cclist;
                 chunk != NULL;
                 chunk = next)
@@ -1235,3 +1425,455 @@ CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
 
        ++relcache_callback_count;
 }
+
+/*
+ * -----------------------------------------------------
+ *             Standby wait timers and backend cancel logic
+ * -----------------------------------------------------
+ */
+
+/*
+ * InitStandbyDelayTimers
+ *
+ * (Re)initialise the two counters used by the standby wait loop:
+ * *currentDelay_ms starts from GetLatestReplicationDelay(), clamped to
+ * zero if negative, and *standbyWait_ms starts at STANDBY_INITIAL_WAIT_MS.
+ */
+static void
+InitStandbyDelayTimers(int *currentDelay_ms, int *standbyWait_ms)
+{
+#define STANDBY_INITIAL_WAIT_MS  1
+       int             delay_ms = GetLatestReplicationDelay();
+
+       /*
+        * A negative delay is treated as zero and we work up from there. This
+        * is meant to stop us acting foolishly on an enormously huge
+        * replication delay when replaying old log files (presumably such a
+        * delay shows up here as a negative number -- confirm against
+        * GetLatestReplicationDelay()).
+        */
+       *currentDelay_ms = (delay_ms < 0) ? 0 : delay_ms;
+
+       *standbyWait_ms = STANDBY_INITIAL_WAIT_MS;
+}
+
+/*
+ * Has the accumulated delay reached max_standby_delay?
+ *
+ * maxStandbyDelay is in seconds and a negative value means "wait for ever".
+ * The comparison is done in whole seconds rather than by scaling
+ * maxStandbyDelay up to milliseconds: for a non-negative delay,
+ * (delay_ms / 1000 >= max) is exactly (delay_ms >= max * 1000), but the
+ * multiplication overflows a signed int when max exceeds INT_MAX / 1000.
+ */
+static bool
+StandbyDelayLimitReached(int currentDelay_ms)
+{
+       if (maxStandbyDelay < 0 || currentDelay_ms < 0)
+               return false;
+
+       return (currentDelay_ms / 1000) >= maxStandbyDelay;
+}
+
+/*
+ * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
+ * We wait here for a while then return. If we decide we can't wait any
+ * more then we return true, if we can wait some more return false.
+ *
+ * *currentDelay_ms is advanced by the time slept. *standbyWait_ms, the
+ * length of the next sleep, is doubled on each call up to a ceiling of
+ * 1000ms.
+ */
+static bool
+WaitExceedsMaxStandbyDelay(int *currentDelay_ms, int *standbyWait_ms)
+{
+       /*
+        * If the server is already further behind than we would
+        * like then no need to wait or do more complex logic.
+        * max_standby_delay = -1 means wait for ever, if necessary
+        */
+       if (StandbyDelayLimitReached(*currentDelay_ms))
+               return true;
+
+       /*
+        * Sleep, then do bookkeeping.
+        */
+       pg_usleep(*standbyWait_ms * 1000L);
+       *currentDelay_ms += *standbyWait_ms;
+
+       /*
+        * Progressively increase the sleep times.
+        */
+       *standbyWait_ms *= 2;
+       if (*standbyWait_ms > 1000)
+               *standbyWait_ms = 1000;
+
+       /*
+        * Re-test our exit criteria
+        */
+       return StandbyDelayLimitReached(*currentDelay_ms);
+}
+
+/*
+ * This is the main executioner for any query backend that conflicts with
+ * recovery processing. Judgement has already been passed on it within
+ * a specific rmgr. Here we just issue the orders to the procs. The procs
+ * then throw the required error as instructed.
+ *
+ * We may ask for a specific cancel_mode, typically ERROR or FATAL.
+ *
+ * If we want an ERROR, we may defer that until the buffer manager
+ * sees a recently changed block. If we want this we must specify a 
+ * valid conflict_LSN.
+ *
+ * waitlist is walked entry by entry until an invalid VirtualTransactionId
+ * is found, so the caller must terminate it that way. reason is only used
+ * as text in the debug messages. cancel_mode and conflict_LSN are handed to
+ * the victim through ProcSetRecoveryConflict() before it is sent SIGINT.
+ *
+ * For each entry we first keep trying ConditionalVirtualXactLockTableWait(),
+ * sleeping in WaitExceedsMaxStandbyDelay() between attempts. Only once that
+ * reports the delay limit reached do we signal the backend, and we keep
+ * re-signalling it on every further pass until the conditional wait
+ * succeeds.
+ */
+void
+ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
+                                                                               char *reason, int cancel_mode, 
+                                                                               XLogRecPtr conflict_LSN)
+{
+       int                             standbyWait_ms;         /* length of the next sleep */
+       int                     currentDelay_ms;        /* delay accumulated so far */
+       bool                    logged;                         /* "blocking" message emitted for this entry? */
+       int                             wontDieWait = 1;        /* multiplier for the post-signal sleep */
+
+       InitStandbyDelayTimers(&currentDelay_ms, &standbyWait_ms);
+       logged = false;
+
+    while (VirtualTransactionIdIsValid(*waitlist))
+    {
+               /*
+                * log that we have been waiting for a while now...
+                *
+                * standbyWait_ms starts at STANDBY_INITIAL_WAIT_MS and doubles on
+                * each sleep, so it only exceeds 500 after several waits on this
+                * entry. The message is emitted at most once per entry.
+                */
+               if (!logged && standbyWait_ms > 500)
+               {
+                       elog(trace_recovery(DEBUG5),
+                                       "virtual transaction %u/%u is blocking %s",
+                                               waitlist->backendId,
+                                               waitlist->localTransactionId, 
+                                               reason);
+                       logged = true;
+               }
+
+               if (ConditionalVirtualXactLockTableWait(*waitlist))
+               {
+                       waitlist++;
+                       InitStandbyDelayTimers(&currentDelay_ms, &standbyWait_ms);
+                       logged = false;
+               }
+               else if (WaitExceedsMaxStandbyDelay(&currentDelay_ms,
+                                                                                        &standbyWait_ms))
+               {
+                       /*
+                        * Now find out who to throw out of the balloon.
+                        */
+                       PGPROC *proc;
+
+                       Assert(VirtualTransactionIdIsValid(*waitlist));
+                       proc = VirtualTransactionIdGetProc(*waitlist);
+
+                       /*
+                        * Kill the pid if it's still here. If not, that's what we wanted
+                        * so ignore any errors.
+                        */
+                       if (proc)
+                       {
+                               /*
+                                * Startup process debug messages
+                                */
+                               switch (cancel_mode)
+                               {
+                                       case FATAL:
+                                               elog(trace_recovery(DEBUG2), 
+                                                       "recovery disconnects session with pid %d "
+                                                       "because of conflict with %s (current delay %d secs)",
+                                                               proc->pid, 
+                                                               reason,
+                                                               currentDelay_ms / 1000);
+                                                       break;
+                                       case ERROR:
+                                                       if (XLogRecPtrIsValid(conflict_LSN))
+                                                               elog(trace_recovery(DEBUG2), 
+                                                                       "recovery signals virtual transaction %u/%u pid %d "
+                                                                       "for deferred cancelation with LSN %X/%X "
+                                                                       "because of conflict with %s (current delay %d secs)",
+                                                                               waitlist->backendId,
+                                                                               waitlist->localTransactionId,
+                                                                               proc->pid,
+                                                                               conflict_LSN.xlogid,
+                                                                               conflict_LSN.xrecoff,
+                                                                               reason,
+                                                                               currentDelay_ms / 1000);
+                                                       else
+                                                               elog(trace_recovery(DEBUG2), 
+                                                                       "recovery cancels virtual transaction %u/%u pid %d "
+                                                                       "because of conflict with %s (current delay %d secs)",
+                                                                               waitlist->backendId,
+                                                                               waitlist->localTransactionId, 
+                                                                               proc->pid,
+                                                                               reason,
+                                                                               currentDelay_ms / 1000);
+                                                       break;
+                                       default:
+                                                       /* Any other cancel_mode: no debug message, but we still signal below */
+                                                       break;
+                               }
+
+                               Assert(proc->pid != 0);
+
+                               /*
+                                * Issue orders for the proc to read next time it receives SIGINT
+                                */
+                               ProcSetRecoveryConflict(proc, conflict_LSN, cancel_mode);
+
+                               /*
+                                * Do we expect it to talk? No, Mr. Bond, we expect it to die.
+                                */
+                               kill(proc->pid, SIGINT);
+
+                               /*
+                                * Wait, if the instruction is expected to complete quickly
+                                *
+                                * With a valid conflict_LSN the cancel may be deferred, so we
+                                * do not sleep here in that case. Otherwise the sleep starts
+                                * at 5ms and doubles each time this backend has to be
+                                * signalled again.
+                                *
+                                * NOTE(review): wontDieWait is never capped or reset, so a
+                                * backend that ignores SIGINT makes these sleeps grow without
+                                * bound -- confirm that is intended.
+                                */
+                               if (!XLogRecPtrIsValid(conflict_LSN))
+                               {
+                                       /* wait awhile for it to die */
+                                       pg_usleep(wontDieWait * 5000L);
+                                       wontDieWait *= 2;
+                               }
+                       }
+               }
+    }
+}
+
+/*
+ * -----------------------------------------------------
+ * Locking in Recovery Mode
+ * -----------------------------------------------------
+ *
+ * All locks are held by the Startup process using a single virtual
+ * transaction. This implementation is both simpler and in some senses, 
+ * more correct. The locks held mean "some original transaction held 
+ * this lock, so query access is not allowed at this time". So the Startup
+ * process is the proxy by which the original locks are implemented.
+ *
+ * We only keep track of AccessExclusiveLocks, which are only ever held by
+ * one transaction on one relation, and don't worry about lock queuing.
+ * 
+ * We keep a single dynamically expandable locks list in local memory.
+ * List elements use type xl_rel_lock, since the WAL record type exactly
+ * matches the information that we need to keep track of.
+ *
+ * We use session locks rather than normal locks so we don't need 
+ * ResourceOwners.
+ */
+
+/*
+ * RelationAddRecoveryLock
+ *
+ * Called by relation_redo_lock(). Remembers the requested lock in
+ * RecoveryLockList, tagged with the originating xid so that
+ * RelationRemoveRecoveryLocks() can find it again, and then acquires the
+ * AccessExclusiveLock itself, resolving conflicts with existing lock
+ * holders until the lock is obtained.
+ */
+static void
+RelationAddRecoveryLock(xl_rel_lock *lockRequest)
+{
+       xl_rel_lock     *newlock;
+       LOCKTAG                 locktag;
+       MemoryContext   old_context;
+
+       /* dbOid and relOid are Oids, which are unsigned: print them with %u */
+       elog(trace_recovery(DEBUG4),
+                       "adding recovery lock: db %u rel %u",
+                               lockRequest->dbOid, lockRequest->relOid);
+
+       /*
+        * dbOid is InvalidOid when we are locking a shared relation.
+        */
+       Assert(OidIsValid(lockRequest->relOid));
+
+       /* Create the context holding the list entries on first use */
+       if (RelationLockContext == NULL)
+        RelationLockContext = AllocSetContextCreate(TopMemoryContext,
+                                                                                                               "RelationLocks",
+                                                                                                               ALLOCSET_DEFAULT_MINSIZE,
+                                                                                                               ALLOCSET_DEFAULT_INITSIZE,
+                                                                                                               ALLOCSET_DEFAULT_MAXSIZE);
+
+       old_context = MemoryContextSwitchTo(RelationLockContext);
+       newlock = palloc(sizeof(xl_rel_lock));
+       MemoryContextSwitchTo(old_context);
+
+       /* Keep our own copy; the WAL record does not outlive this call */
+       newlock->xid = lockRequest->xid;
+       newlock->dbOid = lockRequest->dbOid;
+       newlock->relOid = lockRequest->relOid;
+       RecoveryLockList = lappend(RecoveryLockList, newlock);
+
+       /*
+        * Attempt to acquire the lock as requested.
+        */
+       SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
+
+       /*
+        * Waiting for lock to clear or kill anyone in our way. Not a
+        * completely foolproof way of getting the lock, but we cannot
+        * afford to sit and wait for the lock indefinitely. This is
+        * one reason to reduce strengths of various locks in 8.4.
+        */
+       while (LockAcquire(&locktag, AccessExclusiveLock, true, true)
+                                                                                       == LOCKACQUIRE_NOT_AVAIL)
+       {
+               VirtualTransactionId *old_lockholders;
+
+               old_lockholders = GetLockConflicts(&locktag, AccessExclusiveLock);
+               ResolveRecoveryConflictWithVirtualXIDs(old_lockholders,
+                                                                                               "exclusive lock",
+                                                                                               ERROR,
+                                                                                               InvalidXLogRecPtr);
+       }
+}
+
+/*
+ * RelationRemoveRecoveryLocks
+ *
+ * Release every recovery lock recorded for the given xid and drop the
+ * matching entries from RecoveryLockList. Passing InvalidTransactionId
+ * matches every entry, i.e. releases all recorded locks.
+ */
+static void
+RelationRemoveRecoveryLocks(TransactionId xid)
+{
+       ListCell   *l;
+       LOCKTAG         locktag;
+       List            *deletionList = NIL;
+
+       /*
+        * Release all matching locks and identify list elements to remove
+        */
+       foreach(l, RecoveryLockList)
+       {
+               xl_rel_lock *lock = (xl_rel_lock *) lfirst(l);
+
+               if (!TransactionIdIsValid(xid) || lock->xid == xid)
+               {
+                       /* Trace only the locks that are actually being released */
+                       elog(trace_recovery(DEBUG4),
+                                       "releasing recovery lock: xid %u db %u rel %u",
+                                                       lock->xid, lock->dbOid, lock->relOid);
+
+                       SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
+                       if (!LockRelease(&locktag, AccessExclusiveLock, true))
+                               elog(trace_recovery(LOG),
+                                       "RecoveryLockList contains entry for lock "
+                                       "no longer recorded by lock manager "
+                                       "xid %u database %u relation %u",
+                                               lock->xid, lock->dbOid, lock->relOid);
+                       deletionList = lappend(deletionList, lock);
+               }
+       }
+
+       /*
+        * Now remove the elements from RecoveryLockList. We can't navigate
+        * the list at the same time as deleting multiple elements from it.
+        */
+       foreach(l, deletionList)
+       {
+               xl_rel_lock *lock = (xl_rel_lock *) lfirst(l);
+
+               RecoveryLockList = list_delete_ptr(RecoveryLockList, lock);
+               pfree(lock);
+       }
+
+       /* The scratch list's cells are no longer needed (no-op for NIL) */
+       list_free(deletionList);
+}
+
+/*
+ * Called during xact_redo_commit() and xact_redo_abort() when
+ * InArchiveRecovery to remove any AccessExclusiveLocks requested by a
+ * transaction.
+ *
+ * Removes the whole lock tree from the RecoveryLockList: first the locks
+ * recorded for xid itself, then those of each of the nsubxids entries in
+ * subxids[], in array order.
+ */
+void
+RelationReleaseRecoveryLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
+{
+       int             idx = 0;
+
+       /* top-level transaction first ... */
+       RelationRemoveRecoveryLocks(xid);
+
+       /* ... followed by every subtransaction */
+       while (idx < nsubxids)
+       {
+               RelationRemoveRecoveryLocks(subxids[idx]);
+               idx++;
+       }
+}
+
+/*
+ * Called at end of recovery and when we see a shutdown checkpoint.
+ *
+ * Drops every entry in RecoveryLockList: RelationRemoveRecoveryLocks()
+ * treats an invalid xid as matching all recorded locks.
+ */
+void
+RelationClearRecoveryLocks(void)
+{
+       const TransactionId every_xid = InvalidTransactionId;
+
+       elog(trace_recovery(DEBUG1), "clearing recovery locks");
+       RelationRemoveRecoveryLocks(every_xid);
+}
+
+/*
+ * --------------------------------------------------
+ *             Recovery handling for Rmgr RM_RELATION_ID
+ * --------------------------------------------------
+ */
+
+/*
+ * Redo for relation lock messages
+ *
+ * Thin wrapper: hands the lock request carried by the WAL record straight
+ * to RelationAddRecoveryLock().
+ */
+static void
+relation_redo_lock(xl_rel_lock *xlrec)
+{
+       RelationAddRecoveryLock(xlrec);
+}
+
+/*
+ * Redo for relation invalidation messages
+ *
+ * Forwards the xlrec->nmsgs messages stored in the record to the shared
+ * invalidation queue.
+ */
+static void
+relation_redo_inval(xl_rel_inval *xlrec)
+{
+       /* else we should not have written a record */
+       Assert(xlrec->nmsgs > 0);
+
+       /*
+        * Smack them straight onto the queue and we're done. This is safe
+        * because the only writer of these messages is non-transactional
+        * invalidation.
+        */
+       SendSharedInvalidMessages(xlrec->msgs, xlrec->nmsgs);
+}
+
+/*
+ * Redo entry point for relation records: after optionally recording the
+ * record's transaction ids, dispatch on the op code in xl_info to the
+ * invalidation or lock redo routine. An unrecognised op code is a PANIC.
+ */
+void
+relation_redo(XLogRecPtr lsn, XLogRecord *record)
+{
+       uint8           opcode = record->xl_info & ~XLR_INFO_MASK;
+
+       if (InArchiveRecovery)
+               (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
+       switch (opcode)
+       {
+               case XLOG_RELATION_INVAL:
+                       relation_redo_inval((xl_rel_inval *) XLogRecGetData(record));
+                       break;
+
+               case XLOG_RELATION_LOCK:
+                       relation_redo_lock((xl_rel_lock *) XLogRecGetData(record));
+                       break;
+
+               default:
+                       elog(PANIC, "relation_redo: unknown op code %u", opcode);
+                       break;
+       }
+}
+
+/*
+ * Append a human-readable summary of an invalidation record to buf: the
+ * message count, then one token per message. Every token ends with a
+ * space so that consecutive entries cannot run together, and message ids
+ * that are not recognised here are reported rather than silently skipped.
+ */
+static void
+relation_desc_inval(StringInfo buf, xl_rel_inval *xlrec)
+{
+       SharedInvalidationMessage *msgs = &(xlrec->msgs[0]);
+       int                                                     nmsgs = xlrec->nmsgs;
+       int                                                     i;
+
+       appendStringInfo(buf, "nmsgs %d;", nmsgs);
+
+       /* the loop simply does nothing when nmsgs <= 0 */
+       for (i = 0; i < nmsgs; i++)
+       {
+               SharedInvalidationMessage *msg = msgs + i;
+
+               if (msg->id >= 0)
+                       appendStringInfo(buf,  "catcache id %d ", msg->id);
+               else if (msg->id == SHAREDINVALRELCACHE_ID)
+                       appendStringInfo(buf,  "relcache ");
+               else if (msg->id == SHAREDINVALSMGR_ID)
+                       appendStringInfo(buf,  "smgr ");
+               else
+                       appendStringInfo(buf,  "unknown id %d ", msg->id);
+       }
+}
+
+/*
+ * Describe a relation WAL record by appending a short text summary to buf.
+ *
+ * xl_info carries the op code (after masking off the XLR_INFO_MASK bits)
+ * and rec points at the record data. Unrecognised op codes are described
+ * as "UNKNOWN".
+ */
+void
+relation_desc(StringInfo buf, uint8 xl_info, char *rec)
+{
+       uint8           info = xl_info & ~XLR_INFO_MASK;
+
+       if (info == XLOG_RELATION_INVAL)
+       {
+               xl_rel_inval *xlrec = (xl_rel_inval *) rec;
+
+               appendStringInfo(buf, "inval: ");
+               relation_desc_inval(buf, xlrec);
+       }
+       else if (info == XLOG_RELATION_LOCK)
+       {
+               xl_rel_lock *xlrec = (xl_rel_lock *) rec;
+
+               /* dbOid and relOid are Oids, which are unsigned: use %u */
+               appendStringInfo(buf, "exclusive relation lock: xid %u db %u rel %u",
+                                                               xlrec->xid, xlrec->dbOid, xlrec->relOid);
+       }
+       else
+               appendStringInfo(buf, "UNKNOWN");
+}
index a33c94ed67069e338e42b5daebb9eb39fdc5723b..67adc7afa6c6c843864bd35a4d8a775c27df18cc 100644 (file)
@@ -2579,3 +2579,20 @@ is_log_level_output(int elevel, int log_min_level)
 
        return false;
 }
+
+/*
+ * trace_recovery
+ *
+ * If trace_recovery_messages is set to make this visible, then show as LOG,
+ * else display as whatever level is set. It may still be shown, but only
+ * if log_min_messages is set lower than trace_recovery_messages.
+ *
+ * Only levels below LOG are ever promoted. A level of LOG or above is
+ * returned unchanged, so passing e.g. WARNING or ERROR through this
+ * function can never downgrade it to LOG.
+ *
+ * Intention is to keep this for at least the whole of the 8.4 production
+ * release, so we can more easily diagnose production problems in the field.
+ */
+int
+trace_recovery(int trace_level)
+{
+       if (trace_level < LOG &&
+               trace_level >= trace_recovery_messages)
+               return LOG;
+
+       return trace_level;
+}
index 9dbc53c159d4883d902b30dd7abe2442647cf8eb..404a8f753c50707554bedb6350aaca10b9c2528c 100644 (file)
@@ -678,9 +678,10 @@ write_auth_file(Relation rel_authid, Relation rel_authmem)
 /*
  * This routine is called once during database startup, after completing
  * WAL replay if needed.  Its purpose is to sync the flat files with the
- * current state of the database tables.  This is particularly important
- * during PITR operation, since the flat files will come from the
- * base backup which may be far out of sync with the current state.
+ * current state of the database tables.  
+ *
+ * In 8.4 we also run this during xact_redo_commit() if the transaction
+ * wrote a new database or auth flat file. 
  *
  * In theory we could skip rebuilding the flat files if no WAL replay
  * occurred, but it seems best to just do it always.  We have to
@@ -716,8 +717,6 @@ BuildFlatFiles(bool database_only)
        /*
         * We don't have any hope of running a real relcache, but we can use the
         * same fake-relcache facility that WAL replay uses.
-        *
-        * No locking is needed because no one else is alive yet.
         */
        rel_db = CreateFakeRelcacheEntry(rnode);
        write_database_file(rel_db, true);
@@ -832,14 +831,14 @@ AtEOXact_UpdateFlatFiles(bool isCommit)
        /* Okay to write the files */
        if (database_file_update_subid != InvalidSubTransactionId)
        {
-               database_file_update_subid = InvalidSubTransactionId;
+               /* reset database_file_update_subid later during commit */
                write_database_file(drel, false);
                heap_close(drel, NoLock);
        }
 
        if (auth_file_update_subid != InvalidSubTransactionId)
        {
-               auth_file_update_subid = InvalidSubTransactionId;
+               /* reset auth_file_update_subid later during commit */
                write_auth_file(arel, mrel);
                heap_close(arel, NoLock);
                heap_close(mrel, NoLock);
@@ -859,6 +858,30 @@ AtEOXact_UpdateFlatFiles(bool isCommit)
        ForceSyncCommit();
 }
 
+/*
+ * Exported to allow transaction commit to set flags to perform flat file
+ * update in redo. Reset per-transaction flags. For abort case they were
+ * already set during AtEOXact_UpdateFlatFiles().
+ *
+ * Returns true if the database flat file was marked for update in this
+ * transaction, and clears that mark as a side effect, so a second call
+ * returns false.
+ */
+bool
+AtEOXact_Database_FlatFile_Update_Needed(void)
+{
+       /*
+        * The flag holds a SubTransactionId, so compare it with its own
+        * invalid value rather than using the TransactionId test macro.
+        */
+       bool result = (database_file_update_subid != InvalidSubTransactionId);
+
+       database_file_update_subid = InvalidSubTransactionId;
+
+       return result;
+}
+
+/*
+ * Returns true if the auth flat file was marked for update in this
+ * transaction, and clears that mark as a side effect, so a second call
+ * returns false.
+ */
+bool
+AtEOXact_Auth_FlatFile_Update_Needed(void)
+{
+       /*
+        * The flag holds a SubTransactionId, so compare it with its own
+        * invalid value rather than using the TransactionId test macro.
+        */
+       bool result = (auth_file_update_subid != InvalidSubTransactionId);
+
+       auth_file_update_subid = InvalidSubTransactionId;
+
+       return result;
+}
 
 /*
  * This routine is called during transaction prepare.
index cf98323d2a25732ceca3e187b7e1b24211d054dc..d39180bf6988a365cd97929c9fe7ebb2a7d7be8b 100644 (file)
@@ -440,7 +440,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
         */
        MyBackendId = InvalidBackendId;
 
-       SharedInvalBackendInit();
+       SharedInvalBackendInit(false);
 
        if (MyBackendId > MaxBackends || MyBackendId <= 0)
                elog(FATAL, "bad backend id: %d", MyBackendId);
@@ -489,9 +489,15 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
         * Start a new transaction here before first access to db, and get a
         * snapshot.  We don't have a use for the snapshot itself, but we're
         * interested in the secondary effect that it sets RecentGlobalXmin.
+        * If we are connecting during recovery, make sure the initial
+        * transaction is read only and force all subsequent transactions
+        * that way also.
         */
        if (!bootstrap)
        {
+               if (IsRecoveryProcessingMode())
+                       SetConfigOption("default_transaction_read_only", "true",
+                               PGC_POSTMASTER, PGC_S_OVERRIDE);
                StartTransactionCommand();
                (void) GetTransactionSnapshot();
        }
@@ -515,7 +521,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
         */
        if (!bootstrap)
                LockSharedObject(DatabaseRelationId, MyDatabaseId, 0,
-                                                RowExclusiveLock);
+                               (IsRecoveryProcessingMode() ? AccessShareLock : RowExclusiveLock));
 
        /*
         * Recheck the flat file copy of pg_database to make sure the target
index 90f077a3700d764bd6988031fb4c5e7911c00e05..bd44494062e3094873bbf2b39d83015149891055 100644 (file)
@@ -115,6 +115,8 @@ extern char *temp_tablespaces;
 extern bool synchronize_seqscans;
 extern bool fullPageWrites;
 
+int    trace_recovery_messages = DEBUG1; /* XXXHS set to LOG for production */
+
 #ifdef TRACE_SORT
 extern bool trace_sort;
 #endif
@@ -2634,6 +2636,16 @@ static struct config_enum ConfigureNamesEnum[] =
                assign_session_replication_role, NULL
        },
 
+       {
+               {"trace_recovery_messages", PGC_SUSET, LOGGING_WHEN,
+                       gettext_noop("Sets the message levels that are logged during recovery."),
+                       gettext_noop("Each level includes all the levels that follow it. The later"
+                                                " the level, the fewer messages are sent.")
+               },
+               &trace_recovery_messages,
+               DEBUG1, server_message_level_options, NULL, NULL
+       },
+
        {
                {"track_functions", PGC_SUSET, STATS_COLLECTOR,
                        gettext_noop("Collects function-level statistics on database activity."),
@@ -5501,8 +5513,19 @@ ExecSetVariableStmt(VariableSetStmt *stmt)
                                                SetPGVariable("transaction_isolation",
                                                                          list_make1(item->arg), stmt->is_local);
                                        else if (strcmp(item->defname, "transaction_read_only") == 0)
+                                       {
+                                               A_Const    *con;
+
+                                               Assert(IsA(item->arg, A_Const));
+                                               con = (A_Const *) item->arg;
+                                               Assert(nodeTag(&con->val) == T_Integer);
+
+                                               if (!intVal(&con->val))
+                                                       PreventCommandDuringRecovery();
+
                                                SetPGVariable("transaction_read_only",
                                                                          list_make1(item->arg), stmt->is_local);
+                                       }
                                        else
                                                elog(ERROR, "unexpected SET TRANSACTION element: %s",
                                                         item->defname);
@@ -5520,8 +5543,19 @@ ExecSetVariableStmt(VariableSetStmt *stmt)
                                                SetPGVariable("default_transaction_isolation",
                                                                          list_make1(item->arg), stmt->is_local);
                                        else if (strcmp(item->defname, "transaction_read_only") == 0)
+                                       {
+                                               A_Const    *con;
+
+                                               Assert(IsA(item->arg, A_Const));
+                                               con = (A_Const *) item->arg;
+                                               Assert(nodeTag(&con->val) == T_Integer);
+
+                                               if (!intVal(&con->val))
+                                                       PreventCommandDuringRecovery();
+                                               
                                                SetPGVariable("default_transaction_read_only",
                                                                          list_make1(item->arg), stmt->is_local);
+                                       }
                                        else
                                                elog(ERROR, "unexpected SET SESSION element: %s",
                                                         item->defname);
index 9992895941c93505ded4c5c04c4802403e7a90f8..f6e043399b37b83f91cd5f385627f1432d48064e 100644 (file)
@@ -27,6 +27,7 @@
 
 #include "access/transam.h"
 #include "access/xact.h"
+#include "storage/bufmgr.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
 #include "utils/memutils.h"
@@ -433,7 +434,11 @@ static void
 SnapshotResetXmin(void)
 {
        if (RegisteredSnapshots == 0 && ActiveSnapshot == NULL)
+       {
                MyProc->xmin = InvalidTransactionId;
+               if (IsRecoveryProcessingMode())
+                       SetBufferRecoveryConflictLSN(InvalidXLogRecPtr);
+       }
 }
 
 /*
index dbfbb023aea3449ce896b90e8d56a891095e1a49..aa60d8c116894314d16d54486b9f5c77447d34fb 100644 (file)
@@ -86,7 +86,7 @@ static inline void
 SetHintBits(HeapTupleHeader tuple, Buffer buffer,
                        uint16 infomask, TransactionId xid)
 {
-       if (TransactionIdIsValid(xid))
+       if (!IsRecoveryProcessingMode() && TransactionIdIsValid(xid))
        {
                /* NB: xid must be known committed here! */
                XLogRecPtr      commitLSN = TransactionIdGetCommitLSN(xid);
@@ -1238,26 +1238,52 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
                return true;
 
        /*
-        * If the snapshot contains full subxact data, the fastest way to check
-        * things is just to compare the given XID against both subxact XIDs and
-        * top-level XIDs.      If the snapshot overflowed, we have to use pg_subtrans
-        * to convert a subxact XID to its parent XID, but then we need only look
-        * at top-level XIDs not subxacts.
+        * Our strategy for checking xids changed in 8.4. Prior to 8.4
+        * we either checked the subxid cache on the snapshot or we 
+        * checked subtrans. That was much more efficient than just using
+        * subtrans but it has some problems. First, as soon as *any*
+        * transaction had more than 64 subtransactions we forced *all*
+        * snapshots to check against subtrans, giving a sharp modal
+        * change in behaviour. Second because we either checked subtrans
+        * or the snapshot, we were forced to place entries in subtrans
+        * in case the snapshot later overflowed, even if we never
+        * actually checked subtrans.
+        *
+        * In 8.4 we improve on that scheme in a number of ways. As before
+        * we check subtrans if the snapshot has overflowed. We *also*
+        * check the subxid cache. This has two benefits: first the 
+        * behaviour degrades gracefully when the cache overflows, so we
+        * retain much of its benefit if it has only just overflowed.
+        * Second, a transaction doesn't need to insert entries into
+        * subtrans until its own personal subxid cache overflows. This
+        * means entries into subtrans become significantly rarer, 
+        * perhaps less than 1% of the previous insert rate, giving
+        * considerable benefit for transactions using only a few
+        * subtransactions.
+        *
+        * This behaviour is also necessary for allowing snapshots to work
+        * correctly on a standby server. By this subtle change of behaviour
+        * we can now utilise the subxid cache to store "unobserved xids"
+        * of which we can infer their existence from watching the 
+        * arrival sequence of newly observed transactionids in the WAL.
         */
-       if (snapshot->subxcnt >= 0)
-       {
-               /* full data, so search subxip */
-               int32           j;
 
-               for (j = 0; j < snapshot->subxcnt; j++)
-               {
-                       if (TransactionIdEquals(xid, snapshot->subxip[j]))
+       /*
+        * First, compare the given XID against cached subxact XIDs.
+        */
+       for (i = 0; i < snapshot->subxcnt; i++)
+       {
+               if (TransactionIdEquals(xid, snapshot->subxip[i]))
                                return true;
                }
 
-               /* not there, fall through to search xip[] */
-       }
-       else
+       /*
+        * If the snapshot overflowed and we haven't already located the xid
+        * we also have to consult pg_subtrans. We use subtrans to convert a 
+        * subxact XID to its parent XID, so that we can then check the status
+        * of the top-level TransactionId.
+        */
+       if (snapshot->suboverflowed)
        {
                /* overflowed, so convert xid to top-level */
                xid = SubTransGetTopmostTransaction(xid);
@@ -1270,6 +1296,10 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
                        return false;
        }
 
+       /*
+        * By now xid is either not present, or a top-level xid. So now
+        * we just need to check the main transaction ids.
+        */
        for (i = 0; i < snapshot->xcnt; i++)
        {
                if (TransactionIdEquals(xid, snapshot->xip[i]))
index a5d97697944a7d86ad438765e6544d7e0828e54a..e3f94edcea4735ea8792243aa877037788f5322e 100644 (file)
@@ -130,11 +130,13 @@ extern void heap2_desc(StringInfo buf, uint8 xl_info, char *rec);
 extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf,
                          ItemPointerData from,
                          Buffer newbuf, HeapTuple newtup);
+extern XLogRecPtr log_heap_cleanup_info(RelFileNode rnode, 
+                         TransactionId latestRemovedXid);
 extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
                           OffsetNumber *redirected, int nredirected,
                           OffsetNumber *nowdead, int ndead,
                           OffsetNumber *nowunused, int nunused,
-                          bool redirect_move);
+                          TransactionId latestRemovedXid, bool redirect_move);
 extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
                                TransactionId cutoff_xid,
                                OffsetNumber *offsets, int offcnt);
index 54264bdca4811d7e30eaaf06c1f596be744c57b9..96fb89d088b4046c2095e5c052472962a09094e8 100644 (file)
@@ -580,6 +580,7 @@ typedef HeapTupleData *HeapTuple;
 #define XLOG_HEAP2_FREEZE              0x00
 #define XLOG_HEAP2_CLEAN               0x10
 #define XLOG_HEAP2_CLEAN_MOVE  0x20
+#define XLOG_HEAP2_CLEANUP_INFO 0x30
 
 /*
  * All what we need to find changed tuple
@@ -668,6 +669,7 @@ typedef struct xl_heap_clean
 {
        RelFileNode node;
        BlockNumber block;
+       TransactionId   latestRemovedXid;
        uint16          nredirected;
        uint16          ndead;
        /* OFFSET NUMBERS FOLLOW */
@@ -675,6 +677,19 @@ typedef struct xl_heap_clean
 
 #define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16))
 
+/*
+ * Cleanup_info is required in some cases during a lazy VACUUM.
+ * Used for reporting the results of HeapTupleHeaderAdvanceLatestRemovedXid()
+ * see vacuumlazy.c for full explanation
+ *
+ * NOTE(review): presumably the payload of XLOG_HEAP2_CLEANUP_INFO records
+ * written by log_heap_cleanup_info() -- confirm against heapam.c.
+ */
+typedef struct xl_heap_cleanup_info
+{
+       RelFileNode     node;                           /* relation concerned (assumed; confirm) */
+       TransactionId   latestRemovedXid;       /* see HeapTupleHeaderAdvanceLatestRemovedXid() */
+} xl_heap_cleanup_info;
+
+/* Size of the whole struct; it has no variable-length tail */
+#define SizeOfHeapCleanupInfo (sizeof(xl_heap_cleanup_info))
+
+
 /* This is for replacing a page's contents in toto */
 /* NB: this is used for indexes as well as heaps */
 typedef struct xl_heap_newpage
@@ -718,6 +733,9 @@ typedef struct xl_heap_freeze
 
 #define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId))
 
+extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, 
+                                                                               TransactionId *latestRemovedXid);
+
 /* HeapTupleHeader functions implemented in utils/time/combocid.c */
 extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup);
 extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup);
index 2df34f54ee3ca72057bf95488f591913d9d54298..8028fce3568201c970aad229c7f6248527441b30 100644 (file)
@@ -214,12 +214,13 @@ typedef struct BTMetaPageData
 #define XLOG_BTREE_SPLIT_R             0x40    /* as above, new item on right */
 #define XLOG_BTREE_SPLIT_L_ROOT 0x50   /* add tuple with split of root */
 #define XLOG_BTREE_SPLI