Hot Standby patch 0.2.1 as submitted Sept 15

author Simon Riggs <[email protected]>

Fri, 25 Sep 2009 11:30:39 +0000 (12:30 +0100)

committer Simon Riggs <[email protected]>

Fri, 25 Sep 2009 11:30:39 +0000 (12:30 +0100)
author Simon Riggs <[email protected]>
Fri, 25 Sep 2009 11:30:39 +0000 (12:30 +0100)
committer Simon Riggs <[email protected]>
Fri, 25 Sep 2009 11:30:39 +0000 (12:30 +0100)
diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml

index b633e046350e2200067b741b1049f885c38f89b3..91917cf121aa753350bef2995dda7391cb677364 100644 (file)
--- a/doc/src/sgml/backup.sgml
+++ b/doc/src/sgml/backup.sgml
@@ -1883,6 +1883,688 @@ if (!triggered)
    </sect2>
   </sect1>
  
+ <sect1 id="hot-standby">
+  <title>Hot Standby</title>
+
+   <para>
+       Hot Standby is the term used to describe the ability to connect to
+       the server and run queries while the server is in archive recovery. This
+       is useful for both log shipping replication and for restoring a backup
+       to an exact state with great precision.
+       The term Hot Standby also refers to the ability of the server to move
+       from recovery through to normal running while users continue running
+       queries and/or continue their connections.
+   </para>
+
+   <para>
+       Running queries in recovery is in many ways the same as normal running
+       though there are a large number of usage and administrative points
+       to note.
+   </para>
+
+  <sect2 id="hot-standby-users">
+   <title>User's Overview</title>
+
+   <para>
+       Users can connect to the database while the server is in recovery
+       and perform read-only queries. Read-only access to catalogs and views
+       will also occur as normal.
+   </para>
+
+   <para>
+       The data on the standby takes some time to arrive from the primary server
+       so there will be a measurable delay between primary and standby.
+       Queries executed on the standby will be correct as of the data that had
+       been recovered at the start of the query (or start of first statement,
+       in the case of Serializable transactions). Running the same query nearly
+       simultaneously on both primary and standby might therefore return 
+       differing results.      We say that data on the standby is eventually
+       consistent with the primary.
+   </para>
+
+   <para>
+       When a connection is made in recovery, the parameter 
+       default_transaction_read_only will be forced to be true, whatever its
+       setting in postgresql.conf. As a result, all transactions started during
+       this time will be limited to read-only actions only. In all other ways,
+       connected sessions will appear identical to sessions initiated during
+       normal processing mode. There are no special commands required to
+       initiate a connection at this time, so all interfaces will work
+       normally without change.
+   </para>
+
+   <para>
+       Read-only here means "no writes to the permanent database tables". So
+       there are no problems with queries that make use of temporary sort and
+       work files.  Temporary tables cannot be created and
+       therefore cannot be used at all in recovery mode.
+   </para>
+
+   <para>
+       The following actions are allowed
+
+       <itemizedlist>
+        <listitem>
+         <para>
+       Query access - SELECT, COPY TO including views and SELECT RULEs
+      </para>
+     </listitem>
+        <listitem>
+         <para>
+       Cursor commands - DECLARE, FETCH, CLOSE
+      </para>
+     </listitem>
+        <listitem>
+         <para>
+       Parameters - SHOW, SET, RESET
+      </para>
+     </listitem>
+        <listitem>
+         <para>
+       Transaction management commands
+               <itemizedlist>
+                <listitem>
+                 <para>
+                  BEGIN, END, ABORT, START TRANSACTION
+             </para>
+            </listitem>
+                <listitem>
+                 <para>
+              SAVEPOINT, RELEASE, ROLLBACK TO SAVEPOINT
+             </para>
+            </listitem>
+                <listitem>
+                 <para>
+              EXCEPTION blocks and other internal subtransactions
+             </para>
+            </listitem>
+               </itemizedlist>
+      </para>
+     </listitem>
+        <listitem>
+         <para>
+       LOCK, with restrictions, see later
+      </para>
+     </listitem>
+        <listitem>
+         <para>
+       Plans and resources - PREPARE, EXECUTE, DEALLOCATE, DISCARD
+      </para>
+     </listitem>
+        <listitem>
+         <para>
+       Plugins and extensions - LOAD
+      </para>
+     </listitem>
+    </itemizedlist>
+   </para>
+
+   <para>
+       These actions will produce error messages
+
+       <itemizedlist>
+        <listitem>
+         <para>
+       DML - Insert, Update, Delete, COPY FROM, Truncate which all write data. 
+          Any RULE which generates DML will throw error messages as a result.
+          Note that there is no action possible that can result in a trigger
+          being executed.
+      </para>
+     </listitem>
+        <listitem>
+         <para>
+       DDL - Create, Drop, Alter, Comment (even for temporary tables because
+          currently these cause writes to catalog tables)
+      </para>
+     </listitem>
+        <listitem>
+         <para>
+       SELECT ... FOR SHARE | UPDATE which cause row locks to be written
+      </para>
+     </listitem>
+        <listitem>
+         <para>
+       Transaction management commands that explicitly set non-read only state
+               <itemizedlist>
+                <listitem>
+                 <para>
+                       BEGIN READ WRITE,
+                       START TRANSACTION READ WRITE
+             </para>
+            </listitem>
+                <listitem>
+                 <para>
+                       SET TRANSACTION READ WRITE,
+                       SET SESSION CHARACTERISTICS AS TRANSACTION READ WRITE
+             </para>
+            </listitem>
+                <listitem>
+                 <para>
+              SET transaction_read_only = off; or
+                  SET default_transaction_read_only = off;
+             </para>
+            </listitem>
+               </itemizedlist>
+      </para>
+     </listitem>
+        <listitem>
+         <para>
+       Two-phase commit commands - PREPARE TRANSACTION, COMMIT PREPARED,
+          ROLLBACK PREPARED because even read-only transactions need to write
+          WAL in the prepare phase (the first phase of two phase commit).
+      </para>
+     </listitem>
+        <listitem>
+         <para>
+       sequence update - nextval()
+      </para>
+     </listitem>
+        <listitem>
+         <para>
+          LISTEN, UNLISTEN, NOTIFY since they currently write to system tables
+      </para>
+     </listitem>
+    </itemizedlist>
+   </para>
+    
+   <para>
+       Note that current behaviour of read only transactions when not in
+       recovery is to allow the last two actions, so there is a small and
+       subtle difference in behaviour between standby read-only transactions
+       and read only transactions during normal running.
+       It is possible that the restrictions on LISTEN, UNLISTEN, NOTIFY and
+       temporary tables may be lifted in a future release, if their internal
+       implementation is altered to make this possible.
+   </para>
+
+   <para>
+       If failover or switchover occurs the database will switch to normal
+       processing mode. Sessions will remain connected while the server
+       changes mode. Current transactions will continue, though will remain
+       read-only. After this, it will be possible to initiate read-write
+       transactions, though users must *manually* reset their 
+       default_transaction_read_only setting first, if they want that
+       behaviour.
+   </para>
+
+   <para>
+       Users will be able to tell whether their session is read-only by
+       issuing SHOW default_transaction_read_only.  In addition a set of
+       functions <xref linkend="functions-recovery-info-table"> allow users to
+       access information about Hot Standby. These allow you to write
+       functions that are aware of the current state of the database. These
+       can be used to monitor the progress of recovery, or to allow you to
+       write complex programs that restore the database to particular states.
+   </para>
+
+   <para>
+       In recovery, transactions will not be permitted to take any lock
+       other than AccessShareLock or AccessExclusiveLock. In addition,
+       transactions may never assign a TransactionId and may never write WAL.
+       The LOCK TABLE command by default applies an AccessExclusiveLock. 
+       Any LOCK TABLE command that runs on the standby and requests a specific
+       lock type other than AccessShareLock will be rejected.
+   </para>
+
+   <para>
+       During recovery database changes are applied using full MVCC rules.
+       In general this means that queries will not experience lock conflicts
+       with writes, just like normal Postgres concurrency control (MVCC). 
+   </para>
+  </sect2>
+
+  <sect2 id="hot-standby-conflict">
+   <title>Handling query conflicts</title>
+
+   <para>
+       There is some potential for conflict between standby queries
+       and WAL redo from the primary node. The user is provided with a number
+       of optional ways to handle these conflicts, though we must first 
+       understand the possible reasons behind a conflict.
+
+         <itemizedlist>
+          <listitem>
+           <para>
+                Access Exclusive Locks from primary node, including both explicit
+                LOCK commands and various kinds of DDL action
+           </para>
+          </listitem>
+          <listitem>
+           <para>
+                Early cleanup of data still visible to the current query's snapshot
+           </para>
+          </listitem>
+          <listitem>
+           <para>
+        Dropping tablespaces on the primary while standby queries are using
+                those tablespaces for temporary work files (work_mem overflow)
+           </para>
+          </listitem>
+          <listitem>
+           <para>
+        Dropping databases on the primary while users are connected to that database on the standby.
+           </para>
+          </listitem>
+          <listitem>
+           <para>
+        Waiting to acquire buffer cleanup locks (for which there is no time out) 
+           </para>
+          </listitem>
+         </itemizedlist>
+   </para>
+
+   <para>
+       Some WAL redo actions will be for DDL actions. These DDL actions are
+       repeating actions that have already committed on the primary node, so 
+       they must not fail on the standby node. These DDL locks take priority 
+       and will automatically *cancel* any read-only transactions that get in 
+       their way, after a grace period. This is similar to the possibility of
+       being canceled by the deadlock detector, but in this case the standby
+       process always wins, since the replayed actions must not fail. This
+       also ensures that replication doesn't fall behind while we wait for a
+       query to complete. Again, we assume that the standby is there for high
+       availability purposes primarily.
+   </para>
+
+   <para>
+       An example of the above would be an Administrator on the primary server
+       running a DROP TABLE command that refers to a table currently in use by
+       a User query on the standby server.
+   </para>
+
+   <para>
+       Clearly the query cannot continue if we let the DROP TABLE proceed. If
+       this situation occurred on the primary, the DROP TABLE would wait until
+       the query has finished. When the query is on the standby and the 
+       DROP TABLE is on the primary, the primary doesn't have information about
+       what the standby is running and so the DROP TABLE does not wait on the primary. The
+       WAL change records come through to the standby while the query is still
+       running, causing a conflict.
+   </para>
+
+   <para>
+       The second reason for conflict between standby queries and WAL redo is 
+       "early cleanup". Normally, PostgreSQL allows cleanup of old row versions
+       when there are no users who may need to see them to ensure correct
+       visibility of data (known as MVCC). If there is a standby query that has
+       been running for longer than any query on the primary then it is possible
+       for old row versions to be removed by either VACUUM or HOT. This will 
+       then generate WAL records that, if applied, would remove data on the 
+       standby that might *potentially* be required by the standby query. 
+       In more technical language, the Primary's xmin horizon is later than 
+       the Standby's xmin horizon, allowing dead rows to be removed.
+   </para>
+
+   <para>
+       We have a number of choices for resolving query conflicts.  The default
+       is that we wait and hope the query completes. If the recovery is not paused,
+       then the server will wait automatically until the lag between
+       primary and standby is at most max_standby_delay seconds. Once that grace
+       period expires, we then take one of the following actions:
+
+         <itemizedlist>
+          <listitem>
+           <para>
+                If the conflict is caused by a lock, we cancel the standby transaction
+                immediately, even if it is idle-in-transaction.
+           </para>
+          </listitem>
+          <listitem>
+           <para>
+                If the conflict is caused by cleanup records we tell the standby query
+                that a conflict has occurred and that it must cancel itself to avoid the
+                risk that it silently fails to read relevant data because
+                that data has been removed. (This is very similar to the much feared
+                error message "snapshot too old").
+           </para>
+
+           <para>
+                Note also that this means that idle-in-transaction sessions are never
+                canceled except by locks. Users should be clear that tables that are
+                regularly and heavily updated on primary server will quickly cause
+                cancellation of any longer running queries made against those tables.
+           </para>
+
+           <para>
+                If cancellation does occur, the query and/or transaction can always
+                be re-executed. The error is dynamic and will not necessarily occur
+                the same way if the query is executed again.
+           </para>
+          </listitem>
+         </itemizedlist>
+   </para>
+
+   <para>
+       Other remedial actions exist if the number of cancellations is unacceptable.
+       The first option is to connect to primary server and keep a query active
+       for as long as we need to run queries on the standby. This guarantees that
+       a WAL cleanup record is never generated and we don't ever get query
+       conflicts as described above. This could be done using contrib/dblink
+       and pg_sleep(), or via other mechanisms.
+   </para>
+
+   <para>
+       A second option is to pause recovery using recovery control functions. 
+       These can pause WAL apply completely and allow queries to proceed to
+       completion. We can issue pg_recovery_continue() at any time, so the pause
+       can be held for long or short periods, as the administrator allows. This
+       method of conflict resolution may mean that there is a build up of WAL
+       records waiting to be applied and this will progressively increase the
+       failover delay. If there is regular arrival of WAL records this would
+       quickly prevent the use of the standby as a high availability failover
+       target. Some users may wish to use multiple standby servers for various
+       purposes. Pauses in recovery stay until explicitly released, so that
+       pauses override the setting of max_standby_delay.
+   </para>
+
+   <para>
+       Note that max_standby_delay is set in recovery.conf. It applies to the
+       server as a whole, so once used it may not be available for other users.
+       They will have to wait for the server to catch up again before the grace
+       period is available again. So max_standby_delay is a configuration
+       parameter set by the administrator which controls the maximum acceptable
+       failover delay and is not a user-settable parameter to specify how long
+       their query needs to run in.
+   </para>
+
+   <para>
+       Waits for buffer cleanup locks do not currently result in query
+       cancelation. Long waits are uncommon, though can happen in some cases
+       with long running nested loop joins.
+   </para>
+
+   <para>
+       Dropping tablespaces or databases is discussed in the administrator's
+       section since they are not typical user situations.
+   </para>
+  </sect2>
+
+  <sect2 id="hot-standby-admin">
+   <title>Administrator's Overview</title>
+
+   <para>
+       If there is a recovery.conf file present then the server will start in Hot Standby
+       mode by default, though this can be disabled by setting
+       "recovery_connections = off" in recovery.conf. The server may take some
+       time to enable recovery connections since the server must first complete
+       sufficient recovery to provide a consistent state against which queries
+       can run before enabling read only connections. Look for these messages
+       in the server logs
+
+<programlisting>
+LOG:  consistent recovery state reached
+LOG:  database system is ready to accept read only connections
+</programlisting>
+       
+       If you are running file-based log shipping ("warm standby"), you may need
+       to wait until the next WAL file arrives, which could be as long as the
+       archive_timeout setting on the primary. This is because consistency
+       information is recorded once per checkpoint on the primary. The
+       consistent state can also be delayed in the presence of both transactions
+       that contain large numbers of subtransactions and long-lived transactions.
+   </para>
+
+   <para>
+       The setting of max_connections on the standby should be equal to or
+       greater than the setting of max_connections on the primary. This is to
+       ensure that standby has sufficient resources to manage incoming
+       transactions.
+   </para>
+
+   <para>
+       It is important that the administrator consider the appropriate setting 
+       of "max_standby_delay", set in recovery.conf.  The default is 60 seconds, 
+       though there is no optimal setting and it should be set according to 
+       business priorities. For example if the server is primarily tasked as a 
+       High Availability server, then you may wish to lower max_standby_delay 
+       or even set it to zero. If the standby server is tasked as an additional
+       server for decision support queries then it may be acceptable to set this
+       to a value of many hours, e.g. max_standby_delay = 43200 (12 hours). It
+       is also possible to set max_standby_delay to -1 which means "always wait"
+       if there are conflicts, which will be useful when performing an archive
+       recovery from a backup.
+   </para>
+
+   <para>
+       A set of functions that allow superusers to control the flow of recovery
+       is described in <xref linkend="functions-recovery-control-table">.
+       These functions allow you to pause and continue recovery, as well
+       as dynamically set new recovery targets while recovery progresses.
+       Note that when a server is paused the apparent delay between primary
+       and standby will continue to increase.
+   </para>
+
+   <para>
+       Transaction status "hint bits" written on primary are not WAL-logged,
+       so the hints will likely be re-written again on the standby.
+       Thus the main database blocks will produce write I/Os even though
+       all users are read-only; no changes have occurred to the data values
+       themselves.  Users will be able to write large sort temp files and
+       re-generate relcache info files, so there is no part of the database
+       that is truly read-only during hot standby mode. There is no restriction
+       on use of set returning functions, or other users of tuplestore/tuplesort
+       code. Note also that writes to remote databases will still be possible,
+       even though the transaction is read-only locally.
+   </para>
+
+   <para>
+       Failover can be initiated at any time by allowing the startup process to
+       reach the end of WAL, or by issuing the function pg_recovery_stop()
+       as superuser.
+   </para>
+
+   <para>
+       The following types of administrator command will not be accepted
+       during recovery mode
+
+         <itemizedlist>
+          <listitem>
+           <para>
+            Data Definition Language (DDL) - e.g. CREATE INDEX 
+           </para>
+          </listitem>
+          <listitem>
+           <para>
+            Privilege and Ownership - GRANT, REVOKE, REASSIGN
+           </para>
+          </listitem>
+          <listitem>
+           <para>
+            Maintenance commands - ANALYZE, VACUUM, CLUSTER, REINDEX
+           </para>
+          </listitem>
+         </itemizedlist>
+   </para>
+
+   <para>
+       Note again that some of these commands are actually allowed during 
+       "read only" mode transactions on the primary. 
+   </para>
+
+   <para>
+       As a result, you cannot create additional indexes that exist solely 
+       on the standby, nor can you create statistics that exist solely on the standby.
+   </para>
+
+   <para>
+       pg_cancel_backend() will work on user backends, but not the Startup
+       process, which performs recovery. pg_locks will show locks held by
+       backends as normal. pg_locks also shows a virtual transaction
+       managed by the Startup process that owns all AccessExclusiveLocks held
+       by transactions being replayed by recovery. pg_stat_activity does not
+       show an entry for the Startup process, nor do recovering transactions
+       show as active.
+   </para>
+
+   <para>
+       check_pgsql will work, but it is very simple. check_postgres will also
+       work, though some actions could give different or confusing results.
+       For example, last vacuum time will not be maintained, since no
+       vacuum occurs on the standby (though vacuums running on the primary do
+       send their changes to the standby).
+   </para>
+
+   <para>
+       WAL file control commands will not work during recovery 
+       e.g. pg_start_backup(), pg_switch_xlog() etc.
+   </para>
+
+   <para>
+       Dynamically loadable modules work, including pg_stat_statements.
+   </para>
+
+   <para>
+       Advisory locks work normally in recovery, including deadlock detection. 
+       Note that advisory locks are never WAL logged, so it is not possible for
+       an advisory lock on either the primary or the standby to conflict with WAL
+       replay. Nor is it possible to acquire an advisory lock on the primary
+       and have it initiate a similar advisory lock on the standby. Advisory
+       locks relate only to a single server on which they are acquired.
+   </para>
+
+   <para>
+       Trigger-based replication systems (Slony, Londiste, Bucardo etc) won't 
+       run on the standby at all, though they will run happily on the primary
+       server. WAL replay is not trigger-based so you cannot relay from the
+       standby to any system that requires additional database writes or
+       relies on the use of triggers. 
+   </para>
+
+   <para>
+       New oids cannot be assigned, though some UUID generators may still
+       work as long as they do not rely on writing new status to the database.
+   </para>
+
+   <para>
+       Currently, creating temp tables is not allowed during read only
+       transactions, so in some cases existing scripts will not run correctly.
+       It is possible we may relax that restriction in a later release. This is
+       both a SQL Standard compliance issue and a technical issue, so will not
+       be resolved in this release.
+   </para>
+
+   <para>
+       DROP TABLESPACE can only succeed if the tablespace is empty. Some standby
+       users may be actively using the tablespace via their temp_tablespaces
+       parameter. If there are temp files in the tablespace we currently
+       cancel all active queries to ensure that temp files are removed, so 
+       that we can remove the tablespace and continue with WAL replay.
+   </para>
+
+   <para>
+       Running DROP DATABASE, ALTER DATABASE SET TABLESPACE, or ALTER DATABASE
+       RENAME on primary will cause all users connected to that database on the
+       standby to be forcibly disconnected, once max_standby_delay has been
+       reached.
+   </para>
+
+   <para>
+       In normal running, if you issue DROP USER or DROP ROLE for a role with login
+       capability while that user is still connected then nothing happens to the
+       connected user - they remain connected. The user cannot reconnect however.
+       This behaviour applies in recovery also, so a DROP USER on the primary does
+       not disconnect that user on the standby.
+   </para>
+
+   <para>
+       Stats collector is active during recovery. All scans, reads, blocks,
+       index usage etc will all be recorded normally on the standby. Replayed
+       actions will not duplicate their effects on primary, so replaying an
+       insert will not increment the Inserts column of pg_stat_user_tables.
+       The stats file is deleted at start of recovery, so stats from primary
+       and standby will differ; this is considered a feature not a bug.
+   </para>
+
+   <para>
+       Autovacuum is not active during recovery, though will start normally
+       at the end of recovery.
+   </para>
+
+   <para>
+       Background writer is active during recovery and will perform
+       restartpoints (similar to checkpoints on primary) and normal block
+       cleaning activities. The CHECKPOINT command is accepted during recovery,
+       though performs a restartpoint rather than a new checkpoint.
+   </para>
+  </sect2>
+
+  <sect2 id="hot-standby-parameters">
+   <title>Hot Standby Parameter Reference</title>
+
+   <para>
+       The following additional parameters are supported/provided within the
+       <filename>recovery.conf</>.
+
+     <variablelist>
+
+     <varlistentry id="recovery-connections" xreflabel="recovery_connections">
+      <term><varname>recovery_connections</varname> (<type>boolean</type>)</term>
+      <listitem>
+       <para>
+               Specifies whether you would like to connect during recovery, or not.
+               The default is on, though you may wish to disable it to avoid
+               software problems, should they occur. Parameter can only be changed
+               by stopping and restarting the server.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry id="recovery-starts-paused" xreflabel="recovery_starts_paused">
+      <term><varname>recovery_starts_paused</varname> (<type>boolean</type>)</term>
+      <listitem>
+       <para>
+               Allows the Administrator to start recovery in paused mode. The default
+               is to start recovery so that it will continue processing all available
+               records.
+       </para>
+      </listitem>
+     </varlistentry>
+
+
+
+     <varlistentry id="max-standby-delay" xreflabel="max_standby_delay">
+      <term><varname>max_standby_delay</varname> (<type>string</type>)</term>
+      <listitem>
+       <para>
+               This parameter allows the Administrator to set a wait policy for
+               queries that conflict with incoming data changes. Valid settings
+               are -1, meaning wait forever, or a wait time of 0 or more seconds.
+               If a conflict should occur the server will delay up to this
+               amount before it begins trying to resolve things less amicably,
+               described in <xref linkend="hot-standby-conflict">. The 
+               <varname>max_standby_delay</varname> may be set at server start
+               or it may be dynamically adjusted using <function>pg_recovery_max_standby_delay</>
+               described in <xref linkend="functions-recovery-control-table">.
+       </para>
+      </listitem>
+     </varlistentry>
+
+   </variablelist>
+   </para>
+  </sect2>
+
+  <sect2 id="hot-standby-caveats">
+   <title>Caveats</title>
+
+   <para>
+    At this writing, there are several limitations of Hot Standby. 
+    These can and probably will be fixed in future releases:
+
+  <itemizedlist>
+   <listitem>
+    <para>
+     Operations on hash indexes are not presently WAL-logged, so
+     replay will not update these indexes.  Hash indexes will not be
+        available for use when running queries during recovery.
+    </para>
+   </listitem>
+  </itemizedlist>
+
+   </para>
+  </sect2>
+
+ </sect1>
+
   <sect1 id="migration">
    <title>Migration Between Releases</title>
  
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml

index b834ae411cb77f53ff6484aa23ef751bb387421b..576b51469a0460cc84e04914373d4cf68163dc88 100644 (file)
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -370,6 +370,12 @@ SET ENABLE_SEQSCAN TO OFF;
          allows. See <xref linkend="sysvipc"> for information on how to
          adjust those parameters, if necessary.
         </para>
+
+       <para>
+        When running a standby server, it is strongly recommended that you
+        set this parameter to be the same or higher than the master server.
+        Otherwise, queries on the standby server may fail.
+       </para>
        </listitem>
       </varlistentry>
  
@@ -5519,6 +5525,32 @@ plruby.use_strict = true        # generates error: unknown class name
        </listitem>
       </varlistentry>
  
+     <varlistentry id="guc-trace-recovery-messages" xreflabel="trace_recovery_messages">
+      <term><varname>trace_recovery_messages</varname> (<type>string</type>)</term>
+      <indexterm>
+       <primary><varname>trace_recovery_messages</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Controls which message levels are written to the server log
+        for system modules needed for recovery processing. This allows
+        the user to override the normal setting of log_min_messages,
+        but only for specific messages. This is intended for use in
+        debugging Hot Standby.
+        Valid values are <literal>DEBUG5</>, <literal>DEBUG4</>,
+        <literal>DEBUG3</>, <literal>DEBUG2</>, <literal>DEBUG1</>,
+        <literal>INFO</>, <literal>NOTICE</>, <literal>WARNING</>,
+        <literal>ERROR</>, <literal>LOG</>, <literal>FATAL</>, and
+        <literal>PANIC</>.  Each level includes all the levels that
+        follow it.  The later the level, the fewer messages are sent
+        to the log.  The default is <literal>WARNING</>.  Note that
+        <literal>LOG</> has a different rank here than in
+        <varname>client_min_messages</>.
+        Parameter should be set in the postgresql.conf only.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-zero-damaged-pages" xreflabel="zero_damaged_pages">
        <term><varname>zero_damaged_pages</varname> (<type>boolean</type>)</term>
        <indexterm>
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml

index 62134e0eb4b2a00f3f94bf8f987dc0ad4ff602ab..6c22e8da684f42fba903f401233e36d37c434ebe 100644 (file)
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -13093,6 +13093,254 @@ postgres=# SELECT * FROM pg_xlogfile_name_offset(pg_stop_backup());
      <xref linkend="continuous-archiving">.
     </para>
  
+   <indexterm>
+    <primary>pg_is_in_recovery</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_last_recovered_xid</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_last_recovered_xact_timestamp</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_last_recovered_xlog_location</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_recovery_pause</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_recovery_continue</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_recovery_pause_xid</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_recovery_pause_timestamp</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_recovery_pause_location</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_recovery_stop</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_recovery_max_standby_delay</primary>
+   </indexterm>
+
+   <para>
+    The functions shown in <xref
+    linkend="functions-recovery-info-table"> provide information 
+       about the current status of Hot Standby.
+    These functions may be executed both during recovery and in normal running.
+   </para>
+
+   <table id="functions-recovery-info-table">
+    <title>Recovery Information Functions</title>
+    <tgroup cols="3">
+     <thead>
+      <row><entry>Name</entry> <entry>Return Type</entry> <entry>Description</entry>
+      </row>
+     </thead>
+
+     <tbody>
+      <row>
+       <entry>
+        <literal><function>pg_is_in_recovery</function>()</literal>
+        </entry>
+       <entry><type>bool</type></entry>
+       <entry>True if recovery is still in progress. If you wish to 
+               know more detailed status information use <function>pg_current_recovery_target</>.
+          </entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_last_recovered_xid</function>()</literal>
+        </entry>
+       <entry><type>integer</type></entry>
+       <entry>Returns the transaction id (32-bit) of the last completed transaction
+        in the current recovery. Later numbered transaction ids may already have
+        completed, so the value could in some cases be lower than the last time
+        this function executed. If recovery has completed then the return value will
+        remain static at the value of the last transaction applied during that
+        recovery. When the server has been started normally without a recovery
+               then the return value will be InvalidTransactionId (zero).
+       </entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_last_recovered_xact_timestamp</function>()</literal>
+        </entry>
+       <entry><type>timestamp with time zone</type></entry>
+       <entry>Returns the original completion timestamp with timezone of the
+        last recovered transaction. If recovery is still in progress this
+        will increase monotonically, while if recovery has completed then this
+        value will remain static at the value of the last transaction applied
+        during that recovery. When the server has been started normally without
+               a recovery then the return value will be a default value.
+       </entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_last_recovered_xlog_location</function>()</literal>
+        </entry>
+       <entry><type>text</type></entry>
+       <entry>Returns the transaction log location of the last recovered
+               transaction in the current recovery. This value is updated only
+               when transaction completion records (commit or abort) arrive, so
+               WAL records beyond this value may also have been recovered.
+               If recovery is still in progress this will increase monotonically. 
+               If recovery has completed then this value will remain static at the
+               value of the last WAL record applied during that recovery. When the
+               server has been started normally without a recovery then the return
+               value will be InvalidXLogRecPtr (0/0).
+       </entry>
+      </row>
+     </tbody>
+    </tgroup>
+   </table>
+
+   <para>
+    The functions shown in <xref
+    linkend="functions-recovery-control-table"> can be used to control archive recovery
+       when executed in Hot Standby mode.
+    These functions can only be executed during recovery. Their use is
+       restricted to superusers only.
+   </para>
+
+   <table id="functions-recovery-control-table">
+    <title>Recovery Control Functions</title>
+    <tgroup cols="3">
+     <thead>
+      <row><entry>Name</entry> <entry>Return Type</entry> <entry>Description</entry>
+      </row>
+     </thead>
+
+     <tbody>
+      <row>
+       <entry>
+        <literal><function>pg_recovery_pause</function>()</literal>
+        </entry>
+       <entry><type>void</type></entry>
+       <entry>Pause recovery processing, unconditionally.</entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_recovery_continue</function>()</literal>
+        </entry>
+       <entry><type>void</type></entry>
+       <entry>If recovery is paused, continue processing.</entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_recovery_stop</function>()</literal>
+        </entry>
+       <entry><type>void</type></entry>
+       <entry>End recovery and begin normal processing.</entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_recovery_pause_xid</function>(xid integer)</literal>
+        </entry>
+       <entry><type>void</type></entry>
+       <entry>Continue recovery until specified xid completes, if it is ever
+        seen, then pause recovery.
+       </entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_recovery_pause_timestamp</function>(endtime timestamp)</literal>
+        </entry>
+       <entry><type>void</type></entry>
+       <entry>Continue recovery until a transaction with specified timestamp
+        completes, if one is ever seen, then pause recovery.
+       </entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_recovery_pause_location</function>(location text)</literal>
+        </entry>
+       <entry><type>void</type></entry>
+       <entry>Continue recovery until a transaction with an LSN higher than
+               the specified WAL location completes, if one is ever seen, 
+               then pause recovery. The location is specified as a string of the
+               same form output by <function>pg_current_xlog_location()</function>,
+               e.g. pg_recovery_pause_location('0/D4445B8')
+       </entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_recovery_advance</function>(num_records integer)</literal>
+        </entry>
+       <entry><type>void</type></entry>
+       <entry>Advance recovery specified number of records then pause.</entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_current_recovery_target</function>()</literal>
+        </entry>
+       <entry><type>text</type></entry>
+       <entry>Returns details of the server's current recovery target, if any.
+               If recovery is paused then the return value is 'Recovery paused'.
+          </entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_recovery_max_standby_delay</function>(delay integer)</literal>
+        </entry>
+       <entry><type>void</type></entry>
+       <entry>Set the max_standby_delay for recovery conflict processing (in seconds).</entry>
+      </row>
+     </tbody>
+    </tgroup>
+   </table>
+
+   <para>
+    <function>pg_recovery_pause</> and <function>pg_recovery_continue</> allow
+    a superuser to control the progress of recovery on the database server.
+       Once recovery is paused it will stay paused until you release it, even if 
+       the server falls further behind than max_standby_delay.
+       Recovery can be paused, continued, paused, continued, etc. as many times
+       as required. If the superuser wishes recovery to complete and normal
+       processing mode to start, execute <function>pg_recovery_stop</>.
+   </para>
+
+   <para>
+       The paused state provides a stable, unchanging database that can be 
+       queried to determine how far forwards recovery has progressed.  Recovery 
+       can never go backwards because previous data may have been overwritten, 
+       so some care must be taken to recover to a specific point. 
+       <function>pg_recovery_pause_xid</> and 
+       <function>pg_recovery_pause_timestamp</> allow the specification of a trial
+    recovery target, similarly to <xref linkend="recovery-config-settings">.
+    Recovery will then progress to the specified point and then pause. This
+       allows the superuser to assess whether this is a desirable stopping point for
+       recovery, or a good place to copy data that is known to be deleted 
+       later in the recovery. <function>pg_recovery_pause_location</>
+       can also be used to pause recovery after a transaction completion record
+       arrives that has a higher LSN.
+   </para>
+
+   <para>
+    <function>pg_recovery_advance</> allows recovery to progress record by
+    record, for very careful analysis or debugging. Step size can be 1 or
+    more records. If recovery is not yet paused then
+    <function>pg_recovery_advance</> will process the specified number of
+    records then pause. If recovery is already paused, recovery will continue
+    for another N records before pausing again.
+   </para>
+
+   <para>
+    If you pause recovery while the server is waiting for a WAL file when
+    operating in standby mode, it will apparently have no effect until the
+    file arrives. Once the server begins processing WAL records again it
+    will notice the pause request and will act upon it. This is not a bug.
+   </para>
+
+   <para>
+    You can see if recovery is paused by checking the process title, or
+       by using <function>pg_current_recovery_target</>.
+   </para>
+
     <para>
      The functions shown in <xref linkend="functions-admin-dbsize"> calculate
      the disk space usage of database objects.
diff --git a/doc/src/sgml/ref/checkpoint.sgml b/doc/src/sgml/ref/checkpoint.sgml

index d2992e44a0d0b6a02a157fbb3638a2089083d5eb..ee4a09fd30d773a5acd5a7a85f44e643a53e57cd 100644 (file)
--- a/doc/src/sgml/ref/checkpoint.sgml
+++ b/doc/src/sgml/ref/checkpoint.sgml
@@ -42,6 +42,11 @@ CHECKPOINT
     <xref linkend="wal"> for more information about the WAL system.
    </para>
  
+  <para>
+   If executed during recovery, the <command>CHECKPOINT</command> command
+   will force a restartpoint rather than writing a new checkpoint.
+  </para>
+
    <para>
     Only superusers can call <command>CHECKPOINT</command>.  The command is
     not intended for use during normal operation.
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c

index 1ebd15ea368a8276bb54add31751f2c25c6acafc..94b72024b7f8a98790495bc303d584259a040c54 100644 (file)
--- a/src/backend/access/gin/ginxlog.c
+++ b/src/backend/access/gin/ginxlog.c
@@ -621,6 +621,12 @@ gin_redo(XLogRecPtr lsn, XLogRecord *record)
  {
         uint8           info = record->xl_info & ~XLR_INFO_MASK;
  
+       /*
+        * GIN indexes do not require any conflict processing. XXX really?
+        */
+       if (InHotStandby)
+               RecordKnownAssignedTransactionIds(record->xl_xid);
+
         RestoreBkpBlocks(lsn, record, false);
  
         topCtx = MemoryContextSwitchTo(opCtx);
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c

index 4a20d905d4e4bb70c404444ec254bac8eeea5128..3e5f3b65e718b650c0617ccd7c26b25592d954cf 100644 (file)
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -396,6 +396,12 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record)
         uint8           info = record->xl_info & ~XLR_INFO_MASK;
         MemoryContext oldCxt;
  
+       /*
+        * GIST indexes do not require any conflict processing. XXX really?
+        */
+       if (InHotStandby)
+               RecordKnownAssignedTransactionIds(record->xl_xid);
+
         RestoreBkpBlocks(lsn, record, false);
  
         oldCxt = MemoryContextSwitchTo(opCtx);
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c

index 335f35e5b7ee732950cd7292f37d62e260e7e09e..bfb46dcc5decd7592037ce386d09e6ac3dec7ecc 100644 (file)
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -338,6 +338,17 @@ hashbeginscan(PG_FUNCTION_ARGS)
         IndexScanDesc scan;
         HashScanOpaque so;
  
+       /* 
+        * Hash indexes are not recoverable, so cannot ever be used
+        * during recovery mode. We try to avoid this by tweaking the 
+        * cost of hash index scans during recovery (see selfuncs.c),
+        * but we may still get called, so specifically prevent scans here.
+        * XXX We expect at some point to be able to exclude index scans on
+        * non-recoverable index types at the index AM level.
+        */
+       if (RecoveryInProgress())
+               elog(ERROR, "Cannot use hash indexes during recovery");
+
         scan = RelationGetIndexScan(rel, keysz, scankey);
         so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
         so->hashso_bucket_valid = false;
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c

index b0a911e25975b63f3336fe52962c9405b2b51382..69a37b6e78758b92ae5a1a5f2e0ebde73560673d 100644 (file)
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -3769,6 +3769,61 @@ heap_restrpos(HeapScanDesc scan)
         }
  }
  
+/*
+ * Advance the current VACUUM's *latestRemovedXid to cover this tuple's xmin,
+ * xmax and (if HEAP_MOVED_OFF/IN is set) xvac. Called only rarely, since we
+ * probably already removed rows earlier; see vacuum_log_cleanup_info().
+ */
+void
+HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
+                                                                               TransactionId *latestRemovedXid)
+{
+       TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
+       TransactionId xmax = HeapTupleHeaderGetXmax(tuple);
+       TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
+
+       if (tuple->t_infomask & HEAP_MOVED_OFF ||
+               tuple->t_infomask & HEAP_MOVED_IN)      /* xvac is only considered for moved tuples */
+       {
+               if (TransactionIdPrecedes(*latestRemovedXid, xvac))
+                       *latestRemovedXid = xvac;
+       }
+
+       if (TransactionIdPrecedes(*latestRemovedXid, xmax))
+               *latestRemovedXid = xmax;
+
+       if (TransactionIdPrecedes(*latestRemovedXid, xmin))
+               *latestRemovedXid = xmin;
+
+       Assert(TransactionIdIsValid(*latestRemovedXid));
+}
+
+/*
+ * XLogInsert a heap cleanup info record (XLOG_HEAP2_CLEANUP_INFO) carrying
+ * rnode and latestRemovedXid; returns the XLogRecPtr from XLogInsert. Sent
+ * once per VACUUM; required because of the phasing of removal operations
+ * during a lazy VACUUM. See comments for vacuum_log_cleanup_info().
+ */
+XLogRecPtr
+log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid)
+{
+       xl_heap_cleanup_info xlrec;
+       XLogRecPtr      recptr;
+       XLogRecData rdata;
+
+       xlrec.node = rnode;
+       xlrec.latestRemovedXid = latestRemovedXid;
+
+       rdata.data = (char *) &xlrec;
+       rdata.len = SizeOfHeapCleanupInfo;
+       rdata.buffer = InvalidBuffer;
+       rdata.next = NULL;
+
+       recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO, &rdata);
+
+       return recptr;
+}
+
  /*
   * Perform XLogInsert for a heap-clean operation.  Caller must already
   * have modified the buffer and marked it dirty.
@@ -3776,13 +3831,17 @@ heap_restrpos(HeapScanDesc scan)
   * Note: prior to Postgres 8.3, the entries in the nowunused[] array were
   * zero-based tuple indexes.  Now they are one-based like other uses
   * of OffsetNumber.
+ *
+ * For 8.5 we also include the latestRemovedXid which allows recovery
+ * processing to cancel long standby queries that would have their
+ * results changed if we applied these changes.
   */
  XLogRecPtr
  log_heap_clean(Relation reln, Buffer buffer,
                            OffsetNumber *redirected, int nredirected,
                            OffsetNumber *nowdead, int ndead,
                            OffsetNumber *nowunused, int nunused,
-                          bool redirect_move)
+                          TransactionId latestRemovedXid, bool redirect_move)
  {
         xl_heap_clean xlrec;
         uint8           info;
@@ -3794,6 +3853,7 @@ log_heap_clean(Relation reln, Buffer buffer,
  
         xlrec.node = reln->rd_node;
         xlrec.block = BufferGetBlockNumber(buffer);
+       xlrec.latestRemovedXid = latestRemovedXid;
         xlrec.nredirected = nredirected;
         xlrec.ndead = ndead;
  
@@ -4067,6 +4127,35 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
         return recptr;
  }
  
+/*
+ * Redo for CLEANUP_INFO: resolves standby query conflicts, changes no pages.
+ */
+static void
+heap_xlog_cleanup_info(XLogRecPtr lsn, XLogRecord *record)
+{
+       xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record);
+
+       if (InHotStandby)
+       {
+               VirtualTransactionId *backends;
+
+               RecordKnownAssignedTransactionIds(record->xl_xid);
+               backends = GetConflictingVirtualXIDs(xlrec->latestRemovedXid,
+                                                                                                       InvalidOid,
+                                                                                                       true);
+               ResolveRecoveryConflictWithVirtualXIDs(backends,
+                                                                                               "VACUUM index cleanup",
+                                                                                               CONFLICT_MODE_ERROR_DEFERRABLE,
+                                                                                               lsn);
+       }
+
+       /*
+        * Actual operation is a no-op. Record type exists to provide a means
+        * for conflict processing to occur before we begin index vacuum actions.
+        * see vacuumlazy.c
+        */
+}
+
  /*
   * Handles CLEAN and CLEAN_MOVE record types
   */
@@ -4085,12 +4174,29 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record, bool clean_move)
         int                     nunused;
         Size            freespace;
  
+       if (InHotStandby)
+       {
+               VirtualTransactionId *backends;
+
+               RecordKnownAssignedTransactionIds(record->xl_xid);
+               backends = GetConflictingVirtualXIDs(xlrec->latestRemovedXid,
+                                                                                                       InvalidOid,
+                                                                                                       true);
+               ResolveRecoveryConflictWithVirtualXIDs(backends,
+                                                                                               "VACUUM heap cleanup",
+                                                                                               CONFLICT_MODE_ERROR_DEFERRABLE,
+                                                                                               lsn);
+       }
+
+       RestoreBkpBlocks(lsn, record, true);
+
         if (record->xl_info & XLR_BKP_BLOCK_1)
                 return;
  
-       buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+       buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
         if (!BufferIsValid(buffer))
                 return;
+       LockBufferForCleanup(buffer);
         page = (Page) BufferGetPage(buffer);
  
         if (XLByteLE(lsn, PageGetLSN(page)))
@@ -4145,12 +4251,21 @@ heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
         Buffer          buffer;
         Page            page;
  
+       /*
+        * Freezing tuples does not require conflict processing
+        */
+       if (InHotStandby)
+               RecordKnownAssignedTransactionIds(record->xl_xid);
+
+       RestoreBkpBlocks(lsn, record, false);
+
         if (record->xl_info & XLR_BKP_BLOCK_1)
                 return;
  
-       buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+       buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
         if (!BufferIsValid(buffer))
                 return;
+       LockBufferForCleanup(buffer);
         page = (Page) BufferGetPage(buffer);
  
         if (XLByteLE(lsn, PageGetLSN(page)))
@@ -4740,6 +4855,13 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record)
  {
         uint8           info = record->xl_info & ~XLR_INFO_MASK;
  
+       /*
+        * Heap operations don't overwrite MVCC data so no conflict
+        * processing is required.
+        */
+       if (InHotStandby)
+               RecordKnownAssignedTransactionIds(record->xl_xid);
+
         RestoreBkpBlocks(lsn, record, false);
  
         switch (info & XLOG_HEAP_OPMASK)
@@ -4781,17 +4903,17 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
         switch (info & XLOG_HEAP_OPMASK)
         {
                 case XLOG_HEAP2_FREEZE:
-                       RestoreBkpBlocks(lsn, record, false);
                         heap_xlog_freeze(lsn, record);
                         break;
                 case XLOG_HEAP2_CLEAN:
-                       RestoreBkpBlocks(lsn, record, true);
                         heap_xlog_clean(lsn, record, false);
                         break;
                 case XLOG_HEAP2_CLEAN_MOVE:
-                       RestoreBkpBlocks(lsn, record, true);
                         heap_xlog_clean(lsn, record, true);
                         break;
+               case XLOG_HEAP2_CLEANUP_INFO:
+                       heap_xlog_cleanup_info(lsn, record);
+                       break;
                 default:
                         elog(PANIC, "heap2_redo: unknown op code %u", info);
         }
@@ -4921,17 +5043,26 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
         {
                 xl_heap_clean *xlrec = (xl_heap_clean *) rec;
  
-               appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u",
+               appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u remxid %u",
                                                  xlrec->node.spcNode, xlrec->node.dbNode,
-                                                xlrec->node.relNode, xlrec->block);
+                                                xlrec->node.relNode, xlrec->block,
+                                                xlrec->latestRemovedXid);
         }
         else if (info == XLOG_HEAP2_CLEAN_MOVE)
         {
                 xl_heap_clean *xlrec = (xl_heap_clean *) rec;
  
-               appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u",
+               appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u remxid %u",
                                                  xlrec->node.spcNode, xlrec->node.dbNode,
-                                                xlrec->node.relNode, xlrec->block);
+                                                xlrec->node.relNode, xlrec->block,
+                                                xlrec->latestRemovedXid);
+       }
+       else if (info == XLOG_HEAP2_CLEANUP_INFO)
+       {
+               xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec;
+
+               appendStringInfo(buf, "cleanup info: remxid %u",
+                                                xlrec->latestRemovedXid);
         }
         else
                 appendStringInfo(buf, "UNKNOWN");
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c

index 0d5974f47e1117e4050f99f4e9416ca049ee10b9..4793929990cf63dc7ec800de3af86c4b00696c63 100644 (file)
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -30,7 +30,8 @@
  typedef struct
  {
         TransactionId new_prune_xid;    /* new prune hint value for page */
-       int                     nredirected;    /* numbers of entries in arrays below */
+       TransactionId latestRemovedXid; /* latest xid to be removed by this prune */
+       int                     nredirected;            /* numbers of entries in arrays below */
         int                     ndead;
         int                     nunused;
         /* arrays that accumulate indexes of items to be changed */
@@ -84,6 +85,14 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
         if (!PageIsPrunable(page, OldestXmin))
                 return;
  
+       /*
+        * We can't write WAL in recovery mode, so there's no point trying to
+        * clean the page. The master will likely issue a cleaning WAL record
+        * soon anyway, so this is no particular loss.
+        */
+       if (RecoveryInProgress())
+               return;
+
         /*
          * We prune when a previous UPDATE failed to find enough space on the page
          * for a new tuple version, or when free space falls below the relation's
@@ -176,6 +185,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
          * of our working state.
          */
         prstate.new_prune_xid = InvalidTransactionId;
+       prstate.latestRemovedXid = InvalidTransactionId;
         prstate.nredirected = prstate.ndead = prstate.nunused = 0;
         memset(prstate.marked, 0, sizeof(prstate.marked));
  
@@ -257,7 +267,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
                                                                         prstate.redirected, prstate.nredirected,
                                                                         prstate.nowdead, prstate.ndead,
                                                                         prstate.nowunused, prstate.nunused,
-                                                                       redirect_move);
+                                                                       prstate.latestRemovedXid, redirect_move);
  
                         PageSetLSN(BufferGetPage(buffer), recptr);
                         PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
@@ -395,6 +405,8 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
                                 == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
                         {
                                 heap_prune_record_unused(prstate, rootoffnum);
+                               HeapTupleHeaderAdvanceLatestRemovedXid(htup,
+                                                                                                          &prstate->latestRemovedXid);
                                 ndeleted++;
                         }
  
@@ -520,7 +532,11 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
                  * find another DEAD tuple is a fairly unusual corner case.)
                  */
                 if (tupdead)
+               {
                         latestdead = offnum;
+                       HeapTupleHeaderAdvanceLatestRemovedXid(htup,
+                                                                                                  &prstate->latestRemovedXid);
+               }
                 else if (!recent_dead)
                         break;
  
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c

index 1c1cd34d2d68ee5fa4fea96677ba1879a1d37027..ec26f9bb5fb7197ea70f0879165b5ea86c48d2ff 100644 (file)
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -91,8 +91,19 @@ RelationGetIndexScan(Relation indexRelation,
         else
                 scan->keyData = NULL;
  
+       /*
+        * During recovery we ignore killed tuples and don't bother to kill them
+        * either. We do this because the xmin on the primary node could easily
+        * be later than the xmin on the standby node, so that what the primary
+        * thinks is killed is supposed to be visible on standby. So for correct
+        * MVCC for queries during recovery we must ignore these hints and check
+        * all tuples. Do *not* set ignore_killed_tuples to true when running
+        * in a transaction that was started during recovery. AMs can set it to
+        * false at any time. xactStartedInRecovery should not be touched by AMs.
+        */
         scan->kill_prior_tuple = false;
-       scan->ignore_killed_tuples = true;      /* default setting */
+       scan->xactStartedInRecovery = TransactionStartedDuringRecovery();
+       scan->ignore_killed_tuples = !scan->xactStartedInRecovery;
  
         scan->opaque = NULL;
  
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c

index c86cd52df84d00666237fb76212ef4b67922515d..69c9473cc603df4210b5caf3e0374560847b43e1 100644 (file)
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -455,9 +455,12 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
  
                         /*
                          * If we scanned a whole HOT chain and found only dead tuples,
-                        * tell index AM to kill its entry for that TID.
+                        * tell index AM to kill its entry for that TID. We do not do
+                        * this when in recovery because it may violate MVCC to do so.
+                        * see comments in RelationGetIndexScan().
                          */
-                       scan->kill_prior_tuple = scan->xs_hot_dead;
+                       if (!scan->xactStartedInRecovery)
+                               scan->kill_prior_tuple = scan->xs_hot_dead;
  
                         /*
                          * The AM's gettuple proc finds the next index entry matching the
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README

index 81d56b3a6b804b9cc978a6d6ebcdfcf79e70d5df..c58d2fcaf35fe86059d976aca4ef3f10acd1752f 100644 (file)
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -401,6 +401,27 @@ of the WAL entry.)  If the parent page becomes half-dead but is not
  immediately deleted due to a subsequent crash, there is no loss of
  consistency, and the empty page will be picked up by the next VACUUM.
  
+Scans during Recovery
+---------------------
+
+The btree index type can be safely used during recovery. During recovery
+we have at most one writer and potentially many readers. In that
+situation the locking requirements can be relaxed and we do not need
+double locking during block splits. Each WAL record makes changes to a
+single level of the btree using the correct locking sequence and so
+is safe for concurrent readers. Some readers may observe a block split
+in progress as they descend the tree, but they will simply move right
+onto the correct page.
+
+During recovery all index scans start with ignore_killed_tuples = false
+and we never set kill_prior_tuple. We do this because the oldest xmin
+on the standby server can be older than the oldest xmin on the master
+server, which means tuples can be marked as killed even when they are
+still visible on the standby. We don't WAL log tuple killed bits, but
+they can still appear in the standby because of full page writes. So
+we must always ignore them and that means it's not worth setting them
+either.
+
  Other Things That Are Handy to Know
  -----------------------------------
  
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c

index caa8928aa017a7e80d6bef5a078709809a2ef123..76384326ce9274dcbfffdb7d07966f8b9bdd7e11 100644 (file)
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -2024,7 +2024,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer)
         }
  
         if (ndeletable > 0)
-               _bt_delitems(rel, buffer, deletable, ndeletable);
+               _bt_delitems(rel, buffer, deletable, ndeletable, false, 0);
  
         /*
          * Note: if we didn't find any LP_DEAD items, then the page's
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c

index 901b2b50d4d27aeb59c5a9300399df4b76f7d4ec..eefa888f6c1f5f4bcb5875402c040a3e31f07f97 100644 (file)
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -653,14 +653,24 @@ _bt_page_recyclable(Page page)
   *
   * This routine assumes that the caller has pinned and locked the buffer.
   * Also, the given itemnos *must* appear in increasing order in the array.
+ *
+ * We record VACUUMs and b-tree deletes differently in WAL. InHotStandby
+ * we need to be able to pin all of the blocks in the btree in physical
+ * order when replaying the effects of a VACUUM, just as we do for the
+ * original VACUUM itself. lastBlockVacuumed allows us to tell whether an
+ * intermediate range of blocks has had no changes at all by VACUUM,
+ * and so must be scanned anyway during replay.
   */
  void
  _bt_delitems(Relation rel, Buffer buf,
-                        OffsetNumber *itemnos, int nitems)
+                        OffsetNumber *itemnos, int nitems, bool isVacuum,
+                        BlockNumber lastBlockVacuumed)
  {
         Page            page = BufferGetPage(buf);
         BTPageOpaque opaque;
  
+       Assert(isVacuum || lastBlockVacuumed == 0);
+
         /* No ereport(ERROR) until changes are logged */
         START_CRIT_SECTION();
  
@@ -688,15 +698,35 @@ _bt_delitems(Relation rel, Buffer buf,
         /* XLOG stuff */
         if (!rel->rd_istemp)
         {
-               xl_btree_delete xlrec;
                 XLogRecPtr      recptr;
                 XLogRecData rdata[2];
  
-               xlrec.node = rel->rd_node;
-               xlrec.block = BufferGetBlockNumber(buf);
+               if (isVacuum)
+               {
+                       xl_btree_vacuum xlrec_vacuum;
+                       xlrec_vacuum.node = rel->rd_node;
+                       xlrec_vacuum.block = BufferGetBlockNumber(buf);
+
+                       xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
+                       rdata[0].data = (char *) &xlrec_vacuum;
+                       rdata[0].len = SizeOfBtreeVacuum;
+               }
+               else
+               {
+                       xl_btree_delete xlrec_delete;
+                       xlrec_delete.node = rel->rd_node;
+                       xlrec_delete.block = BufferGetBlockNumber(buf);
+
+                       /*
+                        * We would like to set an accurate latestRemovedXid, but there is no
+                        * easy way of obtaining a useful value. So we pass InvalidTransactionId,
+                        * which btree_redo() currently treats as conflicting with everybody.
+                        */
+                       xlrec_delete.latestRemovedXid = InvalidTransactionId;
+                       rdata[0].data = (char *) &xlrec_delete;
+                       rdata[0].len = SizeOfBtreeDelete;
+               }
  
-               rdata[0].data = (char *) &xlrec;
-               rdata[0].len = SizeOfBtreeDelete;
                 rdata[0].buffer = InvalidBuffer;
                 rdata[0].next = &(rdata[1]);
  
@@ -719,7 +749,10 @@ _bt_delitems(Relation rel, Buffer buf,
                 rdata[1].buffer_std = true;
                 rdata[1].next = NULL;
  
-               recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
+               if (isVacuum)
+                       recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata);
+               else
+                       recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
  
                 PageSetLSN(page, recptr);
                 PageSetTLI(page, ThisTimeLineID);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c

index 79d4c660d669a23d3b7a399815d47eb1c9cf82da..525d8382e36689103d97be9d260eedc20e6ec1b6 100644 (file)
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -57,7 +57,8 @@ typedef struct
         IndexBulkDeleteCallback callback;
         void       *callback_state;
         BTCycleId       cycleid;
-       BlockNumber lastUsedPage;
+       BlockNumber lastBlockVacuumed;  /* last blkno reached by Vacuum scan */
+       BlockNumber lastUsedPage;               /* blkno of last non-recyclable page */
         BlockNumber totFreePages;       /* true total # of free pages */
         MemoryContext pagedelcontext;
  } BTVacState;
@@ -629,6 +630,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
         vstate.callback = callback;
         vstate.callback_state = callback_state;
         vstate.cycleid = cycleid;
+       vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */
         vstate.lastUsedPage = BTREE_METAPAGE;
         vstate.totFreePages = 0;
  
@@ -858,7 +860,19 @@ restart:
                  */
                 if (ndeletable > 0)
                 {
-                       _bt_delitems(rel, buf, deletable, ndeletable);
+                       BlockNumber     lastBlockVacuumed = BufferGetBlockNumber(buf);
+
+                       _bt_delitems(rel, buf, deletable, ndeletable, true, vstate->lastBlockVacuumed);
+
+                       /*
+                        * Keep track of the block number of the lastBlockVacuumed, so
+                        * we can scan those blocks as well during WAL replay. This then
+                        * provides concurrency protection and allows btrees to be used
+                        * while in recovery.
+                        */
+                       if (lastBlockVacuumed > vstate->lastBlockVacuumed)
+                               vstate->lastBlockVacuumed = lastBlockVacuumed;
+
                         stats->tuples_removed += ndeletable;
                         /* must recompute maxoff */
                         maxoff = PageGetMaxOffsetNumber(page);
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c

index 895d6411323a268ee76bc0723cb201e59c948187..864c41c6be3176cdc9a24fc52dbade9445ab8ef2 100644 (file)
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -16,10 +16,14 @@
  
  #include "access/nbtree.h"
  #include "access/transam.h"
+#include "access/xact.h"
  #include "storage/bufmgr.h"
+#include "storage/procarray.h"
+#include "utils/inval.h"
+#include "miscadmin.h"
  
  /*
- * We must keep track of expected insertions due to page splits, and apply
+ * We must keep track of expected insertions due to page splits, and apply
   * them manually if they are not seen in the WAL log during replay.  This
   * makes it safe for page insertion to be a multiple-WAL-action process.
   *
@@ -458,6 +462,86 @@ btree_xlog_split(bool onleft, bool isroot,
                                                  xlrec->leftsib, xlrec->rightsib, isroot);
  }
  
+static void
+btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
+{
+       xl_btree_vacuum *xlrec;
+       Buffer          buffer;
+       Page            page;
+       BTPageOpaque opaque;
+
+       if (record->xl_info & XLR_BKP_BLOCK_1)
+               return;
+
+       xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
+
+       /*
+        * We need to ensure every block is unpinned between the
+        * lastBlockVacuumed and the current block, if there are any.
+        * This ensures that every block in the index is touched during
+        * VACUUM as required to ensure scans work correctly.
+        */
+       if ((xlrec->lastBlockVacuumed + 1) != xlrec->block)
+       {
+               BlockNumber blkno = xlrec->lastBlockVacuumed + 1;
+
+               for (; blkno < xlrec->block; blkno++)
+               {
+                       /*
+                        * XXX we don't actually need to read the block, we
+                        * just need to confirm it is unpinned. If we had a special call
+                        * into the buffer manager we could optimise this so that
+                        * if the block is not in shared_buffers we confirm it as unpinned.
+                        */
+                       buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno, RBM_NORMAL);
+                       if (BufferIsValid(buffer))
+                       {
+                               LockBufferForCleanup(buffer);
+                               UnlockReleaseBuffer(buffer);
+                       }
+               }
+       }
+
+       /*
+        * We need to take a cleanup lock to apply these changes.
+        * See nbtree/README for details.
+        */
+       buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
+       if (!BufferIsValid(buffer))
+               return;
+       LockBufferForCleanup(buffer);
+       page = (Page) BufferGetPage(buffer);
+
+       if (XLByteLE(lsn, PageGetLSN(page)))
+       {
+               UnlockReleaseBuffer(buffer);
+               return;
+       }
+
+       if (record->xl_len > SizeOfBtreeVacuum)
+       {
+               OffsetNumber *unused;
+               OffsetNumber *unend;
+
+               unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeVacuum);
+               unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
+
+               PageIndexMultiDelete(page, unused, unend - unused);
+       }
+
+       /*
+        * Mark the page as not containing any LP_DEAD items --- see comments in
+        * _bt_delitems().
+        */
+       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+       opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+       PageSetLSN(page, lsn);
+       PageSetTLI(page, ThisTimeLineID);
+       MarkBufferDirty(buffer);
+       UnlockReleaseBuffer(buffer);
+}
+
  static void
  btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
  {
@@ -470,6 +554,11 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
                 return;
  
         xlrec = (xl_btree_delete *) XLogRecGetData(record);
+
+       /*
+        * We don't need to take a cleanup lock to apply these changes.
+        * See nbtree/README for details.
+        */
         buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
         if (!BufferIsValid(buffer))
                 return;
@@ -714,6 +803,44 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
  {
         uint8           info = record->xl_info & ~XLR_INFO_MASK;
  
+       /*
+        * Btree delete records can conflict with standby queries. You might
+        * think that vacuum records would conflict as well, but they don't.
+        * XLOG_HEAP2_CLEANUP_INFO records provide the highest xid cleaned
+        * by the vacuum of the heap and so we can resolve any conflicts just
+        * once when that arrives. After that we know that no conflicts exist
+        * from individual btree vacuum records on that index.
+        */
+       if (InHotStandby)
+       {
+               RecordKnownAssignedTransactionIds(record->xl_xid);
+               if (info == XLOG_BTREE_DELETE)
+               {
+                       xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
+                       VirtualTransactionId *backends;
+
+                       /*
+                        * XXX Currently we put everybody on death row, because
+                        * currently _bt_delitems() supplies InvalidTransactionId.
+                        * This can be fairly painful, so providing a better value
+                        * here is worth some thought and possibly some effort to
+                        * improve.
+                        */
+                       backends = GetConflictingVirtualXIDs(xlrec->latestRemovedXid,
+                                                                       InvalidOid,
+                                                                       true);
+
+                       /* label the conflict by its real source (presumably used in messages) */
+                       ResolveRecoveryConflictWithVirtualXIDs(backends, "b-tree delete",
+                                                                       CONFLICT_MODE_ERROR_DEFERRABLE,
+                                                                       lsn);
+               }
+       }
+
+       /*
+        * Exclusive lock on a btree block is as good as a Cleanup lock,
+        * so no need to special case btree delete and vacuum.
+        */
         RestoreBkpBlocks(lsn, record, false);
  
         switch (info)
@@ -739,6 +866,9 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
                 case XLOG_BTREE_SPLIT_R_ROOT:
                         btree_xlog_split(false, true, lsn, record);
                         break;
+               case XLOG_BTREE_VACUUM:
+                       btree_xlog_vacuum(lsn, record);
+                       break;
                 case XLOG_BTREE_DELETE:
                         btree_xlog_delete(lsn, record);
                         break;
@@ -843,13 +973,24 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
                                                                  xlrec->level, xlrec->firstright);
                                 break;
                         }
+               case XLOG_BTREE_VACUUM:
+                       {
+                               xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec;
+
+                               appendStringInfo(buf, "vacuum: rel %u/%u/%u; blk %u, lastBlockVacuumed %u",
+                                                                xlrec->node.spcNode, xlrec->node.dbNode,
+                                                                xlrec->node.relNode, xlrec->block,
+                                                                xlrec->lastBlockVacuumed);
+                               break;
+                       }
                 case XLOG_BTREE_DELETE:
                         {
                                 xl_btree_delete *xlrec = (xl_btree_delete *) rec;
  
-                               appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u",
+                               appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u, latestRemovedXid %u",
                                                                  xlrec->node.spcNode, xlrec->node.dbNode,
-                                                                xlrec->node.relNode, xlrec->block);
+                                                                xlrec->node.relNode, xlrec->block,
+                                                                xlrec->latestRemovedXid);
                                 break;
                         }
                 case XLOG_BTREE_DELETE_PAGE:
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README

index a88563e3357d4a259592eefcce4d390c8063eb3d..fc7ecfd45dd0c4c2371a471208fefb49adfda386 100644 (file)
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -195,7 +195,8 @@ they first do something that requires one --- typically, insert/update/delete
  a tuple, though there are a few other places that need an XID assigned.
  If a subtransaction requires an XID, we always first assign one to its
  parent.  This maintains the invariant that child transactions have XIDs later
-than their parents, which is assumed in a number of places.
+than their parents, which is assumed in a number of places. In 8.5 onwards,
+some corner cases exist that require XID assignment to be WAL logged.
  
  The subsidiary actions of obtaining a lock on the XID and and entering it into
  pg_subtrans and PG_PROC are done at the time it is assigned.
@@ -649,3 +650,34 @@ fsync it down to disk without any sort of interlock, as soon as it finishes
  the bulk update.  However, all these paths are designed to write data that
  no other transaction can see until after T1 commits.  The situation is thus
  not different from ordinary WAL-logged updates.
+
+Transaction Emulation during Recovery
+-------------------------------------
+
+During Recovery we replay transaction changes in the order they occurred.
+As part of this replay we emulate some transactional behaviour, so that
+read only backends can take MVCC snapshots. We do this by maintaining a
+list of XIDs belonging to transactions that are being replayed, so that
+each transaction that has recorded WAL records for database writes exists
+in the array until it commits. Further details are given in comments in
+procarray.c.
+
+Many actions write no WAL records at all, for example read only transactions.
+These have no effect on MVCC in recovery and we can pretend they never
+occurred at all. Subtransaction commit does not write a WAL record either
+and has very little effect, since lock waiters need to wait for the
+parent transaction to complete.
+
+Not all transactional behaviour is emulated, for example we do not insert
+a transaction entry into the lock table, nor do we maintain the transaction
+stack in memory. Clog entries are made normally. Multitrans is not maintained
+because its purpose is to record tuple level locks that an application has
+requested to prevent write locks. Since write locks cannot be obtained at all,
+there is never any conflict and so there is no reason to update multitrans.
+Subtrans is maintained during recovery but the details of the transaction
+tree are ignored and all subtransactions reference the top-level TransactionId
+directly. Since commit is atomic this provides correct lock wait behaviour
+yet simplifies emulation of subtransactions considerably.
+
+Further details on locking mechanics in recovery are given in comments
+with the Lock rmgr code.
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c

index 8c1ccb69b3305b862bf6191d1c0447109ae4f057..521e41d10e72c0dd0239b7a23f229ee6ed0bc104 100644 (file)
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -35,6 +35,8 @@
  #include "access/clog.h"
  #include "access/slru.h"
  #include "access/transam.h"
+#include "access/xact.h"
+#include "access/xlogutils.h"
  #include "pg_trace.h"
  #include "postmaster/bgwriter.h"
  
@@ -688,6 +690,9 @@ clog_redo(XLogRecPtr lsn, XLogRecord *record)
         /* Backup blocks are not used in clog records */
         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
  
+       if (InHotStandby)
+               RecordKnownAssignedTransactionIds(record->xl_xid);
+
         if (info == CLOG_ZEROPAGE)
         {
                 int                     pageno;
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c

index 6f86961b887eb00a860c9916fa9e180e7448f804..2802cce6438de32bc22c833847def3ef30e62ef7 100644 (file)
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1413,8 +1413,11 @@ ZeroMultiXactMemberPage(int pageno, bool writeXlog)
   * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact.     Note that we
   * may already have replayed WAL data into the SLRU files.
   *
- * We don't need any locks here, really; the SLRU locks are taken
- * only because slru.c expects to be called with locks held.
+ * We want this operation to be atomic to ensure that other processes can
+ * use MultiXact while we complete recovery. We access one page only from the
+ * offset and members buffers, so once locks are acquired they will not be
+ * dropped and re-acquired by SLRU code. So we take both locks at start, then
+ * hold them all the way to the end.
   */
  void
  StartupMultiXact(void)
@@ -1426,6 +1429,7 @@ StartupMultiXact(void)
  
         /* Clean up offsets state */
         LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
+       LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
  
         /*
          * Initialize our idea of the latest page number.
@@ -1452,10 +1456,7 @@ StartupMultiXact(void)
                 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
         }
  
-       LWLockRelease(MultiXactOffsetControlLock);
-
         /* And the same for members */
-       LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
  
         /*
          * Initialize our idea of the latest page number.
@@ -1483,6 +1484,7 @@ StartupMultiXact(void)
         }
  
         LWLockRelease(MultiXactMemberControlLock);
+       LWLockRelease(MultiXactOffsetControlLock);
  
         /*
          * Initialize lastTruncationPoint to invalid, ensuring that the first
@@ -1542,6 +1544,7 @@ CheckPointMultiXact(void)
          * isn't valid (because StartupMultiXact hasn't been called yet) and so
          * SimpleLruTruncate would get confused.  It seems best not to risk
          * removing any data during recovery anyway, so don't truncate.
+        * We are executing in the bgwriter, so we must access shared status.
          */
         if (!RecoveryInProgress())
                 TruncateMultiXact();
@@ -1873,6 +1876,9 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record)
         /* Backup blocks are not used in multixact records */
         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
  
+       if (InHotStandby)
+               RecordKnownAssignedTransactionIds(record->xl_xid);
+
         if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
         {
                 int                     pageno;
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c

index 0273b0e153c108fcc500e4f4f7bf1f9bb2086574..252f4ee3f82351549c4e33530525e039bd8905d4 100644 (file)
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -20,6 +20,7 @@
  #include "commands/dbcommands.h"
  #include "commands/sequence.h"
  #include "commands/tablespace.h"
+#include "storage/sinval.h"
  #include "storage/freespace.h"
  
  
@@ -32,7 +33,7 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = {
         {"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL},
         {"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL},
         {"Reserved 7", NULL, NULL, NULL, NULL, NULL},
-       {"Reserved 8", NULL, NULL, NULL, NULL, NULL},
+       {"Relation", relation_redo, relation_desc, NULL, NULL, NULL},
         {"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL},
         {"Heap", heap_redo, heap_desc, NULL, NULL, NULL},
         {"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint},
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c

index 68e38696fb5bfe2f8437d127ff273c4cde77170b..3f890872a57f4c50cc97e629df0772291607f156 100644 (file)
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -598,7 +598,8 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
          * commands to set the commit status of transactions whose bits are in
          * already-truncated segments of the commit log (see notes in
          * SlruPhysicalWritePage).      Hence, if we are InRecovery, allow the case
-        * where the file doesn't exist, and return zeroes instead.
+        * where the file doesn't exist, and return zeroes instead. We also
+        * return a zeroed page when the seek or the read fails.
          */
         fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
         if (fd < 0)
@@ -619,6 +620,14 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
  
         if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
         {
+               if (InRecovery)
+               {
+                       ereport(LOG,
+                                       (errmsg("file \"%s\" doesn't exist, reading as zeroes", path)));
+                       MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+                       close(fd);      /* don't leak the descriptor on this early return */
+                       return true;
+               }
                 slru_errcause = SLRU_SEEK_FAILED;
                 slru_errno = errno;
                 close(fd);
@@ -628,6 +637,14 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
         errno = 0;
         if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
         {
+               if (InRecovery)
+               {
+                       ereport(LOG,
+                                       (errmsg("file \"%s\" doesn't exist, reading as zeroes", path)));
+                       MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+                       close(fd);      /* don't leak the descriptor on this early return */
+                       return true;
+               }
                 slru_errcause = SLRU_READ_FAILED;
                 slru_errno = errno;
                 close(fd);
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c

index 0dbd2166be738393fa2eeb0bf46caa6555bc5100..e9b3fbc816a71ac8ab427e2d91836f2a6d2f1693 100644 (file)
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -31,6 +31,7 @@
  #include "access/slru.h"
  #include "access/subtrans.h"
  #include "access/transam.h"
+#include "miscadmin.h"
  #include "pg_trace.h"
  #include "utils/snapmgr.h"
  
@@ -44,7 +45,8 @@
   * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE, and segment numbering at
   * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE/SLRU_SEGMENTS_PER_PAGE.  We need take no
   * explicit notice of that fact in this module, except when comparing segment
- * and page numbers in TruncateSUBTRANS (see SubTransPagePrecedes).
+ * and page numbers in TruncateSUBTRANS (see SubTransPagePrecedes)
+ * and in recovery when we do ExtendSUBTRANS.
   */
  
  /* We need four bytes per xact */
@@ -83,8 +85,12 @@ SubTransSetParent(TransactionId xid, TransactionId parent)
         ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
         ptr += entryno;
  
-       /* Current state should be 0 */
-       Assert(*ptr == InvalidTransactionId);
+       /*
+        * Current state should be 0, except in recovery where we may
+        * need to reset the value multiple times
+        */
+       Assert(*ptr == InvalidTransactionId ||
+                       (InRecovery && *ptr == parent));
  
         *ptr = parent;
  
@@ -223,33 +229,19 @@ ZeroSUBTRANSPage(int pageno)
  /*
   * This must be called ONCE during postmaster or standalone-backend startup,
   * after StartupXLOG has initialized ShmemVariableCache->nextXid.
- *
- * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid
- * if there are none.
   */
  void
  StartupSUBTRANS(TransactionId oldestActiveXID)
  {
-       int                     startPage;
-       int                     endPage;
+       TransactionId xid = ShmemVariableCache->nextXid;
+       int                     pageno = TransactionIdToPage(xid);
  
-       /*
-        * Since we don't expect pg_subtrans to be valid across crashes, we
-        * initialize the currently-active page(s) to zeroes during startup.
-        * Whenever we advance into a new page, ExtendSUBTRANS will likewise zero
-        * the new page without regard to whatever was previously on disk.
-        */
         LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
  
-       startPage = TransactionIdToPage(oldestActiveXID);
-       endPage = TransactionIdToPage(ShmemVariableCache->nextXid);
-
-       while (startPage != endPage)
-       {
-               (void) ZeroSUBTRANSPage(startPage);
-               startPage++;
-       }
-       (void) ZeroSUBTRANSPage(startPage);
+       /*
+        * Initialize our idea of the latest page number.
+        */
+       SubTransCtl->shared->latest_page_number = pageno;
  
         LWLockRelease(SubtransControlLock);
  }
@@ -302,16 +294,42 @@ void
  ExtendSUBTRANS(TransactionId newestXact)
  {
         int                     pageno;
+       static int last_pageno = 0;
  
-       /*
-        * No work except at first XID of a page.  But beware: just after
-        * wraparound, the first XID of page zero is FirstNormalTransactionId.
-        */
-       if (TransactionIdToEntry(newestXact) != 0 &&
-               !TransactionIdEquals(newestXact, FirstNormalTransactionId))
-               return;
+       Assert(TransactionIdIsNormal(newestXact));
  
-       pageno = TransactionIdToPage(newestXact);
+       if (!InRecovery)
+       {
+               /*
+                * No work except at first XID of a page.  But beware: just after
+                * wraparound, the first XID of page zero is FirstNormalTransactionId.
+                */
+               if (TransactionIdToEntry(newestXact) != 0 &&
+                       !TransactionIdEquals(newestXact, FirstNormalTransactionId))
+                       return;
+
+               pageno = TransactionIdToPage(newestXact);
+       }
+       else
+       {
+               /*
+                * InRecovery we keep track of the last page we extended, so
+                * we can compare that against incoming XIDs. This will only
+                * ever be run by startup process, so keep it as a static variable
+                * rather than hiding behind the SubtransControlLock.
+                */
+               pageno = TransactionIdToPage(newestXact);
+
+               if (pageno == last_pageno ||
+                       SubTransPagePrecedes(pageno, last_pageno))
+                       return;
+
+               ereport(trace_recovery(DEBUG1),
+                               (errmsg("extend subtrans  xid %u page %d last_page %d",
+                                               newestXact, pageno, last_pageno)));
+
+               last_pageno = pageno;
+       }
  
         LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
  
diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c

index 2a1eab4d16c0a9b3a899340aa456cf80e9a9531b..6fb2d3f7296c797c83b2d2eb3e61421fc8b86fc9 100644 (file)
--- a/src/backend/access/transam/transam.c
+++ b/src/backend/access/transam/transam.c
@@ -35,9 +35,6 @@ static TransactionId cachedFetchXid = InvalidTransactionId;
  static XidStatus cachedFetchXidStatus;
  static XLogRecPtr cachedCommitLSN;
  
-/* Handy constant for an invalid xlog recptr */
-static const XLogRecPtr InvalidXLogRecPtr = {0, 0};
-
  /* Local functions */
  static XidStatus TransactionLogFetch(TransactionId transactionId);
  
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c

index 195c90c794ba5d7bd7ff01e0d0fc3891cfb6af9a..54c9ff4006bf8df9de8de6ecb66002ecf448c41d 100644 (file)
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -1694,6 +1694,34 @@ RecoverPreparedTransactions(void)
         FreeDir(cldir);
  }
  
+void
+ProcessTwoPhaseStandbyRecords(TransactionId xid)
+{
+       char       *buf;
+       char       *bufptr;
+       TwoPhaseFileHeader *hdr;
+
+       /* Read and validate file, if possible */
+       buf = ReadTwoPhaseFile(xid);
+       if (buf != NULL)
+       {
+               /* Deconstruct header */
+               hdr = (TwoPhaseFileHeader *) buf;
+               Assert(TransactionIdEquals(hdr->xid, xid));
+               bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
+               bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId));
+               bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode));
+               bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode));
+
+               /*
+                * Recover other state using resource managers
+                */
+               ProcessRecords(bufptr, xid, twophase_postcommit_standby_callbacks);
+
+               pfree(buf);
+       }
+}
+
  /*
   *     RecordTransactionCommitPrepared
   *
@@ -1723,8 +1751,11 @@ RecordTransactionCommitPrepared(TransactionId xid,
         /* Emit the XLOG commit record */
         xlrec.xid = xid;
         xlrec.crec.xact_time = GetCurrentTimestamp();
+       xlrec.crec.xinfo = 0;
+       xlrec.crec.nmsgs = 0;
         xlrec.crec.nrels = nrels;
         xlrec.crec.nsubxacts = nchildren;
+
         rdata[0].data = (char *) (&xlrec);
         rdata[0].len = MinSizeOfXactCommitPrepared;
         rdata[0].buffer = InvalidBuffer;
diff --git a/src/backend/access/transam/twophase_rmgr.c b/src/backend/access/transam/twophase_rmgr.c

index 4ff95499891f863a21bddd066710e81e69746b78..77d3395e7cc92e75a36885ff76606a5b4286a831 100644 (file)
--- a/src/backend/access/transam/twophase_rmgr.c
+++ b/src/backend/access/transam/twophase_rmgr.c
@@ -20,6 +20,14 @@
  #include "storage/lock.h"
  #include "utils/inval.h"
  
+const TwoPhaseCallback twophase_postcommit_standby_callbacks[TWOPHASE_RM_MAX_ID + 1] =
+{
+       NULL,                                           /* END ID */
+       NULL,                                           /* Lock - see notes in xact_redo_commit() */
+       inval_twophase_postcommit,      /* Inval - see notes in xact_redo_commit() */
+       NULL,                                           /* notify/listen doesn't work in recovery */
+       NULL                                            /* pgstat doesn't record recovery work */
+};
  
  const TwoPhaseCallback twophase_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] =
  {
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c

index c3606e09860be62fe5f3a4b0849f0f65231fe71a..8c45738f147a7528516cd553ba1946b674358105 100644 (file)
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -40,6 +40,7 @@
  #include "storage/fd.h"
  #include "storage/lmgr.h"
  #include "storage/procarray.h"
+#include "storage/sinval.h"
  #include "storage/sinvaladt.h"
  #include "storage/smgr.h"
  #include "utils/combocid.h"
@@ -135,10 +136,13 @@ typedef struct TransactionStateData
         ResourceOwner curTransactionOwner;      /* my query resources */
         TransactionId *childXids;       /* subcommitted child XIDs, in XID order */
         int                     nChildXids;             /* # of subcommitted child XIDs */
+       int                     nReportedChildXids;
         int                     maxChildXids;   /* allocated size of childXids[] */
         Oid                     prevUser;               /* previous CurrentUserId setting */
         bool            prevSecDefCxt;  /* previous SecurityDefinerContext setting */
         bool            prevXactReadOnly;               /* entry-time xact r/o state */
+       bool            startedInRecovery;      /* did we start in recovery? */
+       bool            reportedXid;
         struct TransactionStateData *parent;            /* back link to parent */
  } TransactionStateData;
  
@@ -163,10 +167,13 @@ static TransactionStateData TopTransactionStateData = {
         NULL,                                           /* cur transaction resource owner */
         NULL,                                           /* subcommitted child Xids */
         0,                                                      /* # of subcommitted child Xids */
+       0,                                                      /* # of reported child Xids */
         0,                                                      /* allocated size of childXids[] */
         InvalidOid,                                     /* previous CurrentUserId setting */
         false,                                          /* previous SecurityDefinerContext setting */
         false,                                          /* entry-time xact r/o state */
+       false,                                          /* startedInRecovery */
+       false,                                          /* reportedXid */
         NULL                                            /* link to parent state block */
  };
  
@@ -209,6 +216,11 @@ static bool forceSyncCommit = false;
   */
  static MemoryContext TransactionAbortContext = NULL;
  
+/*
+ * Local state to optimise recovery conflict resolution
+ */
+static TransactionId   latestRemovedXid = InvalidTransactionId;
+
  /*
   * List of add-on start- and end-of-xact callbacks
   */
@@ -274,6 +286,9 @@ static const char *BlockStateAsString(TBlockState blockState);
  static const char *TransStateAsString(TransState state);
  
  
+static TransactionId *xactGetUnreportedChildren(int threshold, int *nxids);
+static TransactionId *xactCollectUnreportedChildren(TransactionState s, TransactionId *xids);
+
  /* ----------------------------------------------------------------
   *     transaction state accessors
   * ----------------------------------------------------------------
@@ -392,6 +407,9 @@ AssignTransactionId(TransactionState s)
         bool            isSubXact = (s->parent != NULL);
         ResourceOwner currentOwner;
  
+       if (RecoveryInProgress())
+               elog(ERROR, "cannot assign TransactionIds during recovery");
+
         /* Assert that caller didn't screw up */
         Assert(!TransactionIdIsValid(s->transactionId));
         Assert(s->state == TRANS_INPROGRESS);
@@ -435,8 +453,54 @@ AssignTransactionId(TransactionState s)
         }
         PG_END_TRY();
         CurrentResourceOwner = currentOwner;
-}
  
+       /*
+        * Every PGPROC_MAX_CACHED_SUBXIDS assigned transaction ids within each
+        * top-level transaction we issue a WAL record for the assignment. We
+        * include the top-level xid and all the subxids that have not yet been
+        * reported using XLOG_XACT_ASSIGNMENT records.
+        *
+        * This is required to ensure snapshots taken during recovery do not
+        * overflow. See notes for RecordKnownAssignedTransactionIds().
+        *
+        * We don't actually keep track of the immediate parent of each subxid,
+        * only the top-level transaction that each subxact belongs to. This
+        * is correct in recovery only because aborted subtransactions are
+        * separately WAL logged.
+        */
+       if (isSubXact && XLogArchivingActive())
+       {
+               int nchildren;
+               TransactionId *children;
+               children = xactGetUnreportedChildren(PGPROC_MAX_CACHED_SUBXIDS, &nchildren);
+
+               if (children != NULL)
+               {
+                       XLogRecData rdata[2];
+                       xl_xact_assignment      xlrec;
+
+                       /*
+                        * We say "IfAny" to avoid recursion again here.
+                        */
+                       xlrec.xtop = GetTopTransactionIdIfAny();
+                       Assert(TransactionIdIsValid(xlrec.xtop));
+
+                       xlrec.nsubxacts = nchildren;
+                       
+                       rdata[0].data = (char *) (&xlrec);
+                       rdata[0].len = MinSizeOfXactAssignment;
+                       rdata[0].buffer = InvalidBuffer;
+                       rdata[0].next = &rdata[1];
+
+                       rdata[1].data = (char *) children;
+                       rdata[1].len = sizeof(TransactionId) * nchildren;
+                       rdata[1].buffer = InvalidBuffer;
+                       rdata[1].next = NULL;
+
+                       (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT, rdata);
+               }
+       }
+}
  
  /*
   *     GetCurrentSubTransactionId
@@ -596,6 +660,16 @@ TransactionIdIsCurrentTransactionId(TransactionId xid)
         return false;
  }
  
+/*
+ *     TransactionStartedDuringRecovery, used during index scans
+ */
+bool
+TransactionStartedDuringRecovery(void)
+{
+       TransactionState s = CurrentTransactionState;
+
+       return s->startedInRecovery;
+}
  
  /*
   *     CommandCounterIncrement
@@ -811,7 +885,7 @@ AtSubStart_ResourceOwner(void)
   * This is exported only to support an ugly hack in VACUUM FULL.
   */
  TransactionId
-RecordTransactionCommit(void)
+RecordTransactionCommit(bool isVacuumFull)
  {
         TransactionId xid = GetTopTransactionIdIfAny();
         bool            markXidCommitted = TransactionIdIsValid(xid);
@@ -821,11 +895,15 @@ RecordTransactionCommit(void)
         bool            haveNonTemp;
         int                     nchildren;
         TransactionId *children;
+       int                     nmsgs;
+       SharedInvalidationMessage *invalMessages = NULL;
+       bool            RelcacheInitFileInval;
  
         /* Get data needed for commit record */
         nrels = smgrGetPendingDeletes(true, &rels, &haveNonTemp);
         nchildren = xactGetCommittedChildren(&children);
-
+       nmsgs = xactGetCommittedInvalidationMessages(&invalMessages,
+                                                                                                &RelcacheInitFileInval);
         /*
          * If we haven't been assigned an XID yet, we neither can, nor do we want
          * to write a COMMIT record.
@@ -859,13 +937,24 @@ RecordTransactionCommit(void)
                 /*
                  * Begin commit critical section and insert the commit XLOG record.
                  */
-               XLogRecData rdata[3];
+               XLogRecData rdata[4];
                 int                     lastrdata = 0;
                 xl_xact_commit xlrec;
  
                 /* Tell bufmgr and smgr to prepare for commit */
                 BufmgrCommit();
  
+               /*
+                * Set flags required for recovery processing of commits.
+                * Nothing too important here that we would want to include this
+                * within the critical section following.
+                */
+               xlrec.xinfo = 0;
+               if (RelcacheInitFileInval)
+                       xlrec.xinfo |= XACT_COMPLETION_UPDATE_RELCACHE_FILE;
+               if (isVacuumFull)
+                       xlrec.xinfo |= XACT_COMPLETION_VACUUM_FULL;
+
                 /*
                  * Mark ourselves as within our "commit critical section".      This
                  * forces any concurrent checkpoint to wait until we've updated
@@ -890,6 +979,8 @@ RecordTransactionCommit(void)
                 xlrec.xact_time = xactStopTimestamp;
                 xlrec.nrels = nrels;
                 xlrec.nsubxacts = nchildren;
+               xlrec.nmsgs = nmsgs;
+
                 rdata[0].data = (char *) (&xlrec);
                 rdata[0].len = MinSizeOfXactCommit;
                 rdata[0].buffer = InvalidBuffer;
@@ -911,6 +1002,15 @@ RecordTransactionCommit(void)
                         rdata[2].buffer = InvalidBuffer;
                         lastrdata = 2;
                 }
+               /* dump shared cache invalidation messages */
+               if (nmsgs > 0)
+               {
+                       rdata[lastrdata].next = &(rdata[3]);
+                       rdata[3].data = (char *) invalMessages;
+                       rdata[3].len = nmsgs * sizeof(SharedInvalidationMessage);
+                       rdata[3].buffer = InvalidBuffer;
+                       lastrdata = 3;
+               }
                 rdata[lastrdata].next = NULL;
  
                 (void) XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, rdata);
@@ -1144,6 +1244,8 @@ AtSubCommit_childXids(void)
         s->childXids = NULL;
         s->nChildXids = 0;
         s->maxChildXids = 0;
+       s->nReportedChildXids = 0;
+       s->reportedXid = false;
  }
  
  /* ----------------------------------------------------------------
@@ -1352,6 +1454,8 @@ AtSubAbort_childXids(void)
         s->childXids = NULL;
         s->nChildXids = 0;
         s->maxChildXids = 0;
+       s->nReportedChildXids = 0;
+       s->reportedXid = false;
  }
  
  /* ----------------------------------------------------------------
@@ -1521,7 +1625,10 @@ StartTransaction(void)
         s->gucNestLevel = 1;
         s->childXids = NULL;
         s->nChildXids = 0;
+       s->nReportedChildXids = 0;
+       s->reportedXid = false;
         s->maxChildXids = 0;
+       s->startedInRecovery = RecoveryInProgress();
         GetUserIdAndContext(&s->prevUser, &s->prevSecDefCxt);
         /* SecurityDefinerContext should never be set outside a transaction */
         Assert(!s->prevSecDefCxt);
@@ -1619,7 +1726,7 @@ CommitTransaction(void)
         /*
          * Here is where we really truly commit.
          */
-       latestXid = RecordTransactionCommit();
+       latestXid = RecordTransactionCommit(false);
  
         TRACE_POSTGRESQL_TRANSACTION_COMMIT(MyProc->lxid);
  
@@ -1717,6 +1824,8 @@ CommitTransaction(void)
         s->childXids = NULL;
         s->nChildXids = 0;
         s->maxChildXids = 0;
+       s->nReportedChildXids = 0;
+       s->reportedXid = false;
  
         /*
          * done with commit processing, set current transaction state back to
@@ -1950,6 +2059,8 @@ PrepareTransaction(void)
         s->childXids = NULL;
         s->nChildXids = 0;
         s->maxChildXids = 0;
+       s->nReportedChildXids = 0;
+       s->reportedXid = false;
  
         /*
          * done with 1st phase commit processing, set current transaction state
@@ -2120,6 +2231,8 @@ CleanupTransaction(void)
         s->childXids = NULL;
         s->nChildXids = 0;
         s->maxChildXids = 0;
+       s->nReportedChildXids = 0;
+       s->reportedXid = false;
  
         /*
          * done with abort processing, set current transaction state back to
@@ -4194,33 +4307,335 @@ xactGetCommittedChildren(TransactionId **ptr)
         return s->nChildXids;
  }
  
+static TransactionId *
+xactGetUnreportedChildren(int threshold, int *nxids)
+{
+       TransactionState s;
+       int nTotalUnreportedXids = 0;
+       TransactionId *xids;
+
+       /* Count unreported xids, walking from the current state up through its parents */
+       for (s = CurrentTransactionState; s != NULL; s = s->parent)
+       {
+               if (!s->reportedXid)    /* NOTE(review): also counts the top-level state, whose xid the collector never emits - confirm */
+                       nTotalUnreportedXids++;
+               nTotalUnreportedXids += s->nChildXids - s->nReportedChildXids;  /* children not yet reported */
+               if (s->reportedXid)
+                       break;          /* stop at the first state already reported */
+       }
+
+       *nxids = nTotalUnreportedXids;  /* set even when NULL is returned below */
+
+       if (nTotalUnreportedXids < threshold)
+               return NULL;            /* below threshold: no array is built */
+
+       xids = (TransactionId *) palloc(sizeof(TransactionId) * nTotalUnreportedXids);
+       xactCollectUnreportedChildren(CurrentTransactionState, xids);   /* fills xids[] and marks entries reported */
+       return xids;                    /* palloc'd here; this function does not free it */
+}
+
+/* Helper for xactGetUnreportedChildren: append unreported xids, mark them reported, return next free slot */
+static TransactionId *
+xactCollectUnreportedChildren(TransactionState s, TransactionId *xids)
+{
+       int nUnreportedChildXids;
+
+       if (s->parent != NULL)  /* a state without a parent never has its own xid emitted */
+       {
+               xids = xactCollectUnreportedChildren(s->parent, xids);  /* ancestors first */
+               if (!s->reportedXid)
+               {
+                       s->reportedXid = true;
+                       *(xids++) = s->transactionId;
+               }
+       }
+
+       nUnreportedChildXids = s->nChildXids - s->nReportedChildXids;
+       memcpy(xids, &s->childXids[s->nReportedChildXids],      /* NOTE(review): childXids may be NULL when the count is 0 - confirm */
+                  nUnreportedChildXids * sizeof(TransactionId));
+       xids += nUnreportedChildXids;
+
+       s->nReportedChildXids = s->nChildXids;  /* every child copied so far now counts as reported */
+
+       return xids;
+}
+
+/*
+ * Record an enhanced snapshot of running transactions into WAL.
+ *
+ * The definitions of RunningTransactionData and xl_xact_running_xacts
+ * are similar. We keep them separate because xl_xact_running_xacts
+ * is a contiguous chunk of memory and never exists fully until it is
+ * assembled in WAL. Returns the XLogInsert() result for the new record.
+ */
+XLogRecPtr
+LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
+{
+       xl_xact_running_xacts   xlrec;
+       XLogRecData                     rdata[4];
+       int                                             lastrdata = 0;  /* index of the last rdata entry linked so far */
+
+       xlrec.xcnt = CurrRunningXacts->xcnt;
+       xlrec.subxcnt = CurrRunningXacts->subxcnt;
+       xlrec.numLocks = CurrRunningXacts->numLocks;
+       xlrec.lock_overflow = CurrRunningXacts->lock_overflow;
+       xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow;
+       xlrec.latestRunningXid = CurrRunningXacts->latestRunningXid;
+       xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
+       xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
+
+       /* Header; always present, its .next is set by whichever entry follows */
+       rdata[0].data = (char *) (&xlrec);
+       rdata[0].len = MinSizeOfXactRunningXacts;
+       rdata[0].buffer = InvalidBuffer;
+
+       /* array of RunningXact, linked only when xcnt > 0 */
+       if (xlrec.xcnt > 0)
+       {
+               rdata[0].next = &(rdata[1]);
+               rdata[1].data = (char *) CurrRunningXacts->xrun;
+               rdata[1].len = xlrec.xcnt * sizeof(RunningXact);
+               rdata[1].buffer = InvalidBuffer;
+               lastrdata = 1;
+       }
+
+       /* array of TransactionIds, linked only when subxcnt > 0 */
+       if (xlrec.subxcnt > 0)
+       {
+               rdata[lastrdata].next = &(rdata[2]);
+               rdata[2].data = (char *) CurrRunningXacts->subxip;
+               rdata[2].len = xlrec.subxcnt * sizeof(TransactionId);
+               rdata[2].buffer = InvalidBuffer;
+               lastrdata = 2;
+       }
+
+       /* array of Locks, linked only when numLocks > 0 */
+       if (xlrec.numLocks > 0)
+       {
+               rdata[lastrdata].next = &(rdata[3]);
+               rdata[3].data = (char *) CurrRunningXacts->loggableLocks;
+               rdata[3].len = xlrec.numLocks * sizeof(xl_rel_lock);
+               rdata[3].buffer = InvalidBuffer;
+               lastrdata = 3;
+       }
+
+       rdata[lastrdata].next = NULL;   /* terminate the chain at the last linked entry */
+
+       return XLogInsert(RM_XACT_ID, XLOG_XACT_RUNNING_XACTS, rdata);
+}
+
+/*
+ * We need to issue shared invalidations and hold locks. Holding locks
+ * means others may want to wait on us, so we need to make lock table
+ * inserts appear like a transaction. We could create and delete
+ * lock table entries for each transaction but it's simpler just to create
+ * one permanent entry and leave it there all the time. Locks are then
+ * acquired and released as needed. Yes, this means you can see the
+ * Startup process in pg_locks once we have run this.
+ */
+void
+InitRecoveryTransactionEnvironment(void)
+{
+       VirtualTransactionId vxid;
+
+       /*
+        * Initialise shared invalidation management for Startup process,
+        * being careful to register ourselves as a sendOnly process so
+        * we don't need to read messages, nor will we get signalled
+        * when the queue starts filling up.
+        */
+       SharedInvalBackendInit(true);   /* true = the sendOnly registration described above */
+
+       /*
+        * Record the PID and PGPROC structure of the startup process.
+        */
+       PublishStartupProcessInformation();
+
+       /*
+        * Lock a virtual transaction id for Startup process.
+        *
+        * We need to do GetNextLocalTransactionId() because
+        * SharedInvalBackendInit() leaves localTransactionid invalid and
+        * the lock manager doesn't like that at all.
+        *
+        * Note that we don't need to run XactLockTableInsert() because nobody
+        * needs to wait on xids. That sounds a little strange, but table locks
+        * are held by vxids and row level locks are held by xids. All queries
+        * hold AccessShareLocks so never block while we write or lock new rows.
+        */
+       vxid.backendId = MyBackendId;
+       vxid.localTransactionId = GetNextLocalTransactionId();
+       VirtualXactLockTableInsert(vxid);
+}
+
+void
+XactClearRecoveryTransactions(void)
+{
+       /*
+        * Remove entries from shared data structures: known-assigned xids first, then recovery locks
+        */
+       ExpireAllKnownAssignedTransactionIds();
+       RelationReleaseAllRecoveryLocks();
+}
+
+/*
+ * LatestRemovedXidAdvances - stores latestXid and returns true if no value is
+ *             stored yet or the stored latestRemovedXid precedes it; else returns false
+ */
+bool
+LatestRemovedXidAdvances(TransactionId latestXid)
+{
+       /*
+        * Don't bother checking for conflicts for cleanup records earlier than
+        * we have already tested for.
+        */
+       if (!TransactionIdIsValid(latestRemovedXid) ||
+               (TransactionIdIsValid(latestRemovedXid) &&      /* always true here: the || already handled the invalid case */
+               TransactionIdPrecedes(latestRemovedXid, latestXid)))
+       {
+               latestRemovedXid = latestXid;   /* remember the newest value tested so far */
+               return true;
+       }
+
+       return false;
+}
+
  /*
   *     XLOG support routines
   */
  
+/*
+ * Before 8.5 this was a fairly short function, but now it performs many
+ * actions for which the order of execution is critical.
+ */
  static void
-xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid)
+xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid, 
+                                       XLogRecPtr lsn, bool preparedXact)
  {
         TransactionId *sub_xids;
         TransactionId max_xid;
         int                     i;
  
-       /* Mark the transaction committed in pg_clog */
         sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
-       TransactionIdCommitTree(xid, xlrec->nsubxacts, sub_xids);
  
-       /* Make sure nextXid is beyond any XID mentioned in the record */
-       max_xid = xid;
-       for (i = 0; i < xlrec->nsubxacts; i++)
+       max_xid = TransactionIdLatest(xid, xlrec->nsubxacts, sub_xids);
+
+       /* 
+        * Mark the transaction committed in pg_clog.
+        */
+       if (!InHotStandby)
+               TransactionIdCommitTree(xid, xlrec->nsubxacts, sub_xids);
+       else
         {
-               if (TransactionIdPrecedes(max_xid, sub_xids[i]))
-                       max_xid = sub_xids[i];
+               /*
+                * Just when you thought it was safe to go swimming again,
+                * along comes a nasty hack with bells on. Half way through
+                * VACUUM FULL it emits a false commit record, so it ends up
+                * emitting two commit records with the same xid. Oh, and it
+                * mustn't release locks at the first commit either. So we
+                * have to specially mark the commit record "ignore me".
+                * On primary it actually marks clog committed yet stays
+                * visible in procarray. Cthulhu fhtagn. Run away screaming.
+                */
+               if (XactCompletionVacuumFull(xlrec))
+               {
+                       elog(trace_recovery(DEBUG4), "skipping VACUUM FULL pseudo-commit %u", xid);
+                       return;
+               }
+
+               /*
+                * Record any newly known assigned transactions. This looks
+                * strange to add xids and then immediately remove them, but
+                * we do other important processing here also, so don't remove
+                * them (again).
+                */
+               RecordKnownAssignedTransactionIds(max_xid);
+
+               /* 
+                * Mark the transaction committed in pg_clog. We use async commit
+                * protocol during recovery to provide information on database
+                * consistency for when users try to set hint bits. It is important
+                * that we do not set hint bits until the minRecoveryPoint is past
+                * this commit record. This ensures that if we crash we don't see
+                * hint bits set on changes made by transactions that haven't yet
+                * recovered. It's unlikely but it's good to be safe.
+                */
+               TransactionIdAsyncCommitTree(xid, xlrec->nsubxacts, sub_xids, lsn);
+
+               /*
+                * We must mark clog before we update the ProcArray.
+                */
+               ExpireTreeKnownAssignedTransactionIds(xid, xlrec->nsubxacts, sub_xids, 
+                                                                                                               max_xid, false);
+
+               if (preparedXact)
+               {
+                       /*
+                        * Commit prepared xlog records do not carry invalidation data,
+                        * since this is already held within the two phase state file.
+                        * So we read it from there instead, with much the same effects.
+                        */
+                       ProcessTwoPhaseStandbyRecords(xid);
+               }
+               else
+               {
+                       /*
+                        * Send any cache invalidations attached to the commit. We must
+                        * maintain the same order of invalidation then release locks
+                        * as occurs in RecordTransactionCommit.
+                        */
+                       if (xlrec->nmsgs > 0)
+                       {
+                               int     offset = OffsetSharedInvalInXactCommit();
+                               SharedInvalidationMessage *msgs = (SharedInvalidationMessage *)
+                                                               (((char *) xlrec) + offset);
+
+                               /*
+                                * Relcache init file invalidation requires processing both
+                                * before and after we send the SI messages. See AtEOXact_Inval()
+                                */
+                               if (XactCompletionRelcacheInitFileInval(xlrec))
+                                       RelationCacheInitFileInvalidate(true);
+
+                               SendSharedInvalidMessages(msgs, xlrec->nmsgs);
+
+                               if (XactCompletionRelcacheInitFileInval(xlrec))
+                                       RelationCacheInitFileInvalidate(false);
+                       }
+               }
+
+               /*
+                * Release locks, if any. We do this for both two phase and normal
+                * one phase transactions. In effect we are ignoring the prepare
+                * phase and just going straight to lock release. This explains
+                * why the twophase_postcommit_standby_callbacks[] do not invoke
+                * a special routine to handle locks - that is performed here
+                * instead.
+                */
+               RelationReleaseRecoveryLockTree(xid, xlrec->nsubxacts, sub_xids);
         }
+
+       /* Make sure nextXid is beyond any XID mentioned in the record */
+       /* We don't expect anyone else to modify nextXid, hence we
+        * don't need to hold a lock while checking this. We still acquire
+        * the lock to modify it, though.
+        */
         if (TransactionIdFollowsOrEquals(max_xid,
                                                                          ShmemVariableCache->nextXid))
         {
+               LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
                 ShmemVariableCache->nextXid = max_xid;
                 TransactionIdAdvance(ShmemVariableCache->nextXid);
+               LWLockRelease(XidGenLock);
+       }
+
+       /* Same here, don't use lock to test, but need one to modify */
+       if (TransactionIdFollowsOrEquals(max_xid,
+                                                                        ShmemVariableCache->latestCompletedXid))
+       {
+               LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+               ShmemVariableCache->latestCompletedXid = max_xid;
+               LWLockRelease(ProcArrayLock);
         }
  
         /* Make sure files supposed to be dropped are dropped */
@@ -4241,6 +4656,15 @@ xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid)
         }
  }
  
+/*
+ * Be careful with the order of execution, as with xact_redo_commit().
+ * The two functions are similar but differ in key places.
+ *
+ * Note also that an abort can be for a subtransaction and its children,
+ * not just for a top level abort. That means we have to consider
+ * topxid != xid, whereas in commit we would find topxid == xid always
+ * because subtransaction commit is never WAL logged.
+ */
  static void
  xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
  {
@@ -4248,21 +4672,45 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
         TransactionId max_xid;
         int                     i;
  
-       /* Mark the transaction aborted in pg_clog */
         sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+       max_xid = TransactionIdLatest(xid, xlrec->nsubxacts, sub_xids);
+
+       /* Mark the transaction aborted in pg_clog, no need for async stuff */
         TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids);
  
-       /* Make sure nextXid is beyond any XID mentioned in the record */
-       max_xid = xid;
-       for (i = 0; i < xlrec->nsubxacts; i++)
+       if (InHotStandby)
         {
-               if (TransactionIdPrecedes(max_xid, sub_xids[i]))
-                       max_xid = sub_xids[i];
+               /*
+                * Record any newly known assigned transactions. This looks
+                * strange to add xids and then immediately remove them, but
+                * we do other important processing here also, so don't remove
+                * them (again).
+                */
+               RecordKnownAssignedTransactionIds(max_xid);
+
+               /*
+                * We must mark clog before we update the ProcArray.
+                */
+               ExpireTreeKnownAssignedTransactionIds(xid, xlrec->nsubxacts, sub_xids, 
+                                                                                                               max_xid, false);
+
+               /*
+                * There are no flat files that need updating, nor invalidation
+                * messages to send or undo.
+                */
+
+               /*
+                * Release locks, if any. There are no invalidations to send.
+                */
+               RelationReleaseRecoveryLockTree(xid, xlrec->nsubxacts, sub_xids);
         }
+
+       /* Make sure nextXid is beyond any XID mentioned in the record */
         if (TransactionIdFollowsOrEquals(max_xid,
                                                                          ShmemVariableCache->nextXid))
         {
                 ShmemVariableCache->nextXid = max_xid;
+               ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
                 TransactionIdAdvance(ShmemVariableCache->nextXid);
         }
  
@@ -4284,6 +4732,43 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
         }
  }
  
+static void
+xact_redo_assignment(XLogRecPtr lsn, xl_xact_assignment *xlrec)        /* lsn is not used in this function */
+{
+       TransactionId max_xid;
+       int             i;
+
+       Assert(InHotStandby);   /* xact_redo() only calls this when InHotStandby is set */
+
+       /*
+        * Notice that we update pg_subtrans with the top-level xid, rather
+        * than the parent xid. This is a difference between normal
+        * processing and recovery, yet is still correct in all cases. The
+        * reason is that subtransaction commit is not marked in clog until
+        * commit processing, so all aborted subtransactions have already been
+        * clearly marked in clog. As a result we are able to refer directly
+        * to the top-level transaction's state rather than skipping through
+        * all the intermediate states in the subtransaction tree.
+        */
+       for (i = 0; i < xlrec->nsubxacts; i++)
+       {
+               TransactionId subxid = xlrec->xsub[i];
+
+               ExtendSUBTRANS(subxid);
+               SubTransSetParent(subxid, xlrec->xtop);
+       }
+
+       max_xid = TransactionIdLatest(xlrec->xtop, xlrec->nsubxacts, xlrec->xsub);
+
+       /*
+        * Remove the subxids from the array, which must occur after we have
+        * set their parents correctly in subtrans. Record overflowed state.
+        */
+       ExpireTreeKnownAssignedTransactionIds(InvalidTransactionId, 
+                                                                               xlrec->nsubxacts, xlrec->xsub, 
+                                                                               max_xid, true);
+}
+
  void
  xact_redo(XLogRecPtr lsn, XLogRecord *record)
  {
@@ -4296,7 +4781,7 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record)
         {
                 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
  
-               xact_redo_commit(xlrec, record->xl_xid);
+               xact_redo_commit(xlrec, record->xl_xid, lsn, false);
         }
         else if (info == XLOG_XACT_ABORT)
         {
@@ -4306,6 +4791,9 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record)
         }
         else if (info == XLOG_XACT_PREPARE)
         {
+               if (InHotStandby)
+                       RecordKnownAssignedTransactionIds(record->xl_xid);
+
                 /* the record contents are exactly the 2PC file */
                 RecreateTwoPhaseFile(record->xl_xid,
                                                          XLogRecGetData(record), record->xl_len);
@@ -4314,7 +4802,7 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record)
         {
                 xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) XLogRecGetData(record);
  
-               xact_redo_commit(&xlrec->crec, xlrec->xid);
+               xact_redo_commit(&xlrec->crec, xlrec->xid, lsn, true);
                 RemoveTwoPhaseFile(xlrec->xid, false);
         }
         else if (info == XLOG_XACT_ABORT_PREPARED)
@@ -4324,6 +4812,20 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record)
                 xact_redo_abort(&xlrec->arec, xlrec->xid);
                 RemoveTwoPhaseFile(xlrec->xid, false);
         }
+       else if (info == XLOG_XACT_ASSIGNMENT)
+       {
+               xl_xact_assignment *xlrec = (xl_xact_assignment *) XLogRecGetData(record);
+
+               if (InHotStandby)
+                       xact_redo_assignment(lsn, xlrec);
+       }
+       else if (info == XLOG_XACT_RUNNING_XACTS)
+       {
+               xl_xact_running_xacts *xlrec = (xl_xact_running_xacts *) XLogRecGetData(record);
+
+               if (InHotStandby)
+                       ProcArrayApplyRecoveryInfo(lsn, xlrec);
+       }
         else
                 elog(PANIC, "xact_redo: unknown op code %u", info);
  }
@@ -4333,10 +4835,13 @@ xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
  {
         int                     i;
  
+       if (XactCompletionRelcacheInitFileInval(xlrec))
+               appendStringInfo(buf, "; relcache init file inval");
+
         appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time));
         if (xlrec->nrels > 0)
         {
-               appendStringInfo(buf, "; rels:");
+               appendStringInfo(buf, "; %d rels:", xlrec->nrels);
                 for (i = 0; i < xlrec->nrels; i++)
                 {
                         char       *path = relpath(xlrec->xnodes[i], MAIN_FORKNUM);
@@ -4348,12 +4853,34 @@ xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
         if (xlrec->nsubxacts > 0)
         {
                 TransactionId *xacts = (TransactionId *)
-               &xlrec->xnodes[xlrec->nrels];
-
-               appendStringInfo(buf, "; subxacts:");
+                                                                       &xlrec->xnodes[xlrec->nrels];
+               appendStringInfo(buf, "; %d subxacts:", xlrec->nsubxacts);
                 for (i = 0; i < xlrec->nsubxacts; i++)
                         appendStringInfo(buf, " %u", xacts[i]);
         }
+       if (xlrec->nmsgs > 0)
+       {
+               /*
+                * The invalidation messages are the third variable length array
+                * from the start of the record. The record header has everything
+                * we need to calculate where that starts.
+                */
+               int     offset = OffsetSharedInvalInXactCommit();
+               SharedInvalidationMessage *msgs = (SharedInvalidationMessage *)
+                                               (((char *) xlrec) + offset);
+               appendStringInfo(buf, "; %d inval msgs:", xlrec->nmsgs);
+               for (i = 0; i < xlrec->nmsgs; i++)
+               {
+                       SharedInvalidationMessage *msg = msgs + i;
+
+                       if (msg->id >= 0)
+                               appendStringInfo(buf,  "catcache id%d ", msg->id);
+                       else if (msg->id == SHAREDINVALRELCACHE_ID)
+                               appendStringInfo(buf,  "relcache ");
+                       else if (msg->id == SHAREDINVALSMGR_ID)
+                               appendStringInfo(buf,  "smgr ");
+               }
+       }
  }
  
  static void
@@ -4376,14 +4903,62 @@ xact_desc_abort(StringInfo buf, xl_xact_abort *xlrec)
         if (xlrec->nsubxacts > 0)
         {
                 TransactionId *xacts = (TransactionId *)
-               &xlrec->xnodes[xlrec->nrels];
+                                                                       &xlrec->xnodes[xlrec->nrels];
  
-               appendStringInfo(buf, "; subxacts:");
+               appendStringInfo(buf, "; %d subxacts:", xlrec->nsubxacts);
                 for (i = 0; i < xlrec->nsubxacts; i++)
                         appendStringInfo(buf, " %u", xacts[i]);
         }
  }
  
+/*
+ * Append a description of an xid-assignment record body to buf: the
+ * subxid count followed by each subxid carried in the record.
+ */
+static void
+xact_desc_assignment(StringInfo buf, xl_xact_assignment *xlrec)
+{
+       int             n = 0;
+
+       appendStringInfo(buf, " nsubxids %u;", xlrec->nsubxacts);
+
+       while (n < xlrec->nsubxacts)
+       {
+               appendStringInfo(buf, " %u", xlrec->xsub[n]);
+               n++;
+       }
+}
+
+/*
+ * Append a description of a running-xacts record to buf.
+ *
+ * The summary counters are printed first, then one entry per running
+ * top-level xid.  For entries that report subxids, the subxids are read
+ * from the array that starts immediately after the xcnt entries of
+ * xlrec->xrun, beginning at that entry's subx_offset.
+ */
+static void
+xact_desc_running_xacts(StringInfo buf, xl_xact_running_xacts *xlrec)
+{
+       RunningXact        *entries = (RunningXact *) xlrec->xrun;
+       TransactionId  *subxids = (TransactionId *) &(xlrec->xrun[xlrec->xcnt]);
+       int                             x;
+
+       appendStringInfo(buf, "nxids %u nsubxids %u latestRunningXid %u",
+                                        xlrec->xcnt,
+                                        xlrec->subxcnt,
+                                        xlrec->latestRunningXid);
+
+       appendStringInfo(buf, " oldestRunningXid %u latestCompletedXid %u",
+                                        xlrec->oldestRunningXid,
+                                        xlrec->latestCompletedXid);
+
+       for (x = 0; x < xlrec->xcnt; x++)
+       {
+               RunningXact        *cur = &entries[x];
+               int                             s;
+
+               appendStringInfo(buf, " %u", cur->xid);
+
+               /* entries without subxids need nothing more than the xid */
+               if (cur->nsubxids <= 0)
+                       continue;
+
+               appendStringInfo(buf, "; nsubxids %u offset %u ovflow? %s",
+                                                cur->nsubxids,
+                                                cur->subx_offset,
+                                                (cur->overflowed ? "t" : "f"));
+
+               appendStringInfo(buf, "; subxacts: ");
+               for (s = 0; s < cur->nsubxids; s++)
+                       appendStringInfo(buf, " %u", subxids[s + cur->subx_offset]);
+       }
+}
+
  void
  xact_desc(StringInfo buf, uint8 xl_info, char *rec)
  {
@@ -4411,16 +4986,35 @@ xact_desc(StringInfo buf, uint8 xl_info, char *rec)
         {
                 xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) rec;
  
-               appendStringInfo(buf, "commit %u: ", xlrec->xid);
+               appendStringInfo(buf, "commit prepared %u: ", xlrec->xid);
                 xact_desc_commit(buf, &xlrec->crec);
         }
         else if (info == XLOG_XACT_ABORT_PREPARED)
         {
                 xl_xact_abort_prepared *xlrec = (xl_xact_abort_prepared *) rec;
  
-               appendStringInfo(buf, "abort %u: ", xlrec->xid);
+               appendStringInfo(buf, "abort prepared %u: ", xlrec->xid);
                 xact_desc_abort(buf, &xlrec->arec);
         }
+       else if (info == XLOG_XACT_ASSIGNMENT)
+       {
+               xl_xact_assignment *xlrec = (xl_xact_assignment *) rec;
+
+               /* 
+                * Note that we ignore the WAL record's xid, since we're more
+                * interested in the top-level xid that issued the record
+                * and which xids are being reported here.
+                */
+               appendStringInfo(buf, "xid assignment xtop %u", xlrec->xtop);
+               xact_desc_assignment(buf, xlrec);
+       }
+       else if (info == XLOG_XACT_RUNNING_XACTS)
+       {
+               xl_xact_running_xacts *xlrec = (xl_xact_running_xacts *) rec;
+
+               appendStringInfo(buf, "running xacts: ");
+               xact_desc_running_xacts(buf, xlrec);
+       }
         else
                 appendStringInfo(buf, "UNKNOWN");
  }
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c

index 90b89b89bc9f4c4a0ad4b6977991fd65a84863fe..f598d5392c1d00b60778b1eaa872d9cba2fae509 100644 (file)
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -25,6 +25,7 @@
  
  #include "access/clog.h"
  #include "access/multixact.h"
+#include "access/nbtree.h"
  #include "access/subtrans.h"
  #include "access/transam.h"
  #include "access/tuptoaster.h"
@@ -46,6 +47,7 @@
  #include "storage/ipc.h"
  #include "storage/pmsignal.h"
  #include "storage/procarray.h"
+#include "storage/sinval.h"
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/builtins.h"
@@ -53,13 +55,14 @@
  #include "utils/ps_status.h"
  #include "pg_trace.h"
  
-
  /* File path names (all relative to $PGDATA) */
  #define BACKUP_LABEL_FILE              "backup_label"
  #define BACKUP_LABEL_OLD               "backup_label.old"
  #define RECOVERY_COMMAND_FILE  "recovery.conf"
  #define RECOVERY_COMMAND_DONE  "recovery.done"
  
+/* copied from tcopprot.h rather than include whole file */
+extern int     PostAuthDelay;
  
  /* User-settable parameters */
  int                    CheckPointSegments = 3;
@@ -133,6 +136,8 @@ TimeLineID  ThisTimeLineID = 0;
   */
  bool           InRecovery = false;
  
+static         XLogRecPtr      LastRec;
+
  /*
   * Local copy of SharedRecoveryInProgress variable. True actually means "not
   * known, need to check the shared state".
@@ -152,7 +157,7 @@ static bool LocalRecoveryInProgress = true;
  static int     LocalXLogInsertAllowed = -1;
  
  /* Are we recovering using offline XLOG archives? */
-static bool InArchiveRecovery = false;
+bool InArchiveRecovery = false;
  
  /* Was the last xlog file restored from archive, or local? */
  static bool restoredFromArchive = false;
@@ -160,12 +165,38 @@ static bool restoredFromArchive = false;
  /* options taken from recovery.conf */
  static char *recoveryRestoreCommand = NULL;
  static char *recoveryEndCommand = NULL;
-static bool recoveryTarget = false;
  static bool recoveryTargetExact = false;
  static bool recoveryTargetInclusive = true;
  static TransactionId recoveryTargetXid;
  static TimestampTz recoveryTargetTime;
+static XLogRecPtr recoveryTargetLSN;
+static int recoveryTargetAdvance = 0;
+
+/*
+ * InHotStandby is an optional sub-state of InArchiveRecovery
+ * so is only ever set in the Startup process.  Recovery connections 
+ * will not be enabled until a valid RunningXact record has arrived.
+ */
+bool InHotStandby = true;
+
+/* recovery target modes */
+#define RECOVERY_TARGET_NONE                           0
+#define RECOVERY_TARGET_PAUSE_ALL                      1
+#define RECOVERY_TARGET_PAUSE_XID                      2
+#define RECOVERY_TARGET_PAUSE_TIME                     3
+#define RECOVERY_TARGET_PAUSE_LSN                      4
+#define RECOVERY_TARGET_ADVANCE                                5
+#define RECOVERY_TARGET_STOP_IMMEDIATE         6
+#define RECOVERY_TARGET_STOP_XID                       7
+#define RECOVERY_TARGET_STOP_TIME                      8
+static int recoveryTargetMode = RECOVERY_TARGET_NONE;
+static bool recoveryStartsPaused = false;
+
+#define DEFAULT_MAX_STANDBY_DELAY      30
+int maxStandbyDelay = DEFAULT_MAX_STANDBY_DELAY; /* initial setting, seconds */
+
  static TimestampTz recoveryLastXTime = 0;
+static TransactionId recoveryLastXid = InvalidTransactionId;
  
  /* if recoveryStopsHere returns true, it saves actual stop xid/time here */
  static TransactionId recoveryStopXid;
@@ -360,6 +391,17 @@ typedef struct XLogCtlData
         /* end+1 of the last record replayed (or being replayed) */
         XLogRecPtr      replayEndRecPtr;
  
+       int                             recoveryTargetMode;
+       TransactionId   recoveryTargetXid;
+       TimestampTz             recoveryTargetTime;
+       int                             recoveryTargetAdvance;
+       XLogRecPtr              recoveryTargetLSN;
+
+       TimestampTz     recoveryLastXTime;
+       TransactionId   recoveryLastXid;
+       XLogRecPtr              recoveryLastRecPtr;
+       int                     maxStandbyDelay;
+
         slock_t         info_lck;               /* locks shared variables shown above */
  } XLogCtlData;
  
@@ -463,6 +505,12 @@ static void readRecoveryCommandFile(void);
  static void exitArchiveRecovery(TimeLineID endTLI,
                                         uint32 endLogId, uint32 endLogSeg);
  static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
+static void recoveryPausesAfterLSN(void);
+static void RequiresWALControlPermissions(void);
+static void SetRecoveryTargetMode(int mode, TransactionId xid, TimestampTz ts,
+                                               XLogRecPtr lsn, int advance);
+static void SetMaxStandbyDelay(int delay);
+static void CheckMaxConnections(int maxcon);
  static void LocalSetXLogInsertAllowed(void);
  static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
  
@@ -899,25 +947,6 @@ begin:;
         FIN_CRC32(rdata_crc);
         record->xl_crc = rdata_crc;
  
-#ifdef WAL_DEBUG
-       if (XLOG_DEBUG)
-       {
-               StringInfoData buf;
-
-               initStringInfo(&buf);
-               appendStringInfo(&buf, "INSERT @ %X/%X: ",
-                                                RecPtr.xlogid, RecPtr.xrecoff);
-               xlog_outrec(&buf, record);
-               if (rdata->data != NULL)
-               {
-                       appendStringInfo(&buf, " - ");
-                       RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
-               }
-               elog(LOG, "%s", buf.data);
-               pfree(buf.data);
-       }
-#endif
-
         /* Record begin of record in appropriate places */
         ProcLastRecPtr = RecPtr;
         Insert->PrevRecord = RecPtr;
@@ -2128,6 +2157,45 @@ XLogNeedsFlush(XLogRecPtr record)
         return true;
  }
  
+/*
+ * Test whether database is consistent up to the LSN requested, specifically
+ * to decide whether it is safe to SetHintBits().
+ *
+ * Works whether we are in recovery or not.
+ *
+ * Outside recovery the answer is simply !XLogNeedsFlush(lsn).  During
+ * recovery the LSN is compared against our local copy of minRecoveryPoint;
+ * if that is not far enough along we make one non-blocking attempt to
+ * refresh the copy from the control file.
+ *
+ * Returns true if lsn is known to be covered, false otherwise.  A false
+ * result may just mean the control file lock was busy, so callers must
+ * treat it as "not known to be safe" rather than "definitely unsafe".
+ */
+bool
+DBConsistentUpToLSN(XLogRecPtr lsn)
+{
+       /*
+        * If we are operating normally, just check if WAL needs flushing.
+        */
+       if (!RecoveryInProgress())
+               return !XLogNeedsFlush(lsn);
+
+       /* Quick exit if already known flushed */
+       if (XLByteLE(lsn, minRecoveryPoint))
+               return true;
+
+       /*
+        * Don't try too hard to check consistency, it's not updated
+        * that often during recovery so this could easily become a
+        * contention hotspot with many users and many CPUs.
+        *
+        * We only read ControlFile->minRecoveryPoint here, so a shared lock
+        * is sufficient.  Asking for an exclusive lock would make concurrent
+        * callers fail each other's conditional acquire.
+        */
+       if (LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
+       {
+               /* update local copy */
+               minRecoveryPoint = ControlFile->minRecoveryPoint;
+
+               LWLockRelease(ControlFileLock);
+
+               /* check again */
+               if (XLByteLE(lsn, minRecoveryPoint))
+                       return true;
+       }
+
+       return false;
+}
+
  /*
   * Create a new XLOG file segment, or open a pre-existing one.
   *
@@ -4893,7 +4961,7 @@ readRecoveryCommandFile(void)
                         ereport(LOG,
                                         (errmsg("recovery_target_xid = %u",
                                                         recoveryTargetXid)));
-                       recoveryTarget = true;
+                       recoveryTargetMode = RECOVERY_TARGET_STOP_XID;
                         recoveryTargetExact = true;
                 }
                 else if (strcmp(tok1, "recovery_target_time") == 0)
@@ -4904,7 +4972,7 @@ readRecoveryCommandFile(void)
                          */
                         if (recoveryTargetExact)
                                 continue;
-                       recoveryTarget = true;
+                       recoveryTargetMode = RECOVERY_TARGET_STOP_TIME;
                         recoveryTargetExact = false;
  
                         /*
@@ -4931,6 +4999,44 @@ readRecoveryCommandFile(void)
                         ereport(LOG,
                                         (errmsg("recovery_target_inclusive = %s", tok2)));
                 }
+               else if (strcmp(tok1, "recovery_connections") == 0)
+               {
+                       /*
+                        * enables/disables snapshot processing and user connections
+                        */
+                       if (!parse_bool(tok2, &InHotStandby))
+                                 ereport(ERROR,
+                                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                         errmsg("parameter \"recovery_connections\" requires a Boolean value")));
+                       ereport(LOG,
+                                       (errmsg("recovery_connections = %s", tok2)));
+               }
+               else if (strcmp(tok1, "recovery_starts_paused") == 0)
+               {
+                       /*
+                        * sets whether recovery starts out in the paused state
+                        */
+                       if (!parse_bool(tok2, &recoveryStartsPaused))
+                                 ereport(ERROR,
+                                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                         errmsg("parameter \"recovery_starts_paused\" requires a Boolean value")));
+
+                       ereport(LOG,
+                                       (errmsg("recovery_starts_paused = %s", tok2)));
+               }
+               else if (strcmp(tok1, "max_standby_delay") == 0)
+               {
+                       errno = 0;
+                       maxStandbyDelay = (TransactionId) strtoul(tok2, NULL, 0);
+                       if (errno == EINVAL || errno == ERANGE)
+                               ereport(FATAL,
+                                (errmsg("max_standby_delay is not a valid number: \"%s\"",
+                                                tok2)));
+
+                       ereport(LOG,
+                                       (errmsg("max_standby_delay = %u",
+                                                       maxStandbyDelay)));
+               }
                 else
                         ereport(FATAL,
                                         (errmsg("unrecognized recovery parameter \"%s\"",
@@ -4954,6 +5060,11 @@ readRecoveryCommandFile(void)
         /* Enable fetching from archive recovery area */
         InArchiveRecovery = true;
  
+       if (InHotStandby)
+               SetMaxStandbyDelay(maxStandbyDelay);
+       else
+               recoveryStartsPaused = false;
+
         /*
          * If user specified recovery_target_timeline, validate it or compute the
          * "latest" value.      We can't do this until after we've gotten the restore
@@ -5098,8 +5209,8 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
  }
  
  /*
- * For point-in-time recovery, this function decides whether we want to
- * stop applying the XLOG at or after the current record.
+ * For archive recovery, this function decides whether we want to
+ * pause or stop applying the XLOG at or after the current record.
   *
   * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
   * *includeThis is set TRUE if we should apply this record before stopping.
@@ -5112,72 +5223,286 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
  static bool
  recoveryStopsHere(XLogRecord *record, bool *includeThis)
  {
-       bool            stopsHere;
-       uint8           record_info;
-       TimestampTz recordXtime;
+       bool            stopsHere = false;
+       bool            pauseHere = false;
+       static bool     paused = false;
+       uint8           record_info = 0;        /* valid iff (is_xact_completion_record) */
+       TimestampTz recordXtime = 0;
+       bool        is_xact_completion_record = false;
  
         /* We only consider stopping at COMMIT or ABORT records */
-       if (record->xl_rmid != RM_XACT_ID)
-               return false;
-       record_info = record->xl_info & ~XLR_INFO_MASK;
-       if (record_info == XLOG_XACT_COMMIT)
+       if (record->xl_rmid == RM_XACT_ID)
         {
-               xl_xact_commit *recordXactCommitData;
+               record_info = record->xl_info & ~XLR_INFO_MASK;
+               if (record_info == XLOG_XACT_COMMIT)
+               {
+                       xl_xact_commit *recordXactCommitData;
  
-               recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
-               recordXtime = recordXactCommitData->xact_time;
-       }
-       else if (record_info == XLOG_XACT_ABORT)
-       {
-               xl_xact_abort *recordXactAbortData;
+                       recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
+                       recordXtime = recordXactCommitData->xact_time;
+                       is_xact_completion_record = true;
+               }
+               else if (record_info == XLOG_XACT_ABORT)
+               {
+                       xl_xact_abort *recordXactAbortData;
  
-               recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
-               recordXtime = recordXactAbortData->xact_time;
-       }
-       else
-               return false;
+                       recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
+                       recordXtime = recordXactAbortData->xact_time;
+                       is_xact_completion_record = true;
+               }
  
-       /* Do we have a PITR target at all? */
-       if (!recoveryTarget)
-       {
-               recoveryLastXTime = recordXtime;
-               return false;
+               /* Remember the most recent COMMIT/ABORT time for logging purposes */
+               if (is_xact_completion_record)
+               {
+                       recoveryLastXTime = recordXtime;
+                       recoveryLastXid = record->xl_xid;
+               }
         }
  
-       if (recoveryTargetExact)
+       do
         {
+               int     prevRecoveryTargetMode = recoveryTargetMode;
+
+               CHECK_FOR_INTERRUPTS();
+
                 /*
-                * there can be only one transaction end record with this exact
-                * transactionid
-                *
-                * when testing for an xid, we MUST test for equality only, since
-                * transactions are numbered in the order they start, not the order
-                * they complete. A higher numbered xid will complete before you about
-                * 50% of the time...
+                * Check if we were requested to exit without finishing
+                * recovery.
                  */
-               stopsHere = (record->xl_xid == recoveryTargetXid);
-               if (stopsHere)
-                       *includeThis = recoveryTargetInclusive;
-       }
-       else
-       {
+               if (shutdown_requested)
+                       proc_exit(1);
+
                 /*
-                * there can be many transactions that share the same commit time, so
-                * we stop after the last one, if we are inclusive, or stop at the
-                * first one if we are exclusive
+                * Let's see if user has updated our recoveryTargetMode.
                  */
-               if (recoveryTargetInclusive)
-                       stopsHere = (recordXtime > recoveryTargetTime);
-               else
-                       stopsHere = (recordXtime >= recoveryTargetTime);
-               if (stopsHere)
-                       *includeThis = false;
+               {
+                       /* use volatile pointer to prevent code rearrangement */
+                       volatile XLogCtlData *xlogctl = XLogCtl;
+
+                       SpinLockAcquire(&xlogctl->info_lck);
+                       recoveryTargetMode = xlogctl->recoveryTargetMode;
+                       if (recoveryTargetMode != RECOVERY_TARGET_NONE)
+                       {
+                               recoveryTargetXid = xlogctl->recoveryTargetXid;
+                               recoveryTargetTime = xlogctl->recoveryTargetTime;
+
+                               /* Don't reset counter while we're advancing */
+                               if (recoveryTargetAdvance <= 0)
+                               {
+                                       recoveryTargetAdvance = xlogctl->recoveryTargetAdvance;
+                                       xlogctl->recoveryTargetAdvance = 0;
+                               }
+                       }
+                       if (is_xact_completion_record)
+                       {
+                               xlogctl->recoveryLastXTime = recordXtime;
+                               xlogctl->recoveryLastXid = record->xl_xid;
+                       }
+                       xlogctl->recoveryLastRecPtr = LastRec;
+                       SpinLockRelease(&xlogctl->info_lck);
+               }
+
+               /* Decide how to act on any pause target */
+               switch (recoveryTargetMode)
+               {
+                       case RECOVERY_TARGET_PAUSE_LSN:
+                                       return false;
+
+                       case RECOVERY_TARGET_NONE:
+                                       /*
+                                        * If we aren't paused and we're not looking to stop,
+                                        * just exit out quickly and get on with recovery.
+                                        */
+                                       if (paused)
+                                       {
+                                               ereport(LOG,
+                                                               (errmsg("recovery restarting after pause")));
+                                               set_ps_display("recovery continues", false);
+                                               paused = false;
+                                       }
+                                       return false;
+
+                       case RECOVERY_TARGET_PAUSE_ALL:
+                                       pauseHere = true;
+                                       break;
+
+                       case RECOVERY_TARGET_ADVANCE:
+                                       if (paused)
+                                       {
+                                               if (recoveryTargetAdvance-- > 0)
+                                               {
+                                                       ereport(LOG,
+                                                                       (errmsg("recovery advancing 1 record")));
+                                                       return false;
+                                               }
+                                               else
+                                                       break;
+                                       }
+
+                                       if (recoveryTargetAdvance-- <= 0)
+                                               pauseHere = true;
+                                       break;
+
+                       case RECOVERY_TARGET_STOP_IMMEDIATE:
+                       case RECOVERY_TARGET_STOP_XID:
+                       case RECOVERY_TARGET_STOP_TIME:
+                                       paused = false;
+                                       break;
+
+                       /*
+                        * If we're paused, and mode has changed reset to allow new settings
+                        * to apply and maybe allow us to continue.
+                        */
+                       if (paused && prevRecoveryTargetMode != recoveryTargetMode)
+                               paused = false;
+
+                       case RECOVERY_TARGET_PAUSE_XID:
+                                       /*
+                                        * there can be only one transaction end record with this exact
+                                        * transactionid
+                                        *
+                                        * when testing for an xid, we MUST test for equality only, since
+                                        * transactions are numbered in the order they start, not the order
+                                        * they complete. A higher numbered xid will complete before you about
+                                        * 50% of the time...
+                                        */
+                                       if (is_xact_completion_record)
+                                               pauseHere = (record->xl_xid == recoveryTargetXid);
+                                       break;
+
+                       case RECOVERY_TARGET_PAUSE_TIME:
+                                       /*
+                                        * there can be many transactions that share the same commit time, so
+                                        * we pause after the last one, if we are inclusive, or pause at the
+                                        * first one if we are exclusive
+                                        */
+                                       if (is_xact_completion_record)
+                                       {
+                                               if (recoveryTargetInclusive)
+                                                       pauseHere = (recoveryLastXTime > recoveryTargetTime);
+                                               else
+                                                       pauseHere = (recoveryLastXTime >= recoveryTargetTime);
+                                       }
+                                       break;
+
+                       default:
+                                       ereport(WARNING,
+                                                       (errmsg("unknown recovery mode %d, continuing recovery",
+                                                                                       recoveryTargetMode)));
+                                       return false;
+               }
+
+               /*
+                * If we just entered pause, issue log messages
+                */
+               if (pauseHere && !paused)
+               {
+                       if (is_xact_completion_record)
+                       {
+                               if (record_info == XLOG_XACT_COMMIT)
+                                       ereport(LOG,
+                                               (errmsg("recovery pausing before commit of transaction %u, log time %s",
+                                                                       record->xl_xid,
+                                                                       timestamptz_to_str(recoveryLastXTime))));
+                               else
+                                       ereport(LOG,
+                                               (errmsg("recovery pausing before abort of transaction %u, log time %s",
+                                                                       record->xl_xid,
+                                                                       timestamptz_to_str(recoveryLastXTime))));
+                       }
+                       else
+                               ereport(LOG,
+                                               (errmsg("recovery pausing; last recovered transaction %u, "
+                                                               "last recovered xact timestamp %s",
+                                                                       recoveryLastXid,
+                                                                       timestamptz_to_str(recoveryLastXTime))));
+
+                       set_ps_display("recovery paused", false);
+
+                       paused = true;
+               }
+
+               /*
+                * Pause for a while before rechecking mode at top of loop.
+                */
+               if (paused)
+               {
+                       recoveryTargetAdvance = 0;
+
+                       /*
+                        * Update the recoveryTargetMode
+                        */
+                       {
+                               /* use volatile pointer to prevent code rearrangement */
+                               volatile XLogCtlData *xlogctl = XLogCtl;
+
+                               SpinLockAcquire(&xlogctl->info_lck);
+                               xlogctl->recoveryTargetMode = RECOVERY_TARGET_PAUSE_ALL;
+                               xlogctl->recoveryTargetAdvance = 0;
+                               SpinLockRelease(&xlogctl->info_lck);
+                       }
+
+                       pg_usleep(200000L);
+               }
+
+               /*
+                * We leave the loop at the bottom only if our recovery mode is
+                * set (or has been recently reset) to one of the stop options.
+                */
+       } while (paused);
+
+       /*
+        * Decide how to act if stop target mode set. We run this separately from
+        * pause to allow user to reset their stop target while paused.
+        */
+       switch (recoveryTargetMode)
+       {
+               case RECOVERY_TARGET_STOP_IMMEDIATE:
+                               ereport(LOG,
+                                               (errmsg("recovery stopping immediately due to user request")));
+                               return true;
+
+               case RECOVERY_TARGET_STOP_XID:
+                               /*
+                                * there can be only one transaction end record with this exact
+                                * transactionid
+                                *
+                                * when testing for an xid, we MUST test for equality only, since
+                                * transactions are numbered in the order they start, not the order
+                                * they complete. A higher numbered xid will complete before you about
+                                * 50% of the time...
+                                */
+                               if (is_xact_completion_record)
+                               {
+                                       stopsHere = (record->xl_xid == recoveryTargetXid);
+                                       if (stopsHere)
+                                               *includeThis = recoveryTargetInclusive;
+                               }
+                               break;
+
+               case RECOVERY_TARGET_STOP_TIME:
+                               /*
+                                * there can be many transactions that share the same commit time, so
+                                * we stop after the last one, if we are inclusive, or stop at the
+                                * first one if we are exclusive
+                                */
+                               if (is_xact_completion_record)
+                               {
+                                       if (recoveryTargetInclusive)
+                                               stopsHere = (recoveryLastXTime > recoveryTargetTime);
+                                       else
+                                               stopsHere = (recoveryLastXTime >= recoveryTargetTime);
+                                       if (stopsHere)
+                                               *includeThis = false;
+                               }
+                               break;
         }
  
         if (stopsHere)
         {
+               Assert(is_xact_completion_record);
                 recoveryStopXid = record->xl_xid;
-               recoveryStopTime = recordXtime;
+               recoveryStopTime = recoveryLastXTime;
                 recoveryStopAfter = *includeThis;
  
                 if (record_info == XLOG_XACT_COMMIT)
@@ -5206,14 +5531,427 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
                                                                 recoveryStopXid,
                                                                 timestamptz_to_str(recoveryStopTime))));
                 }
+       }
  
-               if (recoveryStopAfter)
-                       recoveryLastXTime = recordXtime;
+       return stopsHere;
+}
+
+/*
+ * Hold the caller while a pause-at-LSN target is in force.
+ *
+ * As long as the local target mode is RECOVERY_TARGET_PAUSE_LSN and the
+ * target location is at or before LastRec, re-read the target mode and
+ * target LSN from shared memory and nap for 100ms before checking again.
+ * The loop ends as soon as either condition stops being true, e.g. because
+ * another process changed the shared target.
+ */
+static void
+recoveryPausesAfterLSN(void)
+{
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *shared = XLogCtl;
+
+       for (;;)
+       {
+               /* nothing to wait for unless an LSN pause target is set ... */
+               if (recoveryTargetMode != RECOVERY_TARGET_PAUSE_LSN)
+                       break;
+
+               /* ... and we have already reached (or passed) that location */
+               if (!XLByteLE(recoveryTargetLSN, LastRec))
+                       break;
+
+               /* pick up any change made to the target while we were paused */
+               SpinLockAcquire(&shared->info_lck);
+               recoveryTargetMode = shared->recoveryTargetMode;
+               recoveryTargetLSN = shared->recoveryTargetLSN;
+               SpinLockRelease(&shared->info_lck);
+
+               pg_usleep(100000L);
+       }
+}
+
+/*
+ * Common precondition check for the recovery control functions.
+ *
+ * Raises an ERROR unless the server is currently in recovery and the
+ * calling user is a superuser. The recovery check is made first, so that
+ * is the error reported when both conditions fail.
+ */
+static void
+RequiresWALControlPermissions(void)
+{
+       bool            in_recovery = RecoveryInProgress();
+
+       if (!in_recovery)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("can only be executed during recovery")));
+       else if (!superuser())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                                errmsg("must be superuser to control recovery")));
+}
+
+/*
+ * Utility function used by various user functions to set the recovery
+ * target mode. This allows user control over the progress of recovery.
+ *
+ * The new mode is always written to shared memory. Of the remaining
+ * arguments only the one that belongs to the requested mode is stored:
+ *     xid             for RECOVERY_TARGET_STOP_XID / RECOVERY_TARGET_PAUSE_XID
+ *     ts              for RECOVERY_TARGET_STOP_TIME / RECOVERY_TARGET_PAUSE_TIME
+ *     advance for RECOVERY_TARGET_ADVANCE
+ *     lsn             for RECOVERY_TARGET_PAUSE_LSN
+ * For every other mode the arguments are ignored and the previously
+ * stored xid/time/LSN/advance values are left untouched.
+ *
+ * The whole update is made while holding info_lck.
+ */
+static void
+SetRecoveryTargetMode(int mode, TransactionId xid, TimestampTz ts,
+                                               XLogRecPtr lsn, int advance)
+{
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
+
+       SpinLockAcquire(&xlogctl->info_lck);
+       xlogctl->recoveryTargetMode = mode;
+
+       /* store only the payload that is meaningful for this mode */
+       switch (mode)
+       {
+               case RECOVERY_TARGET_STOP_XID:
+               case RECOVERY_TARGET_PAUSE_XID:
+                               xlogctl->recoveryTargetXid = xid;
+                               break;
+               case RECOVERY_TARGET_STOP_TIME:
+               case RECOVERY_TARGET_PAUSE_TIME:
+                               xlogctl->recoveryTargetTime = ts;
+                               break;
+               case RECOVERY_TARGET_ADVANCE:
+                               xlogctl->recoveryTargetAdvance = advance;
+                               break;
+               case RECOVERY_TARGET_PAUSE_LSN:
+                               xlogctl->recoveryTargetLSN = lsn;
+                               break;
+               default:
+                               /* modes without a payload: nothing further to store */
+                               break;
         }
+
+       SpinLockRelease(&xlogctl->info_lck);
+}
+
+/*
+ * SQL-callable: clear any recovery target so that recovery is no longer
+ * held frozen. The shared target mode is set to RECOVERY_TARGET_NONE.
+ *
+ * Errors out unless called by a superuser during recovery.
+ * Returns void.
+ */
+Datum
+pg_recovery_continue(PG_FUNCTION_ARGS)
+{
+       RequiresWALControlPermissions();
+
+       /* RECOVERY_TARGET_NONE takes no xid, timestamp, LSN or advance count */
+       SetRecoveryTargetMode(RECOVERY_TARGET_NONE,
+                                                 InvalidTransactionId,
+                                                 0,
+                                                 InvalidXLogRecPtr,
+                                                 0);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable: request an immediate pause of recovery by setting the
+ * shared target mode to RECOVERY_TARGET_PAUSE_ALL. Recovery then stays
+ * paused until it is asked to play again.
+ *
+ * Errors out unless called by a superuser during recovery.
+ * Returns void.
+ */
+Datum
+pg_recovery_pause(PG_FUNCTION_ARGS)
+{
+       RequiresWALControlPermissions();
+
+       /* RECOVERY_TARGET_PAUSE_ALL takes no xid, timestamp, LSN or advance count */
+       SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_ALL,
+                                                 InvalidTransactionId,
+                                                 0,
+                                                 InvalidXLogRecPtr,
+                                                 0);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * Pause recovery at stated xid, if ever seen. Once paused, stays paused
+ * until asked to play again.
+ *
+ * The argument is fetched as an int4 and reinterpreted as a TransactionId.
+ * Special (non-normal) transaction ids are rejected with an
+ * invalid-parameter error, since they cannot be a meaningful target.
+ *
+ * Errors out unless called by a superuser during recovery.
+ * Returns void.
+ */
+Datum
+pg_recovery_pause_xid(PG_FUNCTION_ARGS)
+{
+       int                       xidi = PG_GETARG_INT32(0);
+       TransactionId xid = (TransactionId) xidi;
+
+       RequiresWALControlPermissions();
+
+       /*
+        * This is a problem with the user's input, not an internal failure, so
+        * report it with ereport and a proper SQLSTATE rather than elog.
+        */
+       if (!TransactionIdIsNormal(xid))
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("cannot specify special values for transaction id")));
+
+       SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_XID,
+                                                       xid, 0, InvalidXLogRecPtr, 0);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable: ask recovery to pause once the given timestamp is reached,
+ * if it ever is. After pausing, recovery stays paused until asked to play
+ * again.
+ *
+ * Errors out unless called by a superuser during recovery.
+ * Returns void.
+ */
+Datum
+pg_recovery_pause_timestamp(PG_FUNCTION_ARGS)
+{
+       TimestampTz pauseAt = PG_GETARG_TIMESTAMPTZ(0);
+
+       RequiresWALControlPermissions();
+
+       /* only the timestamp is relevant for RECOVERY_TARGET_PAUSE_TIME */
+       SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_TIME,
+                                                 InvalidTransactionId,
+                                                 pauseAt,
+                                                 InvalidXLogRecPtr,
+                                                 0);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable: ask recovery to pause after the given xlog location, if it
+ * is ever reached. After pausing, recovery stays paused until asked to play
+ * again.
+ *
+ * The location is supplied as text in "%X/%X" form (two hexadecimal
+ * numbers separated by a slash); anything that does not yield both numbers
+ * is rejected with an invalid-parameter error.
+ *
+ * Errors out unless called by a superuser during recovery.
+ * Returns void.
+ */
+Datum
+pg_recovery_pause_location(PG_FUNCTION_ARGS)
+{
+       text       *arg = PG_GETARG_TEXT_P(0);
+       char       *lsntext;
+       unsigned int logid;
+       unsigned int recoff;
+       int                     nfields;
+       XLogRecPtr      target;
+
+       RequiresWALControlPermissions();
+
+       /* convert the text argument and pull out the two hex fields */
+       lsntext = text_to_cstring(arg);
+       nfields = sscanf(lsntext, "%X/%X", &logid, &recoff);
+
+       if (nfields != 2)
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("could not parse transaction log location \"%s\"",
+                                               lsntext)));
+
+       target.xlogid = logid;
+       target.xrecoff = recoff;
+
+       /* only the LSN is relevant for RECOVERY_TARGET_PAUSE_LSN */
+       SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_LSN,
+                                                 InvalidTransactionId,
+                                                 0,
+                                                 target,
+                                                 0);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * If paused, advance N records.
+ *
+ * N is the int4 argument and must be at least 1; smaller values are
+ * rejected with an invalid-parameter error. The count is handed to
+ * SetRecoveryTargetMode together with RECOVERY_TARGET_ADVANCE.
+ *
+ * Errors out unless called by a superuser during recovery.
+ * Returns void.
+ */
+Datum
+pg_recovery_advance(PG_FUNCTION_ARGS)
+{
+       int adv = PG_GETARG_INT32(0);
+
+       RequiresWALControlPermissions();
+
+       /*
+        * This is a problem with the user's input, not an internal failure, so
+        * report it with ereport and a proper SQLSTATE rather than elog.
+        */
+       if (adv < 1)
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("recovery advance must be greater than or equal to 1")));
+
+       SetRecoveryTargetMode(RECOVERY_TARGET_ADVANCE,
+                                                       InvalidTransactionId, 0, InvalidXLogRecPtr, adv);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable: request that recovery stop, by setting the shared target
+ * mode to RECOVERY_TARGET_STOP_IMMEDIATE. Recovery stops now if it is
+ * paused, or at the end of the next record if it is playing.
+ *
+ * Errors out unless called by a superuser during recovery.
+ * Returns void.
+ */
+Datum
+pg_recovery_stop(PG_FUNCTION_ARGS)
+{
+       RequiresWALControlPermissions();
+
+       /* RECOVERY_TARGET_STOP_IMMEDIATE takes no xid, timestamp, LSN or count */
+       SetRecoveryTargetMode(RECOVERY_TARGET_STOP_IMMEDIATE,
+                                                 InvalidTransactionId,
+                                                 0,
+                                                 InvalidXLogRecPtr,
+                                                 0);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * Returns a text description of the recovery target currently held in
+ * shared memory: which pause/stop mode is set and, where the mode has one,
+ * the xid, timestamp or xlog location it refers to.
+ *
+ * Errors out unless called by a superuser during recovery.
+ *
+ * Note that the shared values are copied into the recoveryTarget*
+ * variables declared outside this function, so calling it overwrites
+ * whatever values this process previously held there. If the mode is not
+ * one of those handled below, an empty string is returned.
+ */
+Datum
+pg_current_recovery_target(PG_FUNCTION_ARGS)
+{
+       StringInfoData buf;
+
+       RequiresWALControlPermissions();
+
+       initStringInfo(&buf);
+
+       {
+               /* use volatile pointer to prevent code rearrangement */
+               volatile XLogCtlData *xlogctl = XLogCtl;
+
+               SpinLockAcquire(&xlogctl->info_lck);
+
+               /* the target details are only fetched when a target is set */
+               recoveryTargetMode = xlogctl->recoveryTargetMode;
+               if (recoveryTargetMode != RECOVERY_TARGET_NONE)
+               {
+                       recoveryTargetXid = xlogctl->recoveryTargetXid;
+                       recoveryTargetTime = xlogctl->recoveryTargetTime;
+                       recoveryTargetAdvance = xlogctl->recoveryTargetAdvance;
+                       recoveryTargetLSN = xlogctl->recoveryTargetLSN;
+               }
+
+               SpinLockRelease(&xlogctl->info_lck);
+       }
+
+       /* format outside the spinlock; string building may allocate */
+       switch (recoveryTargetMode)
+       {
+               case RECOVERY_TARGET_NONE:
+                               appendStringInfo(&buf, "No recovery target has been set");
+                               break;
+               case RECOVERY_TARGET_PAUSE_ALL:
+                               appendStringInfo(&buf, "Recovery paused");
+                               break;
+               case RECOVERY_TARGET_PAUSE_XID:
+                               appendStringInfo(&buf, "Recovery will pause after commit of transaction %u", recoveryTargetXid);
+                               break;
+               case RECOVERY_TARGET_PAUSE_TIME:
+                               appendStringInfo(&buf, "Recovery will pause after transaction completion timestamp %s",
+                                                                               timestamptz_to_str(recoveryTargetTime));
+                               break;
+               case RECOVERY_TARGET_PAUSE_LSN:
+                               appendStringInfo(&buf, "Recovery will pause after applying record at xlog location %X/%X",
+                                                                               recoveryTargetLSN.xlogid,
+                                                                               recoveryTargetLSN.xrecoff);
+                               break;
+               case RECOVERY_TARGET_ADVANCE:
+                               appendStringInfo(&buf, "Recovery will advance");
+                               break;
+               case RECOVERY_TARGET_STOP_IMMEDIATE:
+                               /*
+                                * A stop request IS a target; reporting "no target set" here
+                                * would hide the fact that recovery is about to end.
+                                */
+                               appendStringInfo(&buf, "Recovery will stop immediately");
+                               break;
+               case RECOVERY_TARGET_STOP_XID:
+                               appendStringInfo(&buf, "Recovery will stop after commit of transaction %u", recoveryTargetXid);
+                               break;
+               case RECOVERY_TARGET_STOP_TIME:
+                               appendStringInfo(&buf, "Recovery will stop after transaction completion timestamp %s",
+                                                                               timestamptz_to_str(recoveryTargetTime));
+                               break;
+       }
+
+       PG_RETURN_TEXT_P(cstring_to_text(buf.data));
+}
+
+/*
+ * SQL-callable: reports whether the server is currently in recovery.
+ * This is global state, obtained from RecoveryInProgress(); no permission
+ * check is made here.
+ */
+Datum
+pg_is_in_recovery(PG_FUNCTION_ARGS)
+{
+       bool            in_recovery = RecoveryInProgress();
+
+       PG_RETURN_BOOL(in_recovery);
+}
+
+/*
+ * SQL-callable: returns the timestamp of the last completed transaction,
+ * as recorded in shared memory.
+ *
+ * The shared value is read under info_lck and stored into the
+ * recoveryLastXTime variable declared outside this function before being
+ * returned.
+ */
+Datum
+pg_last_recovered_xact_timestamp(PG_FUNCTION_ARGS)
+{
+       {
+               /* use volatile pointer to prevent code rearrangement */
+               volatile XLogCtlData *shared = XLogCtl;
+
+               SpinLockAcquire(&shared->info_lck);
+               recoveryLastXTime = shared->recoveryLastXTime;
+               SpinLockRelease(&shared->info_lck);
+       }
+
+       PG_RETURN_TIMESTAMPTZ(recoveryLastXTime);
+}
+
+/*
+ * SQL-callable: returns the xid of the last completed transaction, as
+ * recorded in shared memory. The xid is handed back as an int4.
+ *
+ * The shared value is read under info_lck and stored into the
+ * recoveryLastXid variable declared outside this function before being
+ * returned.
+ */
+Datum
+pg_last_recovered_xid(PG_FUNCTION_ARGS)
+{
+       {
+               /* use volatile pointer to prevent code rearrangement */
+               volatile XLogCtlData *shared = XLogCtl;
+
+               SpinLockAcquire(&shared->info_lck);
+               recoveryLastXid = shared->recoveryLastXid;
+               SpinLockRelease(&shared->info_lck);
+       }
+
+       PG_RETURN_INT32(recoveryLastXid);
+}
+
+/*
+ * SQL-callable: returns the xlog location of the last recovered WAL
+ * record, formatted as text in "%X/%X" form.
+ *
+ * The shared value is read under info_lck and stored into the LastRec
+ * variable declared outside this function before being formatted.
+ */
+Datum
+pg_last_recovered_xlog_location(PG_FUNCTION_ARGS)
+{
+       char            lsntext[MAXFNAMELEN];
+
+       {
+               /* use volatile pointer to prevent code rearrangement */
+               volatile XLogCtlData *shared = XLogCtl;
+
+               SpinLockAcquire(&shared->info_lck);
+               LastRec = shared->recoveryLastRecPtr;
+               SpinLockRelease(&shared->info_lck);
+       }
+
+       /* build the textual form outside the spinlock */
+       snprintf(lsntext, sizeof(lsntext), "%X/%X",
+                        LastRec.xlogid, LastRec.xrecoff);
+
+       PG_RETURN_TEXT_P(cstring_to_text(lsntext));
+}
+
+/*
+ * Returns delay in milliseconds, or -1 if delay too large
+ *
+ * The delay is the difference between the current time and
+ * recoveryLastXTime, as computed by TimestampDifference(). Note that this
+ * uses the recoveryLastXTime variable as it stands in the calling process;
+ * it is not re-read from shared memory here.
+ */
+int
+GetLatestReplicationDelay(void)
+{
+       long            delay_secs;
+       int                     delay_usecs;
+       int                     delay;
+       TimestampTz currTz = GetCurrentTimestamp();
+
+       TimestampDifference(recoveryLastXTime, currTz,
+                                               &delay_secs, &delay_usecs);
+
+       /*
+        * If delay is very large we probably aren't looking at
+        * a replication situation at all, just a recover from backup.
+        * So return a special value instead.
+        *
+        * The comparison must be >=, not >: when delay_secs is exactly
+        * INT_MAX / 1000, adding up to 999 further milliseconds from
+        * delay_usecs could push the result past INT_MAX.
+        */
+       if (delay_secs >= (long)(INT_MAX / 1000))
+               delay = -1;
         else
-               recoveryLastXTime = recordXtime;
+               delay = (int)(delay_secs * 1000) + (delay_usecs / 1000);
  
-       return stopsHere;
+       return delay;
+}
+
+/*
+ * Fetch maxStandbyDelay from shared memory, reading it under info_lck.
+ *
+ * Returns maxStandbyDelay in milliseconds, or -1 if wait forever
+ */
+int
+GetMaxStandbyDelay(void)
+{
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *shared = XLogCtl;
+       int                     result;
+
+       SpinLockAcquire(&shared->info_lck);
+       result = shared->maxStandbyDelay;
+       SpinLockRelease(&shared->info_lck);
+
+       return result;
+}
+
+/*
+ * Store a new maxStandbyDelay in shared memory, under info_lck.
+ *
+ * delay is in milliseconds, which is the unit pg_recovery_max_standby_delay
+ * passes in and the unit GetMaxStandbyDelay reports; -1 means wait
+ * forever. Out-of-range values raise FATAL.
+ * NOTE(review): callers outside this view must pass milliseconds too -
+ * confirm.
+ */
+static void
+SetMaxStandbyDelay(int delay)
+{
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
+
+       /*
+        * 2E6 seconds is about 23 days. Allows us to measure delay in
+        * milliseconds when we perform timing.
+        *
+        * The upper bound is therefore 2E6 seconds expressed in milliseconds.
+        * (Comparing an int against INT_MAX can never be true, so it would
+        * enforce no upper limit at all.)
+        */
+       if (delay > 2000000 * 1000 || delay < -1)
+               ereport(FATAL,
+                (errmsg("max_standby_delay must be between -1 (wait forever) and 2 000 000 secs")));
+
+       SpinLockAcquire(&xlogctl->info_lck);
+       xlogctl->maxStandbyDelay = delay;
+       SpinLockRelease(&xlogctl->info_lck);
+}
+
+/*
+ * SQL-callable: set max_standby_delay while in recovery.
+ *
+ * The int4 argument is in seconds, or -1 to wait forever. It is validated
+ * here and then stored in milliseconds via SetMaxStandbyDelay().
+ *
+ * Errors out unless called by a superuser during recovery, or if the
+ * value is out of range.
+ * Returns void.
+ */
+Datum
+pg_recovery_max_standby_delay(PG_FUNCTION_ARGS)
+{
+       int             delay = PG_GETARG_INT32(0);
+
+       RequiresWALControlPermissions();
+
+       /*
+        * Check the range in seconds, before scaling: multiplying an unchecked
+        * value by 1000 could overflow, and a bad argument from a user should
+        * produce an ERROR here rather than reach SetMaxStandbyDelay's FATAL.
+        */
+       if (delay < -1 || delay > 2000000)
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("max_standby_delay must be between -1 (wait forever) and 2 000 000 secs")));
+
+       /*
+        * Save in milliseconds. -1 is the "wait forever" marker and must be
+        * passed through unscaled; -1000 would be rejected as out of range.
+        */
+       SetMaxStandbyDelay(delay == -1 ? -1 : 1000 * delay);
+
+       PG_RETURN_VOID();
+}
+
+/*
+ * Check to see if max_connections is set high enough on this server
+ * to allow recovery connections to operate correctly. We ignore
+ * autovacuum_max_workers when we make this test.
+ *
+ * maxcon is the max_connections value to compare against; the callers
+ * pass the value carried in a checkpoint record. Raises ERROR if this
+ * server's MaxConnections is lower.
+ */
+static void
+CheckMaxConnections(int maxcon)
+{
+       if (MaxConnections < maxcon)
+               ereport(ERROR,
+                       (errmsg("recovery_connections cannot continue because "
+                                       "max_connections %d set lower than WAL master (max_connections = %d)",
+                                       MaxConnections, maxcon)));
  }
  
  /*
@@ -5228,7 +5966,6 @@ StartupXLOG(void)
         bool            reachedStopPoint = false;
         bool            haveBackupLabel = false;
         XLogRecPtr      RecPtr,
-                               LastRec,
                                 checkPointLoc,
                                 backupStopLoc,
                                 EndOfLog;
@@ -5313,6 +6050,16 @@ StartupXLOG(void)
          */
         readRecoveryCommandFile();
  
+       /*
+        * PostAuthDelay is a debugging aid for investigating problems in startup
+        * and/or recovery: it can be set in postgresql.conf to allow time to
+        * attach to the newly-forked backend with a debugger. It can also be set
+        * using the postmaster -W switch, which can be specified using the -o
+        * option of pg_ctl, e.g. pg_ctl -D data -o "-W 30"
+        */
+       if (PostAuthDelay > 0)
+               pg_usleep(PostAuthDelay * 1000000L);
+
         /* Now we can determine the list of expected TLIs */
         expectedTLIs = readTimeLineHistory(recoveryTargetTLI);
  
@@ -5506,6 +6253,17 @@ StartupXLOG(void)
                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
                 }
  
+               /* Initialize recovery connections, if enabled */               
+               if (InHotStandby)
+               {
+                       CheckMaxConnections(checkPoint.MaxConnections);
+                       InitRecoveryTransactionEnvironment();
+                       StartCleanupDelayStats();
+                       if (recoveryStartsPaused)
+                               SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_ALL,
+                                       InvalidTransactionId, 0, InvalidXLogRecPtr, 0);
+               }
+
                 /* Initialize resource managers */
                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
                 {
@@ -5580,7 +6338,9 @@ StartupXLOG(void)
                         do
                         {
  #ifdef WAL_DEBUG
-                               if (XLOG_DEBUG)
+                               if (XLOG_DEBUG ||
+                                       (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
+                                       (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
                                 {
                                         StringInfoData buf;
  
@@ -5608,29 +6368,28 @@ StartupXLOG(void)
                                 }
  
                                 /*
-                                * Check if we were requested to exit without finishing
-                                * recovery.
-                                */
-                               if (shutdown_requested)
-                                       proc_exit(1);
-
-                               /*
-                                * Have we passed our safe starting point? If so, we can tell
-                                * postmaster that the database is consistent now.
+                                * Have we passed our safe starting point? 
                                  */
                                 if (!reachedMinRecoveryPoint &&
                                         XLByteLT(minRecoveryPoint, EndRecPtr))
                                 {
-                                       reachedMinRecoveryPoint = true;
-                                       if (InArchiveRecovery)
-                                       {
+                                               reachedMinRecoveryPoint = true;
                                                 ereport(LOG,
                                                           (errmsg("consistent recovery state reached")));
-                                               if (IsUnderPostmaster)
-                                                       SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
-                                       }
                                 }
  
+                               /*
+                                * Have we passed our safe starting point? Also, if InHotStandby,
+                                * have we got a valid starting snapshot that will allow
+                                * connections to succeed? If so, we can tell postmaster that
+                                * the database is consistent now, enabling connections.
+                                */
+                               if (reachedMinRecoveryPoint &&
+                                       InHotStandby &&
+                                       IsUnderPostmaster &&
+                                       IsRunningXactDataValid())
+                                                       SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
+
                                 /*
                                  * Have we reached our recovery target?
                                  */
@@ -5671,6 +6430,8 @@ StartupXLOG(void)
  
                                 LastRec = ReadRecPtr;
  
+                               recoveryPausesAfterLSN();
+
                                 record = ReadRecord(NULL, LOG);
                         } while (record != NULL && recoveryContinue);
  
@@ -5892,6 +6653,7 @@ StartupXLOG(void)
         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
  
         /* Start up the commit log and related stuff, too */
+       /* XXXHS: perhaps this should go after XactClearRecoveryTransactions */
         StartupCLOG();
         StartupSUBTRANS(oldestActiveXID);
         StartupMultiXact();
@@ -5899,6 +6661,16 @@ StartupXLOG(void)
         /* Reload shared-memory state for prepared transactions */
         RecoverPreparedTransactions();
  
+       /* 
+        * Shutdown the recovery environment. This must occur after 
+        * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
+        */
+       if (InHotStandby)
+       {
+               EndCleanupDelayStats();
+               XactClearRecoveryTransactions();
+       }
+
         /* Shut down readFile facility, free space */
         if (readFile >= 0)
         {
@@ -5964,8 +6736,9 @@ RecoveryInProgress(void)
  
                 /*
                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
-                * is finished.  (If you change this, see also
-                * LocalSetXLogInsertAllowed.)
+                * is finished. InitPostgres() relies upon this behaviour to ensure
+                * that InitXLOGAccess() is called at backend startup.  (If you change
+                * this, see also LocalSetXLogInsertAllowed.)
                  */
                 if (!LocalRecoveryInProgress)
                         InitXLOGAccess();
@@ -6151,7 +6924,7 @@ InitXLOGAccess(void)
  {
         /* ThisTimeLineID doesn't change so we need no lock to copy it */
         ThisTimeLineID = XLogCtl->ThisTimeLineID;
-       Assert(ThisTimeLineID != 0);
+       Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
  
         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
         (void) GetRedoRecPtr();
@@ -6448,6 +7221,7 @@ CreateCheckPoint(int flags)
         /* Begin filling in the checkpoint WAL record */
         MemSet(&checkPoint, 0, sizeof(checkPoint));
         checkPoint.time = (pg_time_t) time(NULL);
+       checkPoint.MaxConnections = MaxConnections;
  
         /*
          * We must hold WALInsertLock while examining insert state to determine
@@ -6743,6 +7517,27 @@ CreateCheckPoint(int flags)
                                                                          CheckpointStats.ckpt_segs_recycled);
  
         LWLockRelease(CheckpointLock);
+
+       /*
+        * Take a snapshot of running transactions and write this to WAL.
+        * This allows us to reconstruct the state of running transactions
+        * during archive recovery, if required. If we aren't archiving,
+        * don't bother.
+        *
+        * If we are shutting down, or Startup process is completing crash
+        * recovery we don't need to write running xact data.
+        */
+       if (!shutdown && XLogArchivingActive() && !RecoveryInProgress())
+       {
+               /* 
+                * GetRunningTransactionData() inserts WAL records while holding
+                * ProcArrayLock. Make sure we flush WAL first so we reduce the
+                * chance of needing to flush WAL during XLogInsert(), which might
+                * mean we hold ProcArrayLock across an I/O, which could be bad.
+                */
+               XLogBackgroundFlush();
+               GetRunningTransactionData();
+       }
  }
  
  /*
@@ -6780,6 +7575,11 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
         /* use volatile pointer to prevent code rearrangement */
         volatile XLogCtlData *xlogctl = XLogCtl;
  
+       /*
+        * Regular reports of wait statistics. Unrelated to restartpoints.
+        */
+       ReportCleanupDelayStats();
+
         /*
          * Is it safe to checkpoint?  We must ask each of the resource managers
          * whether they have any partial state information that might prevent a
@@ -6791,7 +7591,7 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
                         {
-                               elog(DEBUG2, "RM %d not safe to record restart point at %X/%X",
+                               elog(trace_recovery(DEBUG2), "RM %d not safe to record restart point at %X/%X",
                                          rmid,
                                          checkPoint->redo.xlogid,
                                          checkPoint->redo.xrecoff);
@@ -7015,6 +7815,9 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
         {
                 Oid                     nextOid;
  
+               if (InHotStandby)
+                       RecordKnownAssignedTransactionIds(record->xl_xid);
+
                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
                 if (ShmemVariableCache->nextOid < nextOid)
                 {
@@ -7036,6 +7839,15 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                 ShmemVariableCache->oldestXid = checkPoint.oldestXid;
                 ShmemVariableCache->oldestXidDB = checkPoint.oldestXidDB;
  
+               if (InHotStandby)
+               {
+                       /* We know nothing was running on the master at this point */
+                       XactClearRecoveryTransactions();
+
+                       /* Check to see if any changes to max_connections give problems */
+                       CheckMaxConnections(checkPoint.MaxConnections);
+               }
+
                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
@@ -7155,6 +7967,9 @@ xlog_outrec(StringInfo buf, XLogRecord *record)
                                          record->xl_prev.xlogid, record->xl_prev.xrecoff,
                                          record->xl_xid);
  
+       appendStringInfo(buf, "; len %u",
+                                        record->xl_len);
+
         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
         {
                 if (record->xl_info & XLR_SET_BKP_BLOCK(i))
@@ -7311,6 +8126,12 @@ pg_start_backup(PG_FUNCTION_ARGS)
                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                                  errmsg("must be superuser to run a backup")));
  
+       if (RecoveryInProgress())
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("recovery is in progress"),
+                                errhint("WAL control functions cannot be executed during recovery.")));
+
         if (!XLogArchivingActive())
                 ereport(ERROR,
                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
@@ -7498,6 +8319,12 @@ pg_stop_backup(PG_FUNCTION_ARGS)
                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                                  (errmsg("must be superuser to run a backup"))));
  
+       if (RecoveryInProgress())
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("recovery is in progress"),
+                                errhint("WAL control functions cannot be executed during recovery.")));
+
         if (!XLogArchivingActive())
                 ereport(ERROR,
                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
@@ -7659,6 +8486,12 @@ pg_switch_xlog(PG_FUNCTION_ARGS)
                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                          (errmsg("must be superuser to switch transaction log files"))));
  
+       if (RecoveryInProgress())
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("recovery is in progress"),
+                                errhint("WAL control functions cannot be executed during recovery.")));
+
         switchpoint = RequestXLogSwitch();
  
         /*
@@ -7681,6 +8514,12 @@ pg_current_xlog_location(PG_FUNCTION_ARGS)
  {
         char            location[MAXFNAMELEN];
  
+       if (RecoveryInProgress())
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("recovery is in progress"),
+                                errhint("WAL control functions cannot be executed during recovery.")));
+
         /* Make sure we have an up-to-date local LogwrtResult */
         {
                 /* use volatile pointer to prevent code rearrangement */
@@ -7708,6 +8547,12 @@ pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
         XLogRecPtr      current_recptr;
         char            location[MAXFNAMELEN];
  
+       if (RecoveryInProgress())
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("recovery is in progress"),
+                                errhint("WAL control functions cannot be executed during recovery.")));
+
         /*
          * Get the current end-of-WAL position ... shared lock is sufficient
          */
@@ -7964,7 +8809,7 @@ rm_redo_error_callback(void *arg)
  
         /* don't bother emitting empty description */
         if (buf.len > 0)
-               errcontext("xlog redo %s", buf.data);
+               errcontext("rmgr %u redo %s", record->xl_rmid, buf.data);
  
         pfree(buf.data);
  }
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c

index 11a0dc46f782dea87cdf47aa76b3a0d60ae88748..187f2ad9e4c005b10f691097addfa2374458fb7b 100644 (file)
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -404,6 +404,9 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record)
         /* Backup blocks are not used in smgr records */
         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
  
+       if (InHotStandby)
+               RecordKnownAssignedTransactionIds(record->xl_xid);
+
         if (info == XLOG_SMGR_CREATE)
         {
                 xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c

index 7d4ca41d9136dde1af737846efa1c0fdd7e61d32..c64b67adad539e44ce25d793b9dca150d0e480e3 100644 (file)
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -26,6 +26,7 @@
  
  #include "access/genam.h"
  #include "access/heapam.h"
+#include "access/transam.h"
  #include "access/xact.h"
  #include "access/xlogutils.h"
  #include "catalog/catalog.h"
@@ -51,6 +52,7 @@
  #include "utils/builtins.h"
  #include "utils/fmgroids.h"
  #include "utils/guc.h"
+#include "utils/inval.h"
  #include "utils/lsyscache.h"
  #include "utils/pg_locale.h"
  #include "utils/snapmgr.h"
@@ -1960,6 +1962,12 @@ dbase_redo(XLogRecPtr lsn, XLogRecord *record)
                 src_path = GetDatabasePath(xlrec->src_db_id, xlrec->src_tablespace_id);
                 dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id);
  
+               /*
+                * No conflict resolution is required for a create database record
+                */
+               if (InHotStandby)
+                       RecordKnownAssignedTransactionIds(record->xl_xid);
+
                 /*
                  * Our theory for replaying a CREATE is to forcibly drop the target
                  * subdirectory if present, then re-copy the source data. This may be
@@ -1993,6 +2001,29 @@ dbase_redo(XLogRecPtr lsn, XLogRecord *record)
  
                 dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id);
  
+               if (InHotStandby)
+               {
+                       VirtualTransactionId *database_users;
+
+                       RecordKnownAssignedTransactionIds(record->xl_xid);
+
+                       /*
+                        * Find all users connected to this database and ask them
+                        * politely to immediately kill their sessions before processing
+                        * the drop database record, after the usual grace period.
+                        * We don't wait for commit because drop database is
+                        * non-transactional.
+                        */
+                   database_users = GetConflictingVirtualXIDs(InvalidTransactionId,
+                                                                                                               xlrec->db_id,
+                                                                                                               false);
+
+                       ResolveRecoveryConflictWithVirtualXIDs(database_users,
+                                                                                                       "drop database",
+                                                                                                       CONFLICT_MODE_FATAL,
+                                                                                                       InvalidXLogRecPtr);
+               }
+
                 /* Drop pages for this database that are in the shared buffer cache */
                 DropDatabaseBuffers(xlrec->db_id);
  
diff --git a/src/backend/commands/discard.c b/src/backend/commands/discard.c

index 348e6e033f7ada92fd2506720a6fa09d5b10cb7f..613dbc12c38468c5fc8cb5d9f6fbb373c5812e30 100644 (file)
--- a/src/backend/commands/discard.c
+++ b/src/backend/commands/discard.c
@@ -65,7 +65,8 @@ DiscardAll(bool isTopLevel)
         ResetAllOptions();
         DropAllPreparedStatements();
         PortalHashTableDeleteAll();
-       Async_UnlistenAll();
+       if (!RecoveryInProgress())
+               Async_UnlistenAll();
         LockReleaseAll(USER_LOCKMETHOD, true);
         ResetPlanCache();
         ResetTempTableNamespace();
diff --git a/src/backend/commands/lockcmds.c b/src/backend/commands/lockcmds.c

index 1e5c92eefbc20df09da5b0b0ed3241bdf9314025..bc8ac5517cdfddb237fe1d082cdd0d0d7673eae5 100644 (file)
--- a/src/backend/commands/lockcmds.c
+++ b/src/backend/commands/lockcmds.c
@@ -47,6 +47,17 @@ LockTableCommand(LockStmt *lockstmt)
  
                 reloid = RangeVarGetRelid(relation, false);
  
+               /*
+                * During recovery we only accept these variations:
+                *
+                * LOCK TABLE foo       -- implicitly, AccessExclusiveLock
+                * LOCK TABLE foo IN ACCESS SHARE MODE
+                * LOCK TABLE foo IN ACCESS EXCLUSIVE MODE
+                */
+               if (lockstmt->mode != AccessShareLock
+                       && lockstmt->mode != AccessExclusiveLock)
+                       PreventCommandDuringRecovery();
+
                 LockTableRecurse(reloid, relation,
                                                  lockstmt->mode, lockstmt->nowait, recurse);
         }
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c

index 000c03775041605abec5d7f3b3f7609deccc5eaf..c3b572117297662018fd7a77b147d9d57e9fd0a4 100644 (file)
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -457,6 +457,9 @@ nextval_internal(Oid relid)
                                 rescnt = 0;
         bool            logit = false;
  
+       /* nextval() always writes to the database, so prevent it during recovery */
+       PreventCommandDuringRecovery();
+
         /* open and AccessShareLock sequence */
         init_sequence(relid, &elm, &seqrel);
  
@@ -1342,6 +1345,11 @@ seq_redo(XLogRecPtr lsn, XLogRecord *record)
         /* Backup blocks are not used in seq records */
         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
  
+       if (InHotStandby)
+               RecordKnownAssignedTransactionIds(record->xl_xid);
+
+       RestoreBkpBlocks(lsn, record, false);
+
         if (info != XLOG_SEQ_LOG)
                 elog(PANIC, "seq_redo: unknown op code %u", info);
  
diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c

index f119cf056afa324bcb6331e9bc067ce2ff93a2d0..f3b8ec56022f9d78eec9e3fe45ec91356203f95b 100644 (file)
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -50,6 +50,7 @@
  
  #include "access/heapam.h"
  #include "access/sysattr.h"
+#include "access/transam.h"
  #include "access/xact.h"
  #include "catalog/catalog.h"
  #include "catalog/dependency.h"
@@ -60,10 +61,12 @@
  #include "miscadmin.h"
  #include "postmaster/bgwriter.h"
  #include "storage/fd.h"
+#include "storage/procarray.h"
  #include "utils/acl.h"
  #include "utils/builtins.h"
  #include "utils/fmgroids.h"
  #include "utils/guc.h"
+#include "utils/inval.h"
  #include "utils/lsyscache.h"
  #include "utils/memutils.h"
  #include "utils/rel.h"
@@ -1296,6 +1299,12 @@ tblspc_redo(XLogRecPtr lsn, XLogRecord *record)
                 char       *location = xlrec->ts_path;
                 char       *linkloc;
  
+               /*
+                * No conflict resolution is required for a create tablespace record
+                */
+               if (InHotStandby)
+                       RecordKnownAssignedTransactionIds(record->xl_xid);
+
                 /*
                  * Attempt to coerce target directory to safe permissions.      If this
                  * fails, it doesn't exist or has the wrong owner.
@@ -1328,11 +1337,65 @@ tblspc_redo(XLogRecPtr lsn, XLogRecord *record)
         {
                 xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);
  
+               /*
+                * Process recovery transaction information
+                */
+               if (InHotStandby)
+                       RecordKnownAssignedTransactionIds(record->xl_xid);
+
+               /*
+                * If we issued a WAL record for a drop tablespace it is
+                * because there were no files in it at all. That means that
+                * no permanent objects can exist in it at this point.
+                *
+                * It is possible for standby users to be using this tablespace
+                * as a location for their temporary files, so if we fail to
+                * remove all files then do conflict processing and try again,
+                * if currently enabled.
+                */
                 if (!remove_tablespace_directories(xlrec->ts_id, true))
-                       ereport(ERROR,
+               {
+                       VirtualTransactionId *temp_file_users;
+
+                       /*
+                        * Standby users may be currently using this tablespace
+                        * for their temporary files. We only care about current
+                        * users because the temp_tablespaces parameter will just
+                        * ignore tablespaces that no longer exist.
+                        *
+                        * Ask everybody to cancel their queries immediately so
+                        * we can ensure no temp files remain and we can remove the
+                        * tablespace. Nuke the entire site from orbit, it's the only
+                        * way to be sure.
+                        * 
+                        * XXX: We could work out the pids of active backends
+                        * using this tablespace by examining the temp filenames in the
+                        * directory. We would then convert the pids into VirtualXIDs
+                        * before attempting to cancel them.
+                        *
+                        * We don't wait for commit because drop tablespace is
+                        * non-transactional.
+                        */
+                       temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
+                                                                                                               InvalidOid,
+                                                                                                               false);
+                       ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
+                                                                                                       "drop tablespace",
+                                                                                                       CONFLICT_MODE_ERROR_IF_NOT_IDLE,
+                                                                                                       InvalidXLogRecPtr);
+
+                       /*
+                        * If we did recovery processing then hopefully the
+                        * backends who wrote temp files should have cleaned up and
+                        * exited by now. So let's recheck before we throw an error.
+                        * If files still remain then this will just fail again.
+                        */
+                       if (!remove_tablespace_directories(xlrec->ts_id, true))
+                               ereport(ERROR,
                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                                          errmsg("tablespace %u is not empty",
-                                                       xlrec->ts_id)));
+                                                                       xlrec->ts_id)));
+               }
         }
         else
                 elog(PANIC, "tblspc_redo: unknown op code %u", info);
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c

index c9c9baad54b664406ba710052e8e61e9acaacc10..4fe3bfbf4128024f58f114f8444d481929dc82f3 100644 (file)
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -140,6 +140,7 @@ typedef struct VRelStats
         /* vtlinks array for tuple chain following - sorted by new_tid */
         int                     num_vtlinks;
         VTupleLink      vtlinks;
+       TransactionId   latestRemovedXid;
  } VRelStats;
  
  /*----------------------------------------------------------------------
@@ -223,7 +224,7 @@ static void scan_heap(VRelStats *vacrelstats, Relation onerel,
  static void repair_frag(VRelStats *vacrelstats, Relation onerel,
                         VacPageList vacuum_pages, VacPageList fraged_pages,
                         int nindexes, Relation *Irel);
-static void move_chain_tuple(Relation rel,
+static void move_chain_tuple(VRelStats *vacrelstats, Relation rel,
                                  Buffer old_buf, Page old_page, HeapTuple old_tup,
                                  Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
                                  ExecContext ec, ItemPointer ctid, bool cleanVpd);
@@ -236,7 +237,7 @@ static void update_hint_bits(Relation rel, VacPageList fraged_pages,
                                  int num_moved);
  static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
                         VacPageList vacpagelist);
-static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
+static void vacuum_page(VRelStats *vacrelstats, Relation onerel, Buffer buffer, VacPage vacpage);
  static void vacuum_index(VacPageList vacpagelist, Relation indrel,
                          double num_tuples, int keep_tuples);
  static void scan_index(Relation indrel, double num_tuples);
@@ -1268,6 +1269,7 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
         vacrelstats->rel_tuples = 0;
         vacrelstats->rel_indexed_tuples = 0;
         vacrelstats->hasindex = false;
+       vacrelstats->latestRemovedXid = InvalidTransactionId;
  
         /* scan the heap */
         vacuum_pages.num_pages = fraged_pages.num_pages = 0;
@@ -1671,6 +1673,9 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
                         {
                                 ItemId          lpp;
  
+                               HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
+                                                                                       &vacrelstats->latestRemovedXid);
+
                                 /*
                                  * Here we are building a temporary copy of the page with dead
                                  * tuples removed.      Below we will apply
@@ -1984,7 +1989,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                 /* there are dead tuples on this page - clean them */
                                 Assert(!isempty);
                                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
-                               vacuum_page(onerel, buf, last_vacuum_page);
+                               vacuum_page(vacrelstats, onerel, buf, last_vacuum_page);
                                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
                         }
                         else
@@ -2473,7 +2478,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                         tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
                                         tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
  
-                                       move_chain_tuple(onerel, Cbuf, Cpage, &tuple,
+                                       move_chain_tuple(vacrelstats, onerel, Cbuf, Cpage, &tuple,
                                                                          dst_buffer, dst_page, destvacpage,
                                                                          &ec, &Ctid, vtmove[ti].cleanVpd);
  
@@ -2559,7 +2564,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                 dst_page = BufferGetPage(dst_buffer);
                                 /* if this page was not used before - clean it */
                                 if (!PageIsEmpty(dst_page) && dst_vacpage->offsets_used == 0)
-                                       vacuum_page(onerel, dst_buffer, dst_vacpage);
+                                       vacuum_page(vacrelstats, onerel, dst_buffer, dst_vacpage);
                         }
                         else
                                 LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
@@ -2708,7 +2713,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                  * already committed"!
                  */
                 ForceSyncCommit();
-               (void) RecordTransactionCommit();
+               (void) RecordTransactionCommit(true);
         }
  
         /*
@@ -2736,7 +2741,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
                         page = BufferGetPage(buf);
                         if (!PageIsEmpty(page))
-                               vacuum_page(onerel, buf, *curpage);
+                               vacuum_page(vacrelstats, onerel, buf, *curpage);
                         UnlockReleaseBuffer(buf);
                 }
         }
@@ -2872,7 +2877,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                 recptr = log_heap_clean(onerel, buf,
                                                                                 NULL, 0, NULL, 0,
                                                                                 unused, uncnt,
-                                                                               false);
+                                                                               vacrelstats->latestRemovedXid, false);
                                 PageSetLSN(page, recptr);
                                 PageSetTLI(page, ThisTimeLineID);
                         }
@@ -2922,7 +2927,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
   *             already too long and almost unreadable.
   */
  static void
-move_chain_tuple(Relation rel,
+move_chain_tuple(VRelStats *vacrelstats, Relation rel,
                                  Buffer old_buf, Page old_page, HeapTuple old_tup,
                                  Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
                                  ExecContext ec, ItemPointer ctid, bool cleanVpd)
@@ -2980,7 +2985,7 @@ move_chain_tuple(Relation rel,
                 int                     sv_offsets_used = dst_vacpage->offsets_used;
  
                 dst_vacpage->offsets_used = 0;
-               vacuum_page(rel, dst_buf, dst_vacpage);
+               vacuum_page(vacrelstats, rel, dst_buf, dst_vacpage);
                 dst_vacpage->offsets_used = sv_offsets_used;
         }
  
@@ -3320,7 +3325,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
                         buf = ReadBufferExtended(onerel, MAIN_FORKNUM, (*vacpage)->blkno,
                                                                          RBM_NORMAL, vac_strategy);
                         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
-                       vacuum_page(onerel, buf, *vacpage);
+                       vacuum_page(vacrelstats, onerel, buf, *vacpage);
                         UnlockReleaseBuffer(buf);
                 }
         }
@@ -3350,7 +3355,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
   * Caller must hold pin and lock on buffer.
   */
  static void
-vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
+vacuum_page(VRelStats *vacrelstats, Relation onerel, Buffer buffer, VacPage vacpage)
  {
         Page            page = BufferGetPage(buffer);
         int                     i;
@@ -3379,7 +3384,7 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
                 recptr = log_heap_clean(onerel, buffer,
                                                                 NULL, 0, NULL, 0,
                                                                 vacpage->offsets, vacpage->offsets_free,
-                                                               false);
+                                                               vacrelstats->latestRemovedXid, false);
                 PageSetLSN(page, recptr);
                 PageSetTLI(page, ThisTimeLineID);
         }
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c

index be7737fad33ab73dbd6fdc97cbf1114012e29dcf..990635460168aa9b0437ae624dcb9730146339de 100644 (file)
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -98,6 +98,7 @@ typedef struct LVRelStats
         int                     max_dead_tuples;        /* # slots allocated in array */
         ItemPointer dead_tuples;        /* array of ItemPointerData */
         int                     num_index_scans;
+       TransactionId latestRemovedXid;
  } LVRelStats;
  
  
@@ -248,6 +249,34 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
                 *scanned_all = vacrelstats->scanned_all;
  }
  
+/*
+ * For Hot Standby we need to know the highest transaction id that will
+ * be removed by any change. VACUUM proceeds in a number of passes so
+ * we need to consider how each pass operates. Phase one runs
+ * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
+ * progresses - these will have a latestRemovedXid on each record.
+ * In some cases this removes all of the tuples to be removed, though
+ * often we have dead tuples with index pointers so we must remember them
+ * for removal in phase three. Index records for those rows are removed
+ * in phase two and index blocks do not have MVCC information attached.
+ * So before we can allow removal of any index tuples we need to issue
+ * a WAL record containing the latestRemovedXid of rows that will be
+ * removed in phase three. This allows recovery queries to block at the
+ * correct place, i.e. before phase two, rather than during phase three
+ * which would be after the rows have become inaccessible.
+ */
+static void
+vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
+{
+       /*
+        * No need to log changes for temp tables, they do not contain data
+        * visible on the standby server; nor when WAL archiving is inactive.
+        */
+       if (rel->rd_istemp || !XLogArchivingActive())
+               return;
+
+       (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
+}
  
  /*
   *     lazy_scan_heap() -- scan an open heap relation
@@ -298,6 +327,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
         nblocks = RelationGetNumberOfBlocks(onerel);
         vacrelstats->rel_pages = nblocks;
         vacrelstats->nonempty_pages = 0;
+       vacrelstats->latestRemovedXid = InvalidTransactionId;
  
         lazy_space_alloc(vacrelstats, nblocks);
  
@@ -356,6 +386,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                 if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
                         vacrelstats->num_dead_tuples > 0)
                 {
+                       /* Log cleanup info before we touch indexes */
+                       vacuum_log_cleanup_info(onerel, vacrelstats);
+
                         /* Remove index entries */
                         for (i = 0; i < nindexes; i++)
                                 lazy_vacuum_index(Irel[i],
@@ -365,6 +398,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                         lazy_vacuum_heap(onerel, vacrelstats);
                         /* Forget the now-vacuumed tuples, and press on */
                         vacrelstats->num_dead_tuples = 0;
+                       vacrelstats->latestRemovedXid = InvalidTransactionId;
                         vacrelstats->num_index_scans++;
                 }
  
@@ -596,6 +630,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                         if (tupgone)
                         {
                                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
+                               HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
+                                                                                               &vacrelstats->latestRemovedXid);
                                 tups_vacuumed += 1;
                         }
                         else
@@ -644,6 +680,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                         lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
                         /* Forget the now-vacuumed tuples, and press on */
                         vacrelstats->num_dead_tuples = 0;
+                       vacrelstats->latestRemovedXid = InvalidTransactionId;
                         vacuumed_pages++;
                 }
  
@@ -707,6 +744,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
         /* XXX put a threshold on min number of tuples here? */
         if (vacrelstats->num_dead_tuples > 0)
         {
+               /* Log cleanup info before we touch indexes */
+               vacuum_log_cleanup_info(onerel, vacrelstats);
+
                 /* Remove index entries */
                 for (i = 0; i < nindexes; i++)
                         lazy_vacuum_index(Irel[i],
@@ -851,7 +891,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
                 recptr = log_heap_clean(onerel, buffer,
                                                                 NULL, 0, NULL, 0,
                                                                 unused, uncnt,
-                                                               false);
+                                                               vacrelstats->latestRemovedXid, false);
                 PageSetLSN(page, recptr);
                 PageSetTLI(page, ThisTimeLineID);
         }
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c

index e94181a8c12a6e086e72dfeb2807dd808c30edd6..3678ea41926c0c87f65de25408d59f0278d03dee 100644 (file)
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -245,8 +245,9 @@ static bool RecoveryError = false;          /* T if WAL recovery failed */
   * When archive recovery is finished, the startup process exits with exit
   * code 0 and we switch to PM_RUN state.
   *
- * Normal child backends can only be launched when we are in PM_RUN state.
- * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
+ * Normal child backends can only be launched when we are in PM_RUN or
+ * PM_RECOVERY_CONSISTENT state.  (We also allow launch of normal
+ * child backends in PM_WAIT_BACKUP state, but only for superusers.)
   * In other states we handle connection requests by launching "dead_end"
   * child processes, which will simply send the client an error message and
   * quit.  (We track these in the BackendList so that we can know when they
@@ -1868,7 +1869,7 @@ static enum CAC_state
  canAcceptConnections(void)
  {
         /*
-        * Can't start backends when in startup/shutdown/recovery state.
+        * Can't start backends when in startup/shutdown/inconsistent recovery state.
          *
          * In state PM_WAIT_BACKUP only superusers can connect (this must be
          * allowed so that a superuser can end online backup mode); we return
@@ -1882,9 +1883,11 @@ canAcceptConnections(void)
                         return CAC_SHUTDOWN;    /* shutdown is pending */
                 if (!FatalError &&
                         (pmState == PM_STARTUP ||
-                        pmState == PM_RECOVERY ||
-                        pmState == PM_RECOVERY_CONSISTENT))
+                        pmState == PM_RECOVERY))
                         return CAC_STARTUP; /* normal startup */
+               if (!FatalError &&
+                        pmState == PM_RECOVERY_CONSISTENT)
+                       return CAC_OK; /* connection OK during recovery */
                 return CAC_RECOVERY;    /* else must be crash recovery */
         }
  
@@ -4003,9 +4006,8 @@ sigusr1_handler(SIGNAL_ARGS)
                 Assert(PgStatPID == 0);
                 PgStatPID = pgstat_start();
  
-               /* XXX at this point we could accept read-only connections */
-               ereport(DEBUG1,
-                               (errmsg("database system is in consistent recovery mode")));
+               ereport(LOG,
+                                (errmsg("database system is ready to accept read only connections")));
  
                 pmState = PM_RECOVERY_CONSISTENT;
         }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c

index de28374b40507ac494c85a154cc0c39547d38409..1444f72b8ff7d608453a11b9482abf5835eaffce 100644 (file)
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -33,6 +33,8 @@
  #include <sys/file.h>
  #include <unistd.h>
  
+#include "access/xact.h"
+#include "access/xlogdefs.h"
  #include "catalog/catalog.h"
  #include "miscadmin.h"
  #include "pg_trace.h"
@@ -78,7 +80,9 @@ static bool IsForInput;
  
  /* local state for LockBufferForCleanup */
  static volatile BufferDesc *PinCountWaitBuf = NULL;
-
+static long            CleanupWaitSecs = 0;
+static int             CleanupWaitUSecs = 0;
+static bool            CleanupWaitStats = false;
  
  static Buffer ReadBuffer_common(SMgrRelation reln, bool isLocalBuf,
                                   ForkNumber forkNum, BlockNumber blockNum,
@@ -89,6 +93,7 @@ static void PinBuffer_Locked(volatile BufferDesc *buf);
  static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner);
  static void BufferSync(int flags);
  static int     SyncOneBuffer(int buf_id, bool skip_recently_used);
+static void CleanupDelayStats(TimestampTz start_ts, TimestampTz end_ts);
  static void WaitIO(volatile BufferDesc *buf);
  static bool StartBufferIO(volatile BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
@@ -2441,6 +2446,8 @@ LockBufferForCleanup(Buffer buffer)
  
         for (;;)
         {
+               TimestampTz     start_ts = 0;
+
                 /* Try to acquire lock */
                 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
                 LockBufHdr(bufHdr);
@@ -2463,9 +2470,14 @@ LockBufferForCleanup(Buffer buffer)
                 PinCountWaitBuf = bufHdr;
                 UnlockBufHdr(bufHdr);
                 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+               if (CleanupWaitStats)
+                       start_ts = GetCurrentTimestamp();
                 /* Wait to be signaled by UnpinBuffer() */
                 ProcWaitForSignal();
                 PinCountWaitBuf = NULL;
+               if (CleanupWaitStats)
+                       CleanupDelayStats(start_ts, GetCurrentTimestamp());
+
                 /* Loop back and try again */
         }
  }
@@ -2518,6 +2530,54 @@ ConditionalLockBufferForCleanup(Buffer buffer)
         return false;
  }
  
+/*
+ * StartCleanupDelayStats -- begin accumulating cleanup-lock wait time
+ *
+ * On standby servers only the Startup process applies Cleanup, so a single
+ * buffer pin can be enough to effectively halt recovery for short periods.
+ * This instrumentation lets us monitor that and judge whether additional
+ * measures are required to control the negative effects.
+ *
+ * Zeroes the accumulated total and switches on the timing code in
+ * LockBufferForCleanup().
+ */
+void
+StartCleanupDelayStats(void)
+{
+       /* start from a clean total ... */
+       CleanupWaitUSecs = 0;
+       CleanupWaitSecs = 0;
+
+       /* ... and only then enable collection */
+       CleanupWaitStats = true;
+}
+
+/*
+ * EndCleanupDelayStats -- stop accumulating cleanup-lock wait time
+ *
+ * Only the enable flag is cleared; the totals gathered so far are left
+ * in place.
+ */
+void
+EndCleanupDelayStats(void)
+{
+       CleanupWaitStats = false;
+}
+
+/*
+ * ReportCleanupDelayStats -- log the accumulated cleanup-lock wait time
+ *
+ * Called by the Startup process whenever we request a restartpoint.
+ * Emits the running total at DEBUG1 as seconds plus milliseconds.
+ */
+void
+ReportCleanupDelayStats(void)
+{
+       int                     wait_msecs;
+
+       Assert(InRecovery);
+
+       /* the fractional part is kept in microseconds; report milliseconds */
+       wait_msecs = CleanupWaitUSecs / 1000;
+
+       ereport(DEBUG1,
+                       (errmsg("cleanup wait total=%ld.%03d s",
+                                       CleanupWaitSecs, wait_msecs)));
+}
+
+/*
+ * CleanupDelayStats -- add one wait interval to the running total
+ *
+ * start_ts and end_ts bracket a single wait in LockBufferForCleanup().
+ * The total is kept as whole seconds plus a microsecond remainder; a full
+ * second is carried out of the remainder whenever it accumulates.
+ */
+static void
+CleanupDelayStats(TimestampTz start_ts, TimestampTz end_ts)
+{
+       long                    elapsed_secs;
+       int                     elapsed_usecs;
+
+       TimestampDifference(start_ts, end_ts, &elapsed_secs, &elapsed_usecs);
+
+       CleanupWaitSecs += elapsed_secs;
+       CleanupWaitUSecs += elapsed_usecs;
+
+       /* carry into the seconds once the remainder reaches one second */
+       if (CleanupWaitUSecs >= 1000000)
+       {
+               CleanupWaitUSecs -= 1000000;
+               CleanupWaitSecs++;
+       }
+}
  
  /*
   *     Functions for buffer I/O handling
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c

index d3b94e76fa6e7b4e02786e149a2a38f621fdeaa4..28d1cf0745ad3e5f8274ea54f528605083835312 100644 (file)
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -17,6 +17,20 @@
   * as are the myProcLocks lists.  They can be distinguished from regular
   * backend PGPROCs at need by checking for pid == 0.
   *
+ * During recovery, we also keep a list of XIDs representing transactions
+ * that are known to be running at current point in WAL recovery. This
+ * list is kept in the KnownAssignedXids array, and updated by watching
+ * the sequence of arriving xids. This is very important because if we leave
+ * those xids out of the snapshot then they will appear to be already complete.
+ * Later, when they have actually completed this could lead to confusion as to
+ * whether those xids are visible or not, blowing a huge hole in MVCC.
+ * We need 'em.
+ *
+ * It is theoretically possible for a FATAL error to explode before writing
+ * an abort record. This could tie up KnownAssignedXids indefinitely, so
+ * we prune the array when a valid list of running xids arrives. These quirks,
+ * if they do ever exist in reality, will not affect the correctness of
+ * snapshots.
   *
   * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
@@ -33,28 +47,50 @@
  
  #include "access/subtrans.h"
  #include "access/transam.h"
-#include "access/xact.h"
+#include "access/xlog.h"
  #include "access/twophase.h"
  #include "miscadmin.h"
+#include "storage/proc.h"
  #include "storage/procarray.h"
+#include "storage/sinval.h"
+#include "utils/inval.h"
  #include "utils/snapmgr.h"
  
+static RunningXactsData        CurrentRunningXactsData;
  
  /* Our shared memory area */
  typedef struct ProcArrayStruct
  {
         int                     numProcs;               /* number of valid procs entries */
-       int                     maxProcs;               /* allocated size of procs array */
+       int                     maxProcs;                       /* allocated size of total procs array */
+
+       int                     maxKnownAssignedXids;   /* allocated size of known assigned xids */
+       /*
+        * Last subxid that overflowed KnownAssignedXids array. Similar to
+        * overflowing subxid cached in PGPROC entries.
+        */
+       TransactionId   lastOverflowedXid;
  
         /*
          * We declare procs[] as 1 entry because C wants a fixed-size array, but
          * actually it is maxProcs entries long.
          */
         PGPROC     *procs[1];           /* VARIABLE LENGTH ARRAY */
+
+       /* ARRAY OF KNOWN ASSIGNED TRANSACTION XIDs FOLLOWS */
  } ProcArrayStruct;
  
  static ProcArrayStruct *procArray;
  
+/*
+ * Bookkeeping for tracking emulated transactions in recovery
+ */
+static HTAB *KnownAssignedXidsHash;
+static TransactionId   latestObservedXid = InvalidTransactionId;
+static TransactionId   initLatestRunningXid;
+static bool                    initRunningXactData = false;
+static bool                    recoverySnapshotValid = false;
+static TransactionId   nextWraparoundCheckXid = InvalidTransactionId;
  
  #ifdef XIDCACHE_DEBUG
  
@@ -90,6 +126,18 @@ static void DisplayXidCache(void);
  #define xc_slow_answer_inc()           ((void) 0)
  #endif   /* XIDCACHE_DEBUG */
  
+/* Primitives for KnownAssignedXids array handling for standby */
+static Size KnownAssignedXidsShmemSize(int size);
+static void KnownAssignedXidsInit(int size);
+static int  KnownAssignedXidsGet(TransactionId *xarray, 
+                                                                                       TransactionId *xmin,
+                                                                                       TransactionId xmax,
+                                                                                       bool *overflow);
+static void KnownAssignedXidsAdd(TransactionId xid, int nsubxids,
+                                                                                       TransactionId *subxid);
+static void KnownAssignedXidsRemove(TransactionId xid, bool report_error);
+static void KnownAssignedXidsRemoveMany(TransactionId xid);
+static void KnownAssignedXidsDisplay(int trace_level);
  
  /*
   * Report shared-memory space needed by CreateSharedProcArray.
@@ -100,8 +148,14 @@ ProcArrayShmemSize(void)
         Size            size;
  
         size = offsetof(ProcArrayStruct, procs);
-       size = add_size(size, mul_size(sizeof(PGPROC *),
-                                                                add_size(MaxBackends, max_prepared_xacts)));
+
+       /* Normal processing - MyProc slots */
+#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts)
+       size = add_size(size, mul_size(sizeof(PGPROC *), PROCARRAY_MAXPROCS));
+
+       /* Recovery processing  - KnownAssignedXids */
+#define MAX_KNOWN_ASSIGNED_XIDS ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
+       size = add_size(size, KnownAssignedXidsShmemSize(MAX_KNOWN_ASSIGNED_XIDS));
  
         return size;
  }
@@ -116,15 +170,34 @@ CreateSharedProcArray(void)
  
         /* Create or attach to the ProcArray shared structure */
         procArray = (ProcArrayStruct *)
-               ShmemInitStruct("Proc Array", ProcArrayShmemSize(), &found);
+               ShmemInitStruct("Proc Array", 
+                                                       mul_size(sizeof(PGPROC *), PROCARRAY_MAXPROCS), 
+                                                       &found);
+
+       /*
+        * XXX currently we don't know that we are InHotStandby until
+        * after we have created shared memory. If recovery.conf was read
+        * earlier then we would know whether to create this or not. We
+        * only need it if we startup in recovery. This is 26kB by default,
+        * plus hash table overhead, so don't worry too much. It's passive
+        * so there's less danger of it causing trouble when not in use.
+        */
+       KnownAssignedXidsInit(MAX_KNOWN_ASSIGNED_XIDS);
  
         if (!found)
         {
                 /*
                  * We're the first - initialize.
                  */
+               /* Normal processing */
                 procArray->numProcs = 0;
-               procArray->maxProcs = MaxBackends + max_prepared_xacts;
+               procArray->maxProcs = PROCARRAY_MAXPROCS;
+
+               /*
+                * If you change this, also change ProcArrayShmemSize()
+                */
+               procArray->maxKnownAssignedXids = MAX_KNOWN_ASSIGNED_XIDS;
+               procArray->lastOverflowedXid = InvalidTransactionId;
         }
  }
  
@@ -302,6 +375,8 @@ ProcArrayClearTransaction(PGPROC *proc)
         proc->xid = InvalidTransactionId;
         proc->lxid = InvalidLocalTransactionId;
         proc->xmin = InvalidTransactionId;
+       proc->recoveryConflictMode = 0;
+       proc->recoveryConflictLSN = InvalidXLogRecPtr;
  
         /* redundant, but just in case */
         proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
@@ -312,6 +387,194 @@ ProcArrayClearTransaction(PGPROC *proc)
         proc->subxids.overflowed = false;
  }
  
+/*
+ * ProcArrayDisplay -- dump the contents of the proc array for debugging
+ *
+ * Emits one elog() line at trace_level for each PGPROC entry, showing its
+ * xid and any cached subxids, then dumps KnownAssignedXids at the same
+ * level.
+ *
+ * No lock is taken here.  NOTE(review): callers presumably either hold
+ * ProcArrayLock or run while nobody else can change the array -- confirm.
+ */
+static void
+ProcArrayDisplay(int trace_level)
+{
+       ProcArrayStruct *arrayP = procArray;
+       int                     index;
+       StringInfoData buf;
+
+       /* allocate the buffer once and reuse it, rather than once per entry */
+       initStringInfo(&buf);
+
+       for (index = 0; index < arrayP->numProcs; index++)
+       {
+               PGPROC     *proc = arrayP->procs[index];
+               TransactionId xid = proc->xid;
+               int                     nsubxids = proc->subxids.nxids;
+               int                     i;
+
+               resetStringInfo(&buf);
+
+               /* TransactionId is unsigned, so it must be printed with %u */
+               appendStringInfo(&buf, "procarray %d xid %u", index, xid);
+               if (nsubxids > 0)
+               {
+                       /* leading space keeps the count apart from the xid */
+                       appendStringInfo(&buf, " nsubxids %d : ", nsubxids);
+
+                       for (i = 0; i < nsubxids; i++)
+                               appendStringInfo(&buf, "%u ", proc->subxids.xids[i]);
+               }
+
+               elog(trace_level, "%s", buf.data);
+       }
+
+       pfree(buf.data);
+
+       KnownAssignedXidsDisplay(trace_level);
+}
+
+/*
+ * ProcArrayApplyRecoveryInfo -- apply recovery info about xids and locks
+ *
+ * Takes us through 3 states: Uninitialized (start), Init and Valid.
+ * Normal case is to go all the way to Valid straight away, though there
+ * are atypical cases where we need to take it in steps.
+ *
+ * Use the data about running transactions on master to create the
+ * initial state of KnownAssignedXids. We also use these records to
+ * regularly prune KnownAssignedXids, because we know it is possible that
+ * some transactions with FATAL errors do not write abort records, which
+ * could cause eventual overflow.
+ *
+ * lsn is the location of the record being applied (not used in this
+ * function at present); xlrec is the running-xacts record itself.
+ *
+ * Only used during recovery. Notice the signature is very similar to a
+ * _redo function and it's difficult to decide exactly where this code
+ * should reside.
+ */
+void
+ProcArrayApplyRecoveryInfo(XLogRecPtr lsn, xl_xact_running_xacts *xlrec)
+{
+       int                             xid_index;      /* main loop */
+       TransactionId   *xids;
+       int                             nxids;
+       RunningXact             *rxact;
+       TransactionId   *subxip;
+       xl_rel_lock             *loggableLocks;
+
+       Assert(InHotStandby);
+
+       /*
+        * Remove stale transactions, if any.
+        */
+       ExpireOldKnownAssignedTransactionIds(xlrec->oldestRunningXid);
+
+       /*
+        * If our snapshot is already valid, nothing else to do...
+        */
+       if (recoverySnapshotValid)
+               return;
+
+       /*
+        * If our initial RunningXactData had an overflowed snapshot then we
+        * knew we were missing some subxids from our snapshot. We can use
+        * this data as an initial snapshot, but we cannot yet mark it valid.
+        * We know that the missing subxids are equal to or earlier than
+        * LatestRunningXid. After we initialise we continue to apply changes
+        * during recovery, so once the oldestRunningXid is later than the
+        * initLatestRunningXid we can now prove that we no longer have
+        * missing information and can mark the snapshot as valid.
+        *
+        * (recoverySnapshotValid is known to be false here because of the
+        * early return just above.)
+        */
+       if (initRunningXactData)
+       {
+               if (TransactionIdPrecedes(initLatestRunningXid, xlrec->oldestRunningXid))
+               {
+                       recoverySnapshotValid = true;
+                       elog(trace_recovery(DEBUG2), 
+                                       "running xact data now proven complete");
+                       elog(trace_recovery(DEBUG2), 
+                                       "recovery snapshots are now enabled");
+               }
+               return;
+       }
+
+       /*
+        * Can't initialise with an incomplete set of lock information
+        */
+       if (xlrec->lock_overflow)
+       {
+               elog(trace_recovery(DEBUG2), 
+                               "running xact data has incomplete lock data");
+               return;
+       }
+
+       /*
+        * OK, we need to initialise from the RunningXactData record.
+        * Everything below runs at most once per process, because every later
+        * call returns from one of the branches above.
+        */
+       initRunningXactData = true;
+       latestObservedXid = initLatestRunningXid = xlrec->latestRunningXid;
+
+       /*
+        * If the snapshot overflowed, then we still initialise with what we
+        * know, but the recovery snapshot isn't fully valid yet because we
+        * know there are some subxids missing (ergo we don't know which ones)
+        */
+       if (!xlrec->subxid_overflow)
+               recoverySnapshotValid = true;
+       else
+               elog(trace_recovery(DEBUG2), 
+                               "running xact data has incomplete subtransaction data");
+
+       /*
+        * NOTE(review): xids is never freed. That is harmless for a path that
+        * runs only once, but confirm KnownAssignedXidsAdd() does not keep the
+        * pointer before adding a pfree().
+        */
+       xids = palloc(sizeof(TransactionId) * (xlrec->xcnt + xlrec->subxcnt));
+       nxids = 0;
+
+       ProcArrayDisplay(trace_recovery(DEBUG3));
+
+       /*
+        * Scan through the incoming array of RunningXacts and collect xids.
+        * We don't use SubtransSetParent because it doesn't matter yet. If
+        * we aren't overflowed then all xids will fit in snapshot and so we
+        * don't need subtrans. If we later overflow, an xid assignment record
+        * will add xids to subtrans. If RunningXacts is overflowed then we
+        * don't have enough information to correctly update subtrans anyway.
+        *
+        * The subxid array starts right after the xcnt RunningXact entries.
+        */
+       rxact = (RunningXact *) xlrec->xrun;
+       subxip = (TransactionId *) &(xlrec->xrun[xlrec->xcnt]);
+
+       for (xid_index = 0; xid_index < xlrec->xcnt; xid_index++)
+       {
+               int                     i;
+
+               xids[nxids++] = rxact[xid_index].xid;
+               for (i = 0; i < rxact[xid_index].nsubxids; i++)
+                       xids[nxids++] = subxip[rxact[xid_index].subx_offset + i];
+       }
+
+       /*
+        * Nobody else is running yet, but take locks anyhow
+        */
+       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+       /* Advance global latestCompletedXid while holding the lock */
+       if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
+                                                         xlrec->latestCompletedXid))
+               ShmemVariableCache->latestCompletedXid = xlrec->latestCompletedXid;
+
+       /*
+        * Add our new xids into the array
+        */
+       KnownAssignedXidsAdd(InvalidTransactionId, nxids, xids);
+
+       ProcArrayDisplay(trace_recovery(DEBUG3));
+
+       LWLockRelease(ProcArrayLock);
+
+       /*
+        * Locks array is after the xids and subxids, as described in snapshot.h
+        *
+        * NOTE(review): this indexes xrun[] in units of its element type for
+        * the subxids as well, whereas the loop above treats the subxids as a
+        * plain TransactionId array starting at xrun[xcnt]. Confirm this
+        * address matches where the record writer puts the locks.
+        */
+       loggableLocks = (xl_rel_lock *) &(xlrec->xrun[(xlrec->xcnt + xlrec->subxcnt)]);
+       relation_redo_locks(loggableLocks, xlrec->numLocks);
+
+       elog(trace_recovery(DEBUG2), 
+               "running transaction data initialized");
+       if (recoverySnapshotValid)
+               elog(trace_recovery(DEBUG2), 
+                       "recovery snapshots are now enabled");
+}
+
+/*
+ * IsRunningXactDataValid -- is the data available to allow valid snapshots?
+ *
+ * Returns the process-local recoverySnapshotValid flag, which is set by
+ * ProcArrayApplyRecoveryInfo() once the running-xact data it has applied
+ * is known to be complete.
+ */
+bool
+IsRunningXactDataValid(void)
+{
+       return recoverySnapshotValid;
+}
  
  /*
   * TransactionIdIsInProgress -- is given transaction running in some backend
@@ -590,6 +853,9 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
         TransactionId result;
         int                     index;
  
+       /* Cannot look for individual databases during recovery */
+       Assert(allDbs || !RecoveryInProgress());
+
         LWLockAcquire(ProcArrayLock, LW_SHARED);
  
         /*
@@ -656,7 +922,7 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
   * but since PGPROC has only a limited cache area for subxact XIDs, full
   * information may not be available.  If we find any overflowed subxid arrays,
   * we have to mark the snapshot's subxid data as overflowed, and extra work
- * will need to be done to determine what's running (see XidInMVCCSnapshot()
+ * *may* need to be done to determine what's running (see XidInMVCCSnapshot()
   * in tqual.c).
   *
   * We also update the following backend-global variables:
@@ -698,7 +964,8 @@ GetSnapshotData(Snapshot snapshot)
         if (snapshot->xip == NULL)
         {
                 /*
-                * First call for this snapshot
+                * First call for this snapshot. Snapshot is same size whether
+                * or not we are in recovery, see later comments.
                  */
                 snapshot->xip = (TransactionId *)
                         malloc(arrayP->maxProcs * sizeof(TransactionId));
@@ -708,13 +975,15 @@ GetSnapshotData(Snapshot snapshot)
                                          errmsg("out of memory")));
                 Assert(snapshot->subxip == NULL);
                 snapshot->subxip = (TransactionId *)
-                       malloc(arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
+                       malloc((arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS) * sizeof(TransactionId));
                 if (snapshot->subxip == NULL)
                         ereport(ERROR,
                                         (errcode(ERRCODE_OUT_OF_MEMORY),
                                          errmsg("out of memory")));
         }
  
+       snapshot->takenDuringRecovery = RecoveryInProgress();
+
         /*
          * It is sufficient to get shared lock on ProcArrayLock, even if we are
          * going to set MyProc->xmin.
@@ -763,6 +1032,7 @@ GetSnapshotData(Snapshot snapshot)
                  */
                 if (TransactionIdIsNormal(xid))
                 {
+                       Assert(!snapshot->takenDuringRecovery);
                         if (TransactionIdFollowsOrEquals(xid, xmax))
                                 continue;
                         if (proc != MyProc)
@@ -795,6 +1065,7 @@ GetSnapshotData(Snapshot snapshot)
  
                                 if (nxids > 0)
                                 {
+                                       Assert(!snapshot->takenDuringRecovery);
                                         memcpy(snapshot->subxip + subcount,
                                                    (void *) proc->subxids.xids,
                                                    nxids * sizeof(TransactionId));
@@ -804,6 +1075,47 @@ GetSnapshotData(Snapshot snapshot)
                 }
         }
  
+       /*
+        * If in recovery get any known assigned xids.
+        */
+       if (snapshot->takenDuringRecovery)
+       {
+               bool overflow = false;
+
+               Assert(count == 0);
+
+               /*
+                * We store all xids directly into subxip[]. Here's why: 
+                *
+                * In recovery we don't know which xids are top-level and which are
+                * subxacts, a design choice that greatly simplifies xid processing.
+                *
+                * It seems like we would want to try to put xids into xip[] only,
+                * but that is fairly small. We would either need to make that bigger
+                * or to increase the rate at which we WAL-log xid assignment;
+                * neither is an appealing choice.
+                *
+                * We could try to store xids into xip[] first and then into subxip[]
+                * if there are too many xids. That only works if the snapshot doesn't
+                * overflow because we do not search subxip[] in that case. A simpler
+                * way is to just store all xids in the subxact array because this
+                * is by far the bigger array. We just leave the xip array empty.
+                *
+                * Either way we need to change the way XidInMVCCSnapshot() works
+                * depending upon when the snapshot was taken, or change normal
+                * snapshot processing so it matches.
+                */
+               subcount = GetKnownAssignedTransactions(snapshot->subxip, 
+                                                                                               &xmin, xmax, &overflow);
+
+               /*
+                * See if we have removed any subxids from KnownAssignedXids that
+                * we might need to see. If so, mark snapshot overflowed.
+                */
+               if (overflow)
+                       subcount = -1;  /* overflowed */
+       }
+
         if (!TransactionIdIsValid(MyProc->xmin))
                 MyProc->xmin = TransactionXmin = xmin;
  
@@ -839,6 +1151,261 @@ GetSnapshotData(Snapshot snapshot)
         return snapshot;
  }
  
+/*
+ * GetRunningTransactionData -- returns information about running transactions.
+ *
+ * Similar to GetSnapshotData but returning more information. We include
+ * all PGPROCs with an assigned TransactionId, even VACUUM processes. We
+ * also keep track of which subtransactions go with each PGPROC. All of this
+ * looks very similar to GetSnapshotData, but we have more procs and more info
+ * about each proc.
+ *
+ * The collected data is handed to LogCurrentRunningXacts() while all the
+ * locks are still held; nothing is returned to the caller.
+ *
+ * This is never executed during recovery so there is no need to look at
+ * KnownAssignedXids.
+ *
+ * We don't worry about updating other counters, we want to keep this as
+ * simple as possible and leave GetSnapshotData() as the primary code for
+ * that bookkeeping.
+ */
+void
+GetRunningTransactionData(void)
+{
+       ProcArrayStruct *arrayP = procArray;
+       static RunningTransactions CurrentRunningXacts = (RunningTransactions) &CurrentRunningXactsData;
+       RunningXact     *rxact;
+       TransactionId *subxip;
+       TransactionId latestRunningXid = InvalidTransactionId;
+       TransactionId latestCompletedXid;
+       TransactionId oldestRunningXid = InvalidTransactionId;
+       int                     index;
+       int                     count = 0;
+       int                     subcount = 0;
+       bool            suboverflowed = false;
+       int                     numHeldLocks = 0;
+       XLogRecPtr      recptr;
+
+       /*
+        * Allocating space for maxProcs xids is usually overkill; numProcs would
+        * be sufficient.  But it seems better to do the malloc while not holding
+        * the lock, so we can't look at numProcs.  Likewise, we allocate much
+        * more subxip storage than is probably needed.
+        *
+        * Should only be allocated for bgwriter, since only ever executed
+        * during checkpoints.
+        *
+        * The arrays are allocated once and reused by every later call, so
+        * each field we publish below must be written on every call.
+        */
+       if (CurrentRunningXacts->xrun == NULL)
+       {
+               /*
+                * First call
+                */
+               CurrentRunningXacts->xrun = (RunningXact *)
+                       malloc(arrayP->maxProcs * sizeof(RunningXact));
+               if (CurrentRunningXacts->xrun == NULL)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_OUT_OF_MEMORY),
+                                        errmsg("out of memory")));
+               Assert(CurrentRunningXacts->subxip == NULL);
+               CurrentRunningXacts->subxip = (TransactionId *)
+                       malloc((arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS) * sizeof(TransactionId));
+               if (CurrentRunningXacts->subxip == NULL)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_OUT_OF_MEMORY),
+                                        errmsg("out of memory")));
+               Assert(CurrentRunningXacts->loggableLocks == NULL);
+
+#define REASONABLE_MAX_NUM_LOCKS       (2 * arrayP->maxProcs)
+               CurrentRunningXacts->loggableLocks = (xl_rel_lock *)
+                       malloc(REASONABLE_MAX_NUM_LOCKS * sizeof(xl_rel_lock));
+               if (CurrentRunningXacts->loggableLocks == NULL)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_OUT_OF_MEMORY),
+                                        errmsg("out of memory")));
+       }
+
+       rxact = CurrentRunningXacts->xrun;
+       subxip = CurrentRunningXacts->subxip;
+
+       /*
+        * Ensure that no new WAL-loggable locks can be taken, nor
+        * can xids enter or leave the procarray while we obtain snapshot.
+        */
+       LWLockAcquire(RecoveryInfoLock, LW_EXCLUSIVE);
+       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+       LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+
+       latestCompletedXid = ShmemVariableCache->latestCompletedXid;
+
+       /*
+        * Scan the procArray collecting each xid and its subxids. All three
+        * locks above are held in exclusive mode for the whole scan.
+        */
+       for (index = 0; index < arrayP->numProcs; index++)
+       {
+               volatile PGPROC *proc = arrayP->procs[index];
+               TransactionId xid;
+               int                     nxids;
+               bool            overflowed;
+
+               /* Fetch xid just once - see GetNewTransactionId */
+               xid = proc->xid;
+
+               /*
+                * We store all xids, even XIDs >= xmax and our own XID, if any.
+                * But we don't store transactions that don't have a TransactionId
+                * yet because they will not show as running on a standby server.
+                */
+               if (!TransactionIdIsValid(xid))
+                       continue;
+
+               rxact[count].xid = xid;
+
+               if (TransactionIdPrecedes(latestRunningXid, xid))
+                       latestRunningXid = xid;
+
+               if (!TransactionIdIsValid(oldestRunningXid) ||
+                       TransactionIdPrecedes(xid, oldestRunningXid))
+                       oldestRunningXid = xid;
+
+               numHeldLocks += proc->numHeldLocks;
+
+               /*
+                * Record the overflow state unconditionally. The rxact array is
+                * reused between calls, so leaving the flag unwritten would let a
+                * value from an earlier call leak through; and a proc can be
+                * marked overflowed whatever number of subxids is currently
+                * cached, so the test must not depend on nxids.
+                */
+               overflowed = proc->subxids.overflowed;
+               rxact[count].overflowed = overflowed;
+               if (overflowed)
+                       suboverflowed = true;
+
+               /*
+                * Save subtransaction XIDs.
+                *
+                * It's important to fetch nxids just once. Should be safe to use
+                * memcpy, though.
+                *
+                * Again, our own XIDs *are* included in the snapshot.
+                */
+               nxids = proc->subxids.nxids;
+
+               if (nxids > 0)
+               {
+                       TransactionId *subxids = (TransactionId *) proc->subxids.xids;
+
+                       rxact[count].subx_offset = subcount;
+
+                       memcpy(subxip + subcount,
+                                  (void *) proc->subxids.xids,
+                                  nxids * sizeof(TransactionId));
+                       subcount += nxids;
+
+                       /* only the last cached subxid is compared here */
+                       if (TransactionIdPrecedes(latestRunningXid, subxids[nxids - 1]))
+                               latestRunningXid = subxids[nxids - 1];
+               }
+               else
+                       rxact[count].subx_offset = 0;
+
+               rxact[count].nsubxids = nxids;
+               count++;
+       }
+
+       /*
+        * When there are no transactions running, just use the value
+        * of the last completed transaction. No need to check
+        * ReadNewTransactionId().
+        */
+       if (count == 0)
+               latestRunningXid = latestCompletedXid;
+
+       CurrentRunningXacts->xcnt = count;
+       CurrentRunningXacts->subxcnt = subcount;
+       CurrentRunningXacts->subxid_overflow = suboverflowed;
+       CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
+       CurrentRunningXacts->latestRunningXid = latestRunningXid;
+       CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
+
+       /*
+        * Start from zero locks on every call; otherwise the count from a
+        * previous call would survive whenever we skip the lock scan below.
+        */
+       CurrentRunningXacts->numLocks = 0;
+
+       /*
+        * Mark snapshot invalid if the information is either incomplete
+        * or would be too large to reasonably cope with on standby.
+        * We log it anyway for diagnostic purposes.
+        */
+       if (numHeldLocks >= REASONABLE_MAX_NUM_LOCKS)
+               CurrentRunningXacts->lock_overflow = true;
+       else
+       {
+               CurrentRunningXacts->lock_overflow = false;
+
+               /*
+                * If we have some loggable locks then go and get their details.
+                *
+                * We try to optimise this because loggable locks are rare and
+                * this is a potentially very lengthy operation, since there
+                * may be many locks to record and the lock table is partitioned.
+                * We ensure that all loggable locks must first acquire
+                * RecoveryInfoLock. Lock requestors use LW_SHARED mode so there
+                * is no additional contention for them in normal running, except
+                * against this function, which requests LW_EXCLUSIVE. That allows
+                * us to accurately keep track of the number of loggable locks on
+                * each proc, so we can count them as we scan the procarray and,
+                * most importantly, skip accessing the global lock tables if we can.
+                */
+               if (numHeldLocks > 0)
+                       CurrentRunningXacts->numLocks = 
+                               GetRunningTransactionLocks(CurrentRunningXacts->loggableLocks);
+       }
+
+       recptr = LogCurrentRunningXacts(CurrentRunningXacts);
+
+       LWLockRelease(XidGenLock);
+       LWLockRelease(ProcArrayLock);
+       LWLockRelease(RecoveryInfoLock);
+
+       if (CurrentRunningXacts->subxid_overflow || 
+               CurrentRunningXacts->lock_overflow)
+               ereport(trace_recovery(DEBUG2), 
+                               (errmsg("snapshot of %u running transactions overflowed (lsn %X/%X)",
+                                               CurrentRunningXacts->xcnt,
+                                               recptr.xlogid, recptr.xrecoff)));
+       else
+               ereport(trace_recovery(DEBUG2), 
+                               (errmsg("snapshot of %u running transactions "
+                                       "with %u subtransactions and %u locks (lsn %X/%X)",
+                                               CurrentRunningXacts->xcnt,
+                                               CurrentRunningXacts->subxcnt,
+                                               CurrentRunningXacts->numLocks,
+                                               recptr.xlogid, recptr.xrecoff)));
+}
+
+/*
+ * ProcArrayIncrementNumHeldLocks -- count one more lock held by this process
+ *
+ * Caller must already hold RecoveryInfoLock.
+ */
+void
+ProcArrayIncrementNumHeldLocks(PGPROC *proc)
+{
+       /* 
+        * XXX should we care about integer overflow? Lock table on default
+        * settings has 6400 slots and practical limits are much less than INT_MAX.
+        */
+       proc->numHeldLocks += 1;
+}
+
+/*
+ * ProcArrayDecrementNumHeldLocks -- count one fewer lock held by this process
+ *
+ * Caller must already hold RecoveryInfoLock.
+ */
+void
+ProcArrayDecrementNumHeldLocks(PGPROC *proc)
+{
+       proc->numHeldLocks -= 1;
+}
+
  /*
   * GetTransactionsInCommit -- Get the XIDs of transactions that are committing
   *
@@ -969,22 +1536,16 @@ BackendPidGetProc(int pid)
  }
  
  /*
- * BackendXidGetPid -- get a backend's pid given its XID
- *
- * Returns 0 if not found or it's a prepared transaction.  Note that
- * it is up to the caller to be sure that the question remains
- * meaningful for long enough for the answer to be used ...
- *
- * Only main transaction Ids are considered.  This function is mainly
- * useful for determining what backend owns a lock.
+ * BackendXidGetProc -- get a backend's PGPROC given its XID
   *
- * Beware that not every xact has an XID assigned.     However, as long as you
- * only call this using an XID found on disk, you're safe.
+ * Returns NULL if not found.  Note that it is up to the caller to be
+ * sure that the question remains meaningful for long enough for the
+ * answer to be used ...
   */
-int
-BackendXidGetPid(TransactionId xid)
+PGPROC *
+BackendXidGetProc(TransactionId xid)
  {
-       int                     result = 0;
+       PGPROC     *result = NULL;
         ProcArrayStruct *arrayP = procArray;
         int                     index;
  
@@ -995,11 +1556,11 @@ BackendXidGetPid(TransactionId xid)
  
         for (index = 0; index < arrayP->numProcs; index++)
         {
-               volatile PGPROC *proc = arrayP->procs[index];
+               PGPROC     *proc = arrayP->procs[index];
  
                 if (proc->xid == xid)
                 {
-                       result = proc->pid;
+                       result = proc;
                         break;
                 }
         }
@@ -1010,12 +1571,53 @@ BackendXidGetPid(TransactionId xid)
  }
  
  /*
- * IsBackendPid -- is a given pid a running backend
- */
-bool
-IsBackendPid(int pid)
-{
-       return (BackendPidGetProc(pid) != NULL);
+ * BackendXidGetPid -- get a backend's pid given its XID
+ *
+ * Returns 0 if not found or it's a prepared transaction.  Note that
+ * it is up to the caller to be sure that the question remains
+ * meaningful for long enough for the answer to be used ...
+ *
+ * Only main transaction Ids are considered.  This function is mainly
+ * useful for determining what backend owns a lock.
+ *
+ * Beware that not every xact has an XID assigned.     However, as long as you
+ * only call this using an XID found on disk, you're safe.
+ */
+int
+BackendXidGetPid(TransactionId xid)
+{
+       ProcArrayStruct *allProcs = procArray;
+       int                     pid = 0;
+       int                     i;
+
+       /* InvalidTransactionId never matches, so don't bother scanning */
+       if (xid == InvalidTransactionId)
+               return pid;
+
+       /* Scan the proc array under shared lock; stop at the first match */
+       LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+       for (i = 0; i < allProcs->numProcs; i++)
+       {
+               volatile PGPROC *candidate = allProcs->procs[i];
+
+               if (candidate->xid != xid)
+                       continue;
+
+               pid = candidate->pid;
+               break;
+       }
+
+       LWLockRelease(ProcArrayLock);
+
+       return pid;
+}
+
+/*
+ * IsBackendPid -- report whether BackendPidGetProc() finds an entry
+ * for the given pid
+ */
+bool
+IsBackendPid(int pid)
+{
+       if (BackendPidGetProc(pid) == NULL)
+               return false;
+
+       return true;
  }
  
  
@@ -1101,6 +1703,121 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0,
         return vxids;
  }
  
+/*
+ * GetConflictingVirtualXIDs -- returns an array of currently active VXIDs.
+ *
+ * The array is malloc'd on the first call, kept in a static pointer and
+ * reused by every later call, so each call overwrites the previous result.
+ * It is terminated with an invalid VXID.
+ *
+ * If limitXmin is not InvalidTransactionId, we skip any backends
+ * with xmin >= limitXmin, and also backends whose xmin is invalid.
+ * If dbOid is valid we skip backends attached to other databases.
+ * If skipExistingConflicts is true we skip backends whose
+ * recoveryConflictMode is already greater than zero.
+ *
+ * Prepared transactions (pid == 0) and backends without a valid VXID
+ * are never returned.
+ *
+ * Be careful to *not* pfree the result from this function. We reuse
+ * this array sufficiently often that we use malloc for the result.
+ */
+VirtualTransactionId *
+GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid, 
+                                                       bool skipExistingConflicts)
+{
+       static VirtualTransactionId *vxids;
+       ProcArrayStruct *arrayP = procArray;
+       int                     count = 0;
+       int                     index;
+
+       /*
+        * If first time through, get workspace to remember VXIDs in. We
+        * malloc it permanently to avoid repeated palloc/pfree overhead.
+        * Allow result space, remembering room for a terminator.
+        */
+       if (vxids == NULL)
+       {
+               vxids = (VirtualTransactionId *)
+                       malloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1));
+               if (vxids == NULL)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_OUT_OF_MEMORY),
+                                        errmsg("out of memory")));
+       }
+
+       /*
+        * Exclusive (not shared) lock: the xmin test below relies on no
+        * backend being able to take a new snapshot while we scan.
+        */
+       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+       for (index = 0; index < arrayP->numProcs; index++)
+       {
+               volatile PGPROC *proc = arrayP->procs[index];
+
+               /* Exclude prepared transactions */
+               if (proc->pid == 0)
+                       continue;
+
+               /* Optionally ignore backends already marked with a conflict */
+               if (skipExistingConflicts && proc->recoveryConflictMode > 0)
+                       continue;
+
+               if (!OidIsValid(dbOid) ||
+                       proc->databaseId == dbOid)
+               {
+                       /* Fetch xmin just once - can't change on us, but good coding */
+                       TransactionId pxmin = proc->xmin;
+
+                       /*
+                        * If limitXmin is set we explicitly choose to ignore an invalid
+                        * pxmin because this means that backend has no snapshot and
+                        * cannot get another one while we hold exclusive lock.
+                        */
+                       if (!TransactionIdIsValid(limitXmin) ||
+                               (TransactionIdPrecedes(pxmin, limitXmin) && TransactionIdIsValid(pxmin)))
+                       {
+                               VirtualTransactionId vxid;
+
+                               GET_VXID_FROM_PGPROC(vxid, *proc);
+                               if (VirtualTransactionIdIsValid(vxid))
+                                       vxids[count++] = vxid;
+                       }
+               }
+       }
+
+       LWLockRelease(ProcArrayLock);
+
+       /* add the terminator */
+       vxids[count].backendId = InvalidBackendId;
+       vxids[count].localTransactionId = InvalidLocalTransactionId;
+
+       return vxids;
+}
+
+/*
+ * VirtualTransactionIdGetProc -- find the PGPROC that owns a given VXID
+ *
+ * Used in recovery conflict processing.  Scans the proc array under shared
+ * ProcArrayLock and returns the first entry whose backendId and
+ * localTransactionId both equal those of vxid.  Returns NULL if vxid is
+ * invalid or no entry matches.
+ */
+PGPROC *
+VirtualTransactionIdGetProc(VirtualTransactionId vxid)
+{
+       ProcArrayStruct *allProcs = procArray;
+       PGPROC     *owner = NULL;
+       int                     i;
+
+       if (!VirtualTransactionIdIsValid(vxid))
+               return owner;
+
+       LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+       for (i = 0; i < allProcs->numProcs; i++)
+       {
+               PGPROC     *candidate = allProcs->procs[i];
+               VirtualTransactionId candidateVxid;
+
+               GET_VXID_FROM_PGPROC(candidateVxid, *candidate);
+
+               /* Both halves of the VXID must agree */
+               if (candidateVxid.backendId != vxid.backendId)
+                       continue;
+               if (candidateVxid.localTransactionId != vxid.localTransactionId)
+                       continue;
+
+               owner = candidate;
+               break;
+       }
+
+       LWLockRelease(ProcArrayLock);
+
+       return owner;
+}
  
  /*
   * CountActiveBackends --- count backends (other than myself) that are in
@@ -1240,6 +1957,9 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
         int                     autovac_pids[MAXAUTOVACPIDS];
         int                     tries;
  
+       /* Gives wrong answer in recovery, so make sure we don't use it */
+       Assert(!RecoveryInProgress());
+
         /* 50 tries with 100ms sleep between tries makes 5 sec total wait */
         for (tries = 0; tries < 50; tries++)
         {
@@ -1400,3 +2120,459 @@ DisplayXidCache(void)
  }
  
  #endif   /* XIDCACHE_DEBUG */
+
+/* ----------------------------------------------
+ *             KnownAssignedTransactions sub-module
+ * ----------------------------------------------
+ */
+
+/*
+ * During recovery we maintain ProcArray according to the xid field
+ * on incoming WAL records. Returns true if we should perform conflict
+ * processing on the WAL record.
+ *
+ * RecordKnownAssignedTransactionIds() should be run for *every* WAL record
+ * type apart from XLOG_XACT_RUNNING_XACTS, since that initialises the
+ * first snapshot so that RecordKnownAssignedTransactionIds() can be
+ * called. We don't currently check that rmgrs have called us.
+ * Uses local variables, so should only be called by Startup process.
+ *
+ * We record all xids that we know have been assigned. That includes
+ * all the xids on the WAL record, plus all unobserved xids that
+ * we can deduce have been assigned. We can deduce the existence of
+ * unobserved xids because we know xids are in sequence, with no gaps.
+ * 
+ * During recovery we do not fret too much about the distinction between
+ * top-level xids and subtransaction xids. We hold both together in
+ * a sorted data structure called KnownAssignedXids. This is copied
+ * directly by other backends during GetSnapshotData(), taking advantage
+ * of the fact that XidInMVCCSnapshot() doesn't care about the distinction
+ * either. Subtransaction xids are effectively treated as top-level xids
+ * and in the typical case pg_subtrans is *not* maintained (and that
+ * does not affect visibility).
+ *
+ * KnownAssignedXids expands as new xids are observed or inferred, and
+ * contracts when transaction completion records arrive. We have room in a
+ * snapshot to hold maxProcs * (1 + PGPROC_MAX_CACHED_SUBXIDS) xids, so
+ * every transaction must report their subtransaction xids in a special
+ * WAL assignment record every PGPROC_MAX_CACHED_SUBXIDS. This allows us
+ * to remove the subtransaction xids and update pg_subtrans instead. Snapshots
+ * are still correct yet we don't overflow SnapshotData structure. It is
+ * important that the XLOG_XACT_ASSIGNMENT record contain *all* subxids
+ * not just those so far unreported because the sole purpose is to ensure
+ * we can remove the xids from KnownAssignedXids. When we do this we need
+ * to keep track of which xids caused the snapshot to overflow. We do that
+ * by simply tracking the lastOverflowedXid - if it is within the bounds of
+ * the KnownAssignedXids then we know the snapshot overflowed. (Note that
+ * subxid overflow occurs on primary when 65th subxid arrives, whereas on
+ * standby it occurs when 64th subxid arrives - that is not an error).
+ *
+ * Should FATAL errors result in a backend on primary disappearing before
+ * it can write an abort record then we just leave those xids in
+ * KnownAssignedXids. They actually aborted but we think they were running;
+ * the distinction is irrelevant because we cannot see data for those xids.
+ * We prune KnownAssignedXids when XLOG_XACT_RUNNING_XACTS arrives, to
+ * ensure we do not overflow.
+ *
+ * If we are in state initRunningXactData, but not yet recoverySnapshotValid,
+ * then we may try to remove xids that are not present.
+ */
+void
+RecordKnownAssignedTransactionIds(TransactionId xid)
+{
+       /*
+        * Skip processing if the current snapshot is not initialized.
+        */
+       if (!initRunningXactData)
+               return;
+
+       /*
+        * Trace every call. This runs before the validity test below, so
+        * calls made with InvalidTransactionId are traced as well.
+        */
+       ereport(trace_recovery(DEBUG4),
+                               (errmsg("record known xact %u latestObservedXid %u",
+                                                       xid, latestObservedXid)));
+
+       /*
+        * VACUUM records are always sent with InvalidTransactionId
+        */
+       if (!TransactionIdIsValid(xid))
+               return;
+
+       /*
+        * Check for risk of transaction wraparound. As new xids arrive
+        * on the standby it is eventually possible for a long lived query
+        * to find that the snapshot xmin is older than one xid epoch, which
+        * would make newly arrived data suddenly appear as if it were very old.
+        * We must cancel queries before we wraparound. We do conflict
+        * processing here to cover that possibility, though note that
+        * we may also perform conflict processing again for a different
+        * reason specific to the type of WAL record, covered in the rmgrs.
+        */
+       if (TransactionIdFollowsOrEquals(xid, nextWraparoundCheckXid))
+       {
+               TransactionId xidWrapLimit;
+               TransactionId xidStopLimit;
+               VirtualTransactionId *old_queries;
+
+               /*
+                * Only need to check occasionally. nextWraparoundCheckXid is
+                * initialised on first use, since it starts at InvalidTransactionId.
+                * The next check is scheduled 65536 xids ahead; if that addition
+                * lands below FirstNormalTransactionId the value is bumped upwards
+                * by FirstNormalTransactionId.
+                */
+               nextWraparoundCheckXid = xid + 65536;
+               if (nextWraparoundCheckXid < FirstNormalTransactionId)
+                       nextWraparoundCheckXid += FirstNormalTransactionId;
+
+               /*
+                * The place where we actually get into deep trouble is halfway around
+                * from the oldest xmin.  (This calculation is
+                * probably off by one or two counts, because the special XIDs reduce the
+                * size of the loop a little bit.  But we throw in plenty of slop below,
+                * so it doesn't matter.)
+                */
+               xidWrapLimit = xid + (MaxTransactionId >> 1);
+               if (xidWrapLimit < FirstNormalTransactionId)
+                       xidWrapLimit += FirstNormalTransactionId;
+
+               /*
+                * We'll refuse to allow queries to execute once we get
+                * within 1M transactions of data loss.  This leaves lots of room for the
+                * DBA to fool around fixing things in a standalone backend, while not
+                * being significant compared to total XID space. (Note that since
+                * vacuuming requires one transaction per table cleaned, we had better be
+                * sure there's lots of XIDs left...)
+                */
+               xidStopLimit = xidWrapLimit - 1000000;
+               if (xidStopLimit < FirstNormalTransactionId)
+                       xidStopLimit -= FirstNormalTransactionId;
+
+               /*
+                * NOTE(review): xidWrapLimit and xidStopLimit are computed above but
+                * never read in this function; the conflict lookup below passes xid
+                * itself as limitXmin. Confirm whether xidStopLimit (or a value
+                * derived from it) was meant to be the limit here.
+                */
+               old_queries = GetConflictingVirtualXIDs(xid, InvalidOid, false);
+               ResolveRecoveryConflictWithVirtualXIDs(old_queries,
+                                                                                               "xid anti-wraparound check",
+                                                                                               CONFLICT_MODE_ERROR,
+                                                                                               InvalidXLogRecPtr);
+       }
+
+       /*
+        * When a newly observed xid arrives, it is frequently the case
+        * that it is *not* the next xid in sequence. When this occurs, we
+        * must treat the intervening xids as running also.
+        */
+       if (TransactionIdFollows(xid, latestObservedXid))
+       {
+               /* Start from the xid immediately after the last one observed */
+               TransactionId   next_expected_xid = latestObservedXid;
+               TransactionIdAdvance(next_expected_xid);
+
+               /*
+                * Locking requirement is currently higher than for xid assignment
+                * in normal running. However, we only get called here for new
+                * high xids - so on a multi-processor where it is common that xids
+                * arrive out of order the average number of locks per assignment
+                * will actually reduce. So not too worried about this locking.
+                *
+                * XXX It does seem possible that we could add a whole range
+                * of numbers atomically to KnownAssignedXids, if we use a sorted
+                * list for KnownAssignedXids. But that design also increases the
+                * length of time we hold lock when we process commits/aborts, so
+                * on balance don't worry about this.
+                */
+               LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+               /*
+                * Add every xid from the one after latestObservedXid up to and
+                * including xid. Only the xids strictly before xid are traced as
+                * "unobserved"; xid itself is the one carried by the record.
+                */
+               while (TransactionIdPrecedesOrEquals(next_expected_xid, xid))
+               {
+                       if (TransactionIdPrecedes(next_expected_xid, xid))
+                               ereport(trace_recovery(DEBUG4),
+                                               (errmsg("recording unobserved xid %u (latestObservedXid %u)",
+                                                                       next_expected_xid, latestObservedXid)));
+                       /* presumably (0, NULL) means "no subxids" - TODO confirm */
+                       KnownAssignedXidsAdd(next_expected_xid, 0, NULL);
+                       TransactionIdAdvance(next_expected_xid);
+               }
+
+               LWLockRelease(ProcArrayLock);
+
+               /* Updated after the lock has been released */
+               latestObservedXid = xid;
+       }
+}
+
+void
+ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids,
+                                                                TransactionId *subxids, TransactionId max_xid, bool overflow)
+{
+       int                     i;
+
+       if (!initRunningXactData)
+               return;
+
+       /*
+        * Uses same locking as transaction commit
+        */
+       LWLockAcquire(ProcArrayLock,&
author	Simon Riggs <[email protected]>
	Fri, 25 Sep 2009 11:30:39 +0000 (12:30 +0100)
committer	Simon Riggs <[email protected]>
	Fri, 25 Sep 2009 11:30:39 +0000 (12:30 +0100)
doc/src/sgml/backup.sgml		patch \| blob \| blame \| history
doc/src/sgml/config.sgml		patch \| blob \| blame \| history
doc/src/sgml/func.sgml		patch \| blob \| blame \| history
doc/src/sgml/ref/checkpoint.sgml		patch \| blob \| blame \| history
src/backend/access/gin/ginxlog.c		patch \| blob \| blame \| history
src/backend/access/gist/gistxlog.c		patch \| blob \| blame \| history
src/backend/access/hash/hash.c		patch \| blob \| blame \| history
src/backend/access/heap/heapam.c		patch \| blob \| blame \| history
src/backend/access/heap/pruneheap.c		patch \| blob \| blame \| history
src/backend/access/index/genam.c		patch \| blob \| blame \| history
src/backend/access/index/indexam.c		patch \| blob \| blame \| history
src/backend/access/nbtree/README		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtinsert.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtpage.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtree.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtxlog.c		patch \| blob \| blame \| history
src/backend/access/transam/README		patch \| blob \| blame \| history
src/backend/access/transam/clog.c		patch \| blob \| blame \| history
src/backend/access/transam/multixact.c		patch \| blob \| blame \| history
src/backend/access/transam/rmgr.c		patch \| blob \| blame \| history
src/backend/access/transam/slru.c		patch \| blob \| blame \| history
src/backend/access/transam/subtrans.c		patch \| blob \| blame \| history
src/backend/access/transam/transam.c		patch \| blob \| blame \| history
src/backend/access/transam/twophase.c		patch \| blob \| blame \| history
src/backend/access/transam/twophase_rmgr.c		patch \| blob \| blame \| history
src/backend/access/transam/xact.c		patch \| blob \| blame \| history
src/backend/access/transam/xlog.c		patch \| blob \| blame \| history
src/backend/catalog/storage.c		patch \| blob \| blame \| history
src/backend/commands/dbcommands.c		patch \| blob \| blame \| history
src/backend/commands/discard.c		patch \| blob \| blame \| history
src/backend/commands/lockcmds.c		patch \| blob \| blame \| history
src/backend/commands/sequence.c		patch \| blob \| blame \| history
src/backend/commands/tablespace.c		patch \| blob \| blame \| history
src/backend/commands/vacuum.c		patch \| blob \| blame \| history
src/backend/commands/vacuumlazy.c		patch \| blob \| blame \| history
src/backend/postmaster/postmaster.c		patch \| blob \| blame \| history
src/backend/storage/buffer/bufmgr.c		patch \| blob \| blame \| history
src/backend/storage/ipc/procarray.c		patch \| blob \| blame \| history
src/backend/storage/ipc/sinvaladt.c		patch \| blob \| blame \| history
src/backend/storage/lmgr/lock.c		patch \| blob \| blame \| history
src/backend/storage/lmgr/proc.c		patch \| blob \| blame \| history
src/backend/tcop/postgres.c		patch \| blob \| blame \| history
src/backend/tcop/utility.c		patch \| blob \| blame \| history
src/backend/utils/adt/selfuncs.c		patch \| blob \| blame \| history
src/backend/utils/adt/txid.c		patch \| blob \| blame \| history
src/backend/utils/cache/inval.c		patch \| blob \| blame \| history
src/backend/utils/error/elog.c		patch \| blob \| blame \| history
src/backend/utils/init/postinit.c		patch \| blob \| blame \| history
src/backend/utils/misc/guc.c		patch \| blob \| blame \| history
src/backend/utils/time/tqual.c		patch \| blob \| blame \| history
src/include/access/heapam.h		patch \| blob \| blame \| history
src/include/access/htup.h		patch \| blob \| blame \| history
src/include/access/nbtree.h		patch \| blob \| blame \| history
src/include/access/relscan.h		patch \| blob \| blame \| history
src/include/access/rmgr.h		patch \| blob \| blame \| history
src/include/access/subtrans.h		patch \| blob \| blame \| history
src/include/access/transam.h		patch \| blob \| blame \| history
src/include/access/twophase.h		patch \| blob \| blame \| history
src/include/access/twophase_rmgr.h		patch \| blob \| blame \| history
src/include/access/xact.h		patch \| blob \| blame \| history
src/include/access/xlog.h		patch \| blob \| blame \| history
src/include/access/xlog_internal.h		patch \| blob \| blame \| history
src/include/access/xlogutils.h		patch \| blob \| blame \| history
src/include/catalog/pg_control.h		patch \| blob \| blame \| history
src/include/catalog/pg_proc.h		patch \| blob \| blame \| history
src/include/miscadmin.h		patch \| blob \| blame \| history
src/include/storage/bufmgr.h		patch \| blob \| blame \| history
src/include/storage/lock.h		patch \| blob \| blame \| history
src/include/storage/lwlock.h		patch \| blob \| blame \| history
src/include/storage/proc.h		patch \| blob \| blame \| history
src/include/storage/procarray.h		patch \| blob \| blame \| history
src/include/storage/sinval.h		patch \| blob \| blame \| history
src/include/storage/sinvaladt.h		patch \| blob \| blame \| history
src/include/utils/inval.h		patch \| blob \| blame \| history
src/include/utils/selfuncs.h		patch \| blob \| blame \| history
src/include/utils/snapshot.h		patch \| blob \| blame \| history