Introduce WAL records to log reuse of btree pages, allowing conflict
author Simon Riggs <[email protected]>
Sat, 13 Feb 2010 00:59:58 +0000 (00:59 +0000)
committer Simon Riggs <[email protected]>
Sat, 13 Feb 2010 00:59:58 +0000 (00:59 +0000)
resolution during Hot Standby. Page reuse interlock requested by Tom.
Analysis and patch by me.

src/backend/access/nbtree/nbtpage.c
src/backend/access/nbtree/nbtxlog.c
src/include/access/nbtree.h

index 89c15c5557596a7f6a8d00b248ea47c47a0185ed..7493c733b72b1626fb92a917808574df540163ad 100644 (file)
@@ -446,6 +446,48 @@ _bt_checkpage(Relation rel, Buffer buf)
                 errhint("Please REINDEX it.")));
 }
 
+/*
+ * Log the reuse of a page from the FSM.
+ */
+static void
+_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
+{
+   if (rel->rd_istemp)
+       return;
+
+   /* No ereport(ERROR) until changes are logged */
+   START_CRIT_SECTION();
+
+   /*
+    * We don't do MarkBufferDirty here because we're about to
+    * initialise the page, and nobody else can see it yet.
+    */
+
+   /* XLOG stuff */
+   {
+       XLogRecPtr  recptr;
+       XLogRecData rdata[1];
+       xl_btree_reuse_page xlrec_reuse;
+
+       xlrec_reuse.node = rel->rd_node;
+       xlrec_reuse.block = blkno;
+       xlrec_reuse.latestRemovedXid = latestRemovedXid;
+       rdata[0].data = (char *) &xlrec_reuse;
+       rdata[0].len = SizeOfBtreeReusePage;
+       rdata[0].buffer = InvalidBuffer;
+       rdata[0].next = NULL;
+
+       recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE, rdata);
+
+       /*
+        * We don't do PageSetLSN or PageSetTLI here because
+        * we're about to initialise the page, so no need.
+        */
+   }
+
+   END_CRIT_SECTION();
+}
+
 /*
  * _bt_getbuf() -- Get a buffer by block number for read or write.
  *
@@ -510,7 +552,19 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
            {
                page = BufferGetPage(buf);
                if (_bt_page_recyclable(page))
-               {
+               {
+                   /*
+                    * If we are generating WAL for Hot Standby then create
+                    * a WAL record that will allow us to conflict with
+                    * queries running on standby.
+                    */
+                   if (XLogStandbyInfoActive())
+                   {
+                       BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+                       _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
+                   }
+
                    /* Okay to use page.  Re-initialize and return it */
                    _bt_pageinit(page, BufferGetPageSize(buf));
                    return buf;
index 98158fcd68390ff3615c3b507dbd3e4d98e21551..1bc9a8364a4879192278b50f13ee8454184be511 100644 (file)
@@ -814,26 +814,48 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
 {
    uint8       info = record->xl_info & ~XLR_INFO_MASK;
 
-   /*
-    * Btree delete records can conflict with standby queries. You might
-    * think that vacuum records would conflict as well, but we've handled
-    * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
-    * cleaned by the vacuum of the heap and so we can resolve any conflicts
-    * just once when that arrives. After that any we know that no conflicts
-    * exist from individual btree vacuum records on that index.
-    */
-   if (InHotStandby && info == XLOG_BTREE_DELETE)
+   if (InHotStandby)
    {
-       xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
+       switch (info)
+       {
+           case XLOG_BTREE_DELETE:
+               /*
+                * Btree delete records can conflict with standby queries. You might
+                * think that vacuum records would conflict as well, but we've handled
+                * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
+                * cleaned by the vacuum of the heap and so we can resolve any conflicts
+                * just once when that arrives. After that we know that no conflicts
+                * exist from individual btree vacuum records on that index.
+                */
+               {
+                   xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
 
-       /*
-        * XXX Currently we put everybody on death row, because
-        * currently _bt_delitems() supplies InvalidTransactionId.
-        * This can be fairly painful, so providing a better value
-        * here is worth some thought and possibly some effort to
-        * improve.
-        */
-       ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
+                   /*
+                    * XXX Currently we put everybody on death row, because
+                    * currently _bt_delitems() supplies InvalidTransactionId.
+                    * This can be fairly painful, so providing a better value
+                    * here is worth some thought and possibly some effort to
+                    * improve.
+                    */
+                   ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
+               }
+               break;
+
+           case XLOG_BTREE_REUSE_PAGE:
+               /*
+                * Btree reuse page records exist to provide a conflict point when we
+                * reuse pages in the index via the FSM. That's all it does though.
+                */
+               {
+                   xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);
+
+                   ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
+               }
+               return;
+
+           default:
+               break;
+       }
    }
 
    /*
index 0583c912b35d434205afdd91dde070f062818ec8..d46046d4685f719bc8f63be163011310466371d2 100644 (file)
@@ -221,6 +221,7 @@ typedef struct BTMetaPageData
 #define XLOG_BTREE_DELETE_PAGE_HALF 0xB0       /* page deletion that makes
                                                 * parent half-dead */
 #define XLOG_BTREE_VACUUM      0xC0    /* delete entries on a page during vacuum */
+#define XLOG_BTREE_REUSE_PAGE  0xD0    /* old page is about to be reused from FSM */
 
 /*
  * All that we need to find changed index tuple
@@ -321,6 +322,18 @@ typedef struct xl_btree_delete
 
 #define SizeOfBtreeDelete  (offsetof(xl_btree_delete, latestRemovedXid) + sizeof(TransactionId))
 
+/*
+ * This is what we need to know about page reuse within btree.
+ */
+typedef struct xl_btree_reuse_page
+{
+   RelFileNode node;
+   BlockNumber block;
+   TransactionId   latestRemovedXid;
+} xl_btree_reuse_page;
+
+#define SizeOfBtreeReusePage   (sizeof(xl_btree_reuse_page))
+
 /*
  * This is what we need to know about vacuum of individual leaf index tuples.
  * The WAL record can represent deletion of any number of index tuples on a