Back-patch fixes for problems with VACUUM destroying t_ctid chains too soon,
authorTom Lane <[email protected]>
Fri, 26 Aug 2005 20:07:17 +0000 (20:07 +0000)
committerTom Lane <[email protected]>
Fri, 26 Aug 2005 20:07:17 +0000 (20:07 +0000)
and with insufficient paranoia in code that follows t_ctid links.
This patch covers the 7.3 branch.

src/backend/access/heap/heapam.c
src/backend/commands/trigger.c
src/backend/commands/vacuum.c
src/backend/executor/execMain.c
src/backend/utils/time/tqual.c
src/include/access/heapam.h
src/include/executor/executor.h

index b4b20c679b3202d1cd5eac204e97242c651f639a..f12a76f49774a75c12d42a556efb8eee91793f8a 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/access/heap/heapam.c,v 1.149.2.1 2004/10/13 22:22:21 tgl Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/access/heap/heapam.c,v 1.149.2.2 2005/08/26 20:07:15 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -1014,89 +1014,136 @@ heap_fetch(Relation relation,
 
 /*
  * heap_get_latest_tid -  get the latest tid of a specified tuple
+ *
+ * Actually, this gets the latest version that is visible according to
+ * the passed snapshot.  You can pass SnapshotDirty to get the very latest,
+ * possibly uncommitted version.
+ *
+ * *tid is both an input and an output parameter: it is updated to
+ * show the latest version of the row.  Note that it will not be changed
+ * if no version of the row passes the snapshot test.
  */
-ItemPointer
+void
 heap_get_latest_tid(Relation relation,
                    Snapshot snapshot,
                    ItemPointer tid)
 {
-   ItemId      lp = NULL;
-   Buffer      buffer;
-   PageHeader  dp;
-   OffsetNumber offnum;
-   HeapTupleData tp;
-   HeapTupleHeader t_data;
+   BlockNumber blk;
    ItemPointerData ctid;
-   bool        invalidBlock,
-               linkend,
-               valid;
+   TransactionId priorXmax;
+
+   /* this is to avoid Assert failures on bad input */
+   if (!ItemPointerIsValid(tid))
+       return;
 
    /*
-    * get the buffer from the relation descriptor Note that this does a
-    * buffer pin.
+    * Since this can be called with user-supplied TID, don't trust the
+    * input too much.  (RelationGetNumberOfBlocks is an expensive check,
+    * so we don't check t_ctid links again this way.  Note that it would
+    * not do to call it just once and save the result, either.)
     */
-
-   buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
-
-   if (!BufferIsValid(buffer))
-       elog(ERROR, "heap_get_latest_tid: %s relation: ReadBuffer(%lx) failed",
-            RelationGetRelationName(relation), (long) tid);
-
-   LockBuffer(buffer, BUFFER_LOCK_SHARE);
+   blk = ItemPointerGetBlockNumber(tid);
+   if (blk >= RelationGetNumberOfBlocks(relation))
+       elog(ERROR, "block number %u is out of range for relation \"%s\"",
+            blk, RelationGetRelationName(relation));
 
    /*
-    * get the item line pointer corresponding to the requested tid
+    * Loop to chase down t_ctid links.  At top of loop, ctid is the
+    * tuple we need to examine, and *tid is the TID we will return if
+    * ctid turns out to be bogus.
+    *
+    * Note that we will loop until we reach the end of the t_ctid chain.
+    * Depending on the snapshot passed, there might be at most one visible
+    * version of the row, but we don't try to optimize for that.
     */
-   dp = (PageHeader) BufferGetPage(buffer);
-   offnum = ItemPointerGetOffsetNumber(tid);
-   invalidBlock = true;
-   if (!PageIsNew(dp))
-   {
-       lp = PageGetItemId(dp, offnum);
-       if (ItemIdIsUsed(lp))
-           invalidBlock = false;
-   }
-   if (invalidBlock)
+   ctid = *tid;
+   priorXmax = InvalidTransactionId;   /* cannot check first XMIN */
+   for (;;)
    {
-       LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
-       ReleaseBuffer(buffer);
-       return NULL;
-   }
+       Buffer      buffer;
+       PageHeader  dp;
+       OffsetNumber offnum;
+       ItemId      lp;
+       HeapTupleData tp;
+       bool        valid;
 
-   /*
-    * more sanity checks
-    */
+       /*
+        * Read, pin, and lock the page.
+        */
+       buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
 
-   tp.t_datamcxt = NULL;
-   t_data = tp.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
-   tp.t_len = ItemIdGetLength(lp);
-   tp.t_self = *tid;
-   ctid = tp.t_data->t_ctid;
+       if (!BufferIsValid(buffer))
+           elog(ERROR, "ReadBuffer(\"%s\", %lu) failed",
+                RelationGetRelationName(relation),
+                (unsigned long) ItemPointerGetBlockNumber(&ctid));
 
-   /*
-    * check time qualification of tid
-    */
+       LockBuffer(buffer, BUFFER_LOCK_SHARE);
+       dp = (PageHeader) BufferGetPage(buffer);
 
-   HeapTupleSatisfies(&tp, relation, buffer, dp,
-                      snapshot, 0, (ScanKey) NULL, valid);
+       /*
+        * Check for bogus item number.  This is not treated as an error
+        * condition because it can happen while following a t_ctid link.
+        * We just assume that the prior tid is OK and return it unchanged.
+        */
+       offnum = ItemPointerGetOffsetNumber(&ctid);
+       if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
+       {
+           LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+           ReleaseBuffer(buffer);
+           break;
+       }
+       lp = PageGetItemId(dp, offnum);
+       if (!ItemIdIsUsed(lp))
+       {
+           LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+           ReleaseBuffer(buffer);
+           break;
+       }
 
-   linkend = true;
-   if ((t_data->t_infomask & HEAP_XMIN_COMMITTED) != 0 &&
-       !ItemPointerEquals(tid, &ctid))
-       linkend = false;
+       /* OK to access the tuple */
+       tp.t_self = ctid;
+       tp.t_datamcxt = NULL;
+       tp.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
+       tp.t_len = ItemIdGetLength(lp);
 
-   LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
-   ReleaseBuffer(buffer);
+       /*
+        * After following a t_ctid link, we might arrive at an unrelated
+        * tuple.  Check for XMIN match.
+        */
+       if (TransactionIdIsValid(priorXmax) &&
+           !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
+       {
+           LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+           ReleaseBuffer(buffer);
+           break;
+       }
 
-   if (!valid)
-   {
-       if (linkend)
-           return NULL;
-       heap_get_latest_tid(relation, snapshot, &ctid);
-       *tid = ctid;
-   }
+       /*
+        * Check time qualification of tuple; if visible, set it as the new
+        * result candidate.
+        */
+       HeapTupleSatisfies(&tp, relation, buffer, dp,
+                          snapshot, 0, NULL, valid);
+       if (valid)
+           *tid = ctid;
+
+       /*
+        * If there's a valid t_ctid link, follow it, else we're done.
+        */
+       if ((tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
+                                     HEAP_MARKED_FOR_UPDATE)) ||
+           ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
+       {
+           LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+           ReleaseBuffer(buffer);
+           break;
+       }
 
-   return tid;
+       ctid = tp.t_data->t_ctid;
+       priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
+       LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+       ReleaseBuffer(buffer);
+   }               /* end of loop */
 }
 
 /*
@@ -1264,7 +1311,8 @@ simple_heap_insert(Relation relation, HeapTuple tup)
  */
 int
 heap_delete(Relation relation, ItemPointer tid,
-           ItemPointer ctid, CommandId cid)
+           ItemPointer ctid, TransactionId *update_xmax,
+           CommandId cid)
 {
    ItemId      lp;
    HeapTupleData tp;
@@ -1288,11 +1336,11 @@ heap_delete(Relation relation, ItemPointer tid,
 
    dp = (PageHeader) BufferGetPage(buffer);
    lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid));
+
    tp.t_datamcxt = NULL;
-   tp.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
+   tp.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
    tp.t_len = ItemIdGetLength(lp);
    tp.t_self = *tid;
-   tp.t_tableOid = relation->rd_id;
 
 l1:
    sv_infomask = tp.t_data->t_infomask;
@@ -1339,7 +1387,9 @@ l1:
    if (result != HeapTupleMayBeUpdated)
    {
        Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated);
+       Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
        *ctid = tp.t_data->t_ctid;
+       *update_xmax = HeapTupleHeaderGetXmax(tp.t_data);
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        ReleaseBuffer(buffer);
        return result;
@@ -1429,10 +1479,13 @@ l1:
 void
 simple_heap_delete(Relation relation, ItemPointer tid)
 {
-   ItemPointerData ctid;
    int         result;
+   ItemPointerData update_ctid;
+   TransactionId update_xmax;
 
-   result = heap_delete(relation, tid, &ctid, GetCurrentCommandId());
+   result = heap_delete(relation, tid,
+                        &update_ctid, &update_xmax,
+                        GetCurrentCommandId());
    switch (result)
    {
        case HeapTupleSelfUpdated:
@@ -1462,7 +1515,8 @@ simple_heap_delete(Relation relation, ItemPointer tid)
  */
 int
 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
-           ItemPointer ctid, CommandId cid)
+           ItemPointer ctid, TransactionId *update_xmax,
+           CommandId cid)
 {
    ItemId      lp;
    HeapTupleData oldtup;
@@ -1547,7 +1601,9 @@ l2:
    if (result != HeapTupleMayBeUpdated)
    {
        Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated);
+       Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
        *ctid = oldtup.t_data->t_ctid;
+       *update_xmax = HeapTupleHeaderGetXmax(oldtup.t_data);
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        ReleaseBuffer(buffer);
        return result;
@@ -1767,10 +1823,13 @@ l2:
 void
 simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
 {
-   ItemPointerData ctid;
    int         result;
+   ItemPointerData update_ctid;
+   TransactionId update_xmax;
 
-   result = heap_update(relation, otid, tup, &ctid, GetCurrentCommandId());
+   result = heap_update(relation, otid, tup,
+                        &update_ctid, &update_xmax,
+                        GetCurrentCommandId());
    switch (result)
    {
        case HeapTupleSelfUpdated:
@@ -1794,9 +1853,34 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
 
 /*
  * heap_mark4update        - mark a tuple for update
+ *
+ * Note that this acquires a buffer pin, which the caller must release.
+ *
+ * Input parameters:
+ * relation: relation containing tuple (caller must hold suitable lock)
+ * tuple->t_self: TID of tuple to lock (rest of struct need not be valid)
+ * cid: current command ID (used for visibility test, and stored into
+ *     tuple's cmax if lock is successful)
+ *
+ * Output parameters:
+ * *tuple: all fields filled in
+ * *buffer: set to buffer holding tuple (pinned but not locked at exit)
+ * *ctid: set to tuple's t_ctid, but only in failure cases
+ * *update_xmax: set to tuple's xmax, but only in failure cases
+ *
+ * Function result may be:
+ * HeapTupleMayBeUpdated: lock was successfully acquired
+ * HeapTupleSelfUpdated: lock failed because tuple updated by self
+ * HeapTupleUpdated: lock failed because tuple updated by other xact
+ *
+ * In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
+ * If t_ctid is the same as t_self, the tuple was deleted; if different, the
+ * tuple was updated, and t_ctid is the location of the replacement tuple.
+ * (t_xmax is needed to verify that the replacement tuple matches.)
  */
 int
 heap_mark4update(Relation relation, HeapTuple tuple, Buffer *buffer,
+                ItemPointer ctid, TransactionId *update_xmax,
                 CommandId cid)
 {
    ItemPointer tid = &(tuple->t_self);
@@ -1818,9 +1902,12 @@ heap_mark4update(Relation relation, HeapTuple tuple, Buffer *buffer,
 
    dp = (PageHeader) BufferGetPage(*buffer);
    lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid));
+   Assert(ItemIdIsUsed(lp));
+
    tuple->t_datamcxt = NULL;
    tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
    tuple->t_len = ItemIdGetLength(lp);
+   tuple->t_tableOid = RelationGetRelid(relation);
 
 l3:
    sv_infomask = tuple->t_data->t_infomask;
@@ -1867,7 +1954,9 @@ l3:
    if (result != HeapTupleMayBeUpdated)
    {
        Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated);
-       tuple->t_self = tuple->t_data->t_ctid;
+       Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
+       *ctid = tuple->t_data->t_ctid;
+       *update_xmax = HeapTupleHeaderGetXmax(tuple->t_data);
        LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
        return result;
    }
index 09c593db37a1a961f83701578b992fdcced91c6e..125a9d2063e7685de15bd12c5f9fab829ac96323 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/commands/trigger.c,v 1.136.2.2 2003/05/19 17:23:54 tgl Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/commands/trigger.c,v 1.136.2.3 2005/08/26 20:07:15 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1351,14 +1351,18 @@ GetTupleForTrigger(EState *estate, ResultRelInfo *relinfo,
    if (newSlot != NULL)
    {
        int         test;
+       ItemPointerData update_ctid;
+       TransactionId update_xmax;
+
+       *newSlot = NULL;
 
        /*
         * mark tuple for update
         */
-       *newSlot = NULL;
-       tuple.t_self = *tid;
 ltrmark:;
-       test = heap_mark4update(relation, &tuple, &buffer, cid);
+       tuple.t_self = *tid;
+       test = heap_mark4update(relation, &tuple, &buffer,
+                               &update_ctid, &update_xmax, cid);
        switch (test)
        {
            case HeapTupleSelfUpdated:
@@ -1373,15 +1377,18 @@ ltrmark:;
                ReleaseBuffer(buffer);
                if (XactIsoLevel == XACT_SERIALIZABLE)
                    elog(ERROR, "Can't serialize access due to concurrent update");
-               else if (!(ItemPointerEquals(&(tuple.t_self), tid)))
+               else if (!ItemPointerEquals(&update_ctid, &tuple.t_self))
                {
-                   TupleTableSlot *epqslot = EvalPlanQual(estate,
-                                            relinfo->ri_RangeTableIndex,
-                                                       &(tuple.t_self));
-
-                   if (!(TupIsNull(epqslot)))
+                   /* it was updated, so look at the updated version */
+                   TupleTableSlot *epqslot;
+
+                   epqslot = EvalPlanQual(estate,
+                                          relinfo->ri_RangeTableIndex,
+                                          &update_ctid,
+                                          update_xmax);
+                   if (!TupIsNull(epqslot))
                    {
-                       *tid = tuple.t_self;
+                       *tid = update_ctid;
                        *newSlot = epqslot;
                        goto ltrmark;
                    }
@@ -1418,6 +1425,7 @@ ltrmark:;
        tuple.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
        tuple.t_len = ItemIdGetLength(lp);
        tuple.t_self = *tid;
+       tuple.t_tableOid = RelationGetRelid(relation);
    }
 
    result = heap_copytuple(&tuple);
index 51082689250d264a96a7c340b0399a1bdcea8817..0b40fbde6183125100912a371ae4e4bd25df5249 100644 (file)
@@ -13,7 +13,7 @@
  *
  *
  * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.244 2002/10/31 19:25:29 tgl Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.244.2.1 2005/08/26 20:07:16 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1646,8 +1646,6 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                Buffer      Cbuf = buf;
                bool        freeCbuf = false;
                bool        chain_move_failed = false;
-               Page        Cpage;
-               ItemId      Citemid;
                ItemPointerData Ctid;
                HeapTupleData tp = tuple;
                Size        tlen = tuple_len;
@@ -1671,68 +1669,85 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                    break;      /* out of walk-along-page loop */
                }
 
-               vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData));
-               num_vtmove = 0;
-               free_vtmove = 100;
-
                /*
                 * If this tuple is in the begin/middle of the chain then
-                * we have to move to the end of chain.
+                * we have to move to the end of chain.  As with any
+                * t_ctid chase, we have to verify that each new tuple
+                * is really the descendant of the tuple we came from.
                 */
                while (!(tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
                                              HEAP_MARKED_FOR_UPDATE)) &&
                       !(ItemPointerEquals(&(tp.t_self),
                                           &(tp.t_data->t_ctid))))
                {
-                   Ctid = tp.t_data->t_ctid;
-                   if (freeCbuf)
-                       ReleaseBuffer(Cbuf);
-                   freeCbuf = true;
-                   Cbuf = ReadBuffer(onerel,
-                                     ItemPointerGetBlockNumber(&Ctid));
-                   Cpage = BufferGetPage(Cbuf);
-                   Citemid = PageGetItemId(Cpage,
-                                     ItemPointerGetOffsetNumber(&Ctid));
-                   if (!ItemIdIsUsed(Citemid))
+                   ItemPointerData nextTid;
+                   TransactionId priorXmax;
+                   Buffer      nextBuf;
+                   Page        nextPage;
+                   OffsetNumber nextOffnum;
+                   ItemId      nextItemid;
+                   HeapTupleHeader nextTdata;
+
+                   nextTid = tp.t_data->t_ctid;
+                   priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
+                   /* assume block# is OK (see heap_fetch comments) */
+                   nextBuf = ReadBuffer(onerel,
+                                        ItemPointerGetBlockNumber(&nextTid));
+                   nextPage = BufferGetPage(nextBuf);
+                   /* If bogus or unused slot, assume tp is end of chain */
+                   nextOffnum = ItemPointerGetOffsetNumber(&nextTid);
+                   if (nextOffnum < FirstOffsetNumber ||
+                       nextOffnum > PageGetMaxOffsetNumber(nextPage))
                    {
-                       /*
-                        * This means that in the middle of chain there
-                        * was tuple updated by older (than OldestXmin)
-                        * xaction and this tuple is already deleted by
-                        * me. Actually, upper part of chain should be
-                        * removed and seems that this should be handled
-                        * in scan_heap(), but it's not implemented at the
-                        * moment and so we just stop shrinking here.
-                        */
-                       elog(DEBUG1, "Child itemid in update-chain marked as unused - can't continue repair_frag");
-                       chain_move_failed = true;
-                       break;  /* out of loop to move to chain end */
+                       ReleaseBuffer(nextBuf);
+                       break;
                    }
+                   nextItemid = PageGetItemId(nextPage, nextOffnum);
+                   if (!ItemIdIsUsed(nextItemid))
+                   {
+                       ReleaseBuffer(nextBuf);
+                       break;
+                   }
+                   /* if not matching XMIN, assume tp is end of chain */
+                   nextTdata = (HeapTupleHeader) PageGetItem(nextPage,
+                                                             nextItemid);
+                   if (!TransactionIdEquals(HeapTupleHeaderGetXmin(nextTdata),
+                                            priorXmax))
+                   {
+                       ReleaseBuffer(nextBuf);
+                       break;
+                   }
+                   /* OK, switch our attention to the next tuple in chain */
                    tp.t_datamcxt = NULL;
-                   tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
-                   tp.t_self = Ctid;
-                   tlen = tp.t_len = ItemIdGetLength(Citemid);
-               }
-               if (chain_move_failed)
-               {
+                   tp.t_data = nextTdata;
+                   tp.t_self = nextTid;
+                   tlen = tp.t_len = ItemIdGetLength(nextItemid);
                    if (freeCbuf)
                        ReleaseBuffer(Cbuf);
-                   pfree(vtmove);
-                   break;      /* out of walk-along-page loop */
+                   Cbuf = nextBuf;
+                   freeCbuf = true;
                }
 
+               /* Set up workspace for planning the chain move */
+               vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData));
+               num_vtmove = 0;
+               free_vtmove = 100;
+
                /*
-                * Check if all items in chain can be moved
+                * Now, walk backwards up the chain (towards older tuples)
+                * and check if all items in chain can be moved.  We record
+                * all the moves that need to be made in the vtmove array.
                 */
                for (;;)
                {
                    Buffer      Pbuf;
                    Page        Ppage;
                    ItemId      Pitemid;
-                   HeapTupleData Ptp;
+                   HeapTupleHeader PTdata;
                    VTupleLinkData vtld,
                               *vtlp;
 
+                   /* Identify a target page to move this tuple to */
                    if (to_vacpage == NULL ||
                        !enough_space(to_vacpage, tlen))
                    {
@@ -1802,18 +1817,17 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                    /* this can't happen since we saw tuple earlier: */
                    if (!ItemIdIsUsed(Pitemid))
                        elog(ERROR, "Parent itemid marked as unused");
-                   Ptp.t_datamcxt = NULL;
-                   Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
+                   PTdata = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
 
                    /* ctid should not have changed since we saved it */
                    Assert(ItemPointerEquals(&(vtld.new_tid),
-                                            &(Ptp.t_data->t_ctid)));
+                                            &(PTdata->t_ctid)));
 
                    /*
-                    * Read above about cases when !ItemIdIsUsed(Citemid)
+                    * Read above about cases when !ItemIdIsUsed(nextItemid)
                     * (child item is removed)... Due to the fact that at
                     * the moment we don't remove unuseful part of
-                    * update-chain, it's possible to get too old parent
+                    * update-chain, it's possible to get non-matching parent
                     * row here. Like as in the case which caused this
                     * problem, we stop shrinking here. I could try to
                     * find real parent row but want not to do it because
@@ -1821,7 +1835,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                     * and we are too close to 6.5 release. - vadim
                     * 06/11/99
                     */
-                   if (!(TransactionIdEquals(HeapTupleHeaderGetXmax(Ptp.t_data),
+                   if (!(TransactionIdEquals(HeapTupleHeaderGetXmax(PTdata),
                                     HeapTupleHeaderGetXmin(tp.t_data))))
                    {
                        ReleaseBuffer(Pbuf);
@@ -1829,8 +1843,8 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                        chain_move_failed = true;
                        break;  /* out of check-all-items loop */
                    }
-                   tp.t_datamcxt = Ptp.t_datamcxt;
-                   tp.t_data = Ptp.t_data;
+                   tp.t_datamcxt = NULL;
+                   tp.t_data = PTdata;
                    tlen = tp.t_len = ItemIdGetLength(Pitemid);
                    if (freeCbuf)
                        ReleaseBuffer(Cbuf);
@@ -1865,6 +1879,8 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                for (ti = 0; ti < num_vtmove; ti++)
                {
                    VacPage     destvacpage = vtmove[ti].vacpage;
+                   Page        Cpage;
+                   ItemId      Citemid;
 
                    /* Get page to move from */
                    tuple.t_self = vtmove[ti].tid;
@@ -1953,16 +1969,27 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                         InvalidOffsetNumber,
                                         LP_USED);
                    if (newoff == InvalidOffsetNumber)
-                   {
                        elog(PANIC, "moving chain: failed to add item with len = %lu to page %u",
                          (unsigned long) tuple_len, destvacpage->blkno);
-                   }
                    newitemid = PageGetItemId(ToPage, newoff);
+                   /* drop temporary copy, and point to the version on the dest page */
                    pfree(newtup.t_data);
                    newtup.t_datamcxt = NULL;
                    newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
+
                    ItemPointerSet(&(newtup.t_self), destvacpage->blkno, newoff);
 
+                   /*
+                    * Set new tuple's t_ctid pointing to itself for last
+                    * tuple in chain, and to next tuple in chain
+                    * otherwise.
+                    */
+                   if (!ItemPointerIsValid(&Ctid))
+                       newtup.t_data->t_ctid = newtup.t_self;
+                   else
+                       newtup.t_data->t_ctid = Ctid;
+                   Ctid = newtup.t_self;
+
                    /* XLOG stuff */
                    if (!onerel->rd_istemp)
                    {
@@ -1992,17 +2019,6 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                    if (destvacpage->blkno > last_move_dest_block)
                        last_move_dest_block = destvacpage->blkno;
 
-                   /*
-                    * Set new tuple's t_ctid pointing to itself for last
-                    * tuple in chain, and to next tuple in chain
-                    * otherwise.
-                    */
-                   if (!ItemPointerIsValid(&Ctid))
-                       newtup.t_data->t_ctid = newtup.t_self;
-                   else
-                       newtup.t_data->t_ctid = Ctid;
-                   Ctid = newtup.t_self;
-
                    num_moved++;
 
                    /*
index 622cfa937138f69c9e7d68614ed9fb21d47166ba..b6684579f1be524e67b598a850bfd4515a8db62d 100644 (file)
@@ -27,7 +27,7 @@
  *
  *
  * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/executor/execMain.c,v 1.180.2.2 2003/03/27 14:33:21 tgl Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/executor/execMain.c,v 1.180.2.3 2005/08/26 20:07:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1012,8 +1012,10 @@ lnext:   ;
                foreach(l, estate->es_rowMark)
                {
                    execRowMark *erm = lfirst(l);
-                   Buffer      buffer;
                    HeapTupleData tuple;
+                   Buffer      buffer;
+                   ItemPointerData update_ctid;
+                   TransactionId update_xmax;
                    TupleTableSlot *newSlot;
                    int         test;
 
@@ -1032,6 +1034,7 @@ lnext:    ;
 
                    tuple.t_self = *((ItemPointer) DatumGetPointer(datum));
                    test = heap_mark4update(erm->relation, &tuple, &buffer,
+                                           &update_ctid, &update_xmax,
                                            estate->es_snapshot->curcid);
                    ReleaseBuffer(buffer);
                    switch (test)
@@ -1046,11 +1049,15 @@ lnext:  ;
                        case HeapTupleUpdated:
                            if (XactIsoLevel == XACT_SERIALIZABLE)
                                elog(ERROR, "Can't serialize access due to concurrent update");
-                           if (!(ItemPointerEquals(&(tuple.t_self),
-                                 (ItemPointer) DatumGetPointer(datum))))
+                           if (!ItemPointerEquals(&update_ctid,
+                                                  &tuple.t_self))
                            {
-                               newSlot = EvalPlanQual(estate, erm->rti, &(tuple.t_self));
-                               if (!(TupIsNull(newSlot)))
+                               /* updated, so look at updated version */
+                               newSlot = EvalPlanQual(estate,
+                                                      erm->rti,
+                                                      &update_ctid,
+                                                      update_xmax);
+                               if (!TupIsNull(newSlot))
                                {
                                    slot = newSlot;
                                    estate->es_useEvalPlan = true;
@@ -1280,8 +1287,9 @@ ExecDelete(TupleTableSlot *slot,
 {
    ResultRelInfo *resultRelInfo;
    Relation    resultRelationDesc;
-   ItemPointerData ctid;
    int         result;
+   ItemPointerData update_ctid;
+   TransactionId update_xmax;
 
    /*
     * get information on the (current) result relation
@@ -1307,7 +1315,7 @@ ExecDelete(TupleTableSlot *slot,
     */
 ldelete:;
    result = heap_delete(resultRelationDesc, tupleid,
-                        &ctid,
+                        &update_ctid, &update_xmax,
                         estate->es_snapshot->curcid);
    switch (result)
    {
@@ -1321,14 +1329,17 @@ ldelete:;
        case HeapTupleUpdated:
            if (XactIsoLevel == XACT_SERIALIZABLE)
                elog(ERROR, "Can't serialize access due to concurrent update");
-           else if (!(ItemPointerEquals(tupleid, &ctid)))
+           else if (!ItemPointerEquals(tupleid, &update_ctid))
            {
-               TupleTableSlot *epqslot = EvalPlanQual(estate,
-                              resultRelInfo->ri_RangeTableIndex, &ctid);
+               TupleTableSlot *epqslot;
 
+               epqslot = EvalPlanQual(estate,
+                                      resultRelInfo->ri_RangeTableIndex,
+                                      &update_ctid,
+                                      update_xmax);
                if (!TupIsNull(epqslot))
                {
-                   *tupleid = ctid;
+                   *tupleid = update_ctid;
                    goto ldelete;
                }
            }
@@ -1376,8 +1387,9 @@ ExecUpdate(TupleTableSlot *slot,
    HeapTuple   tuple;
    ResultRelInfo *resultRelInfo;
    Relation    resultRelationDesc;
-   ItemPointerData ctid;
    int         result;
+   ItemPointerData update_ctid;
+   TransactionId update_xmax;
    int         numIndices;
 
    /*
@@ -1443,7 +1455,7 @@ lreplace:;
     * replace the heap tuple
     */
    result = heap_update(resultRelationDesc, tupleid, tuple,
-                        &ctid,
+                        &update_ctid, &update_xmax,
                         estate->es_snapshot->curcid);
    switch (result)
    {
@@ -1457,14 +1469,17 @@ lreplace:;
        case HeapTupleUpdated:
            if (XactIsoLevel == XACT_SERIALIZABLE)
                elog(ERROR, "Can't serialize access due to concurrent update");
-           else if (!(ItemPointerEquals(tupleid, &ctid)))
+           else if (!(ItemPointerEquals(tupleid, &update_ctid)))
            {
-               TupleTableSlot *epqslot = EvalPlanQual(estate,
-                              resultRelInfo->ri_RangeTableIndex, &ctid);
+               TupleTableSlot *epqslot;
 
+               epqslot = EvalPlanQual(estate,
+                                      resultRelInfo->ri_RangeTableIndex,
+                                      &update_ctid,
+                                      update_xmax);
                if (!TupIsNull(epqslot))
                {
-                   *tupleid = ctid;
+                   *tupleid = update_ctid;
                    tuple = ExecRemoveJunk(estate->es_junkFilter, epqslot);
                    slot = ExecStoreTuple(tuple,
                                    estate->es_junkFilter->jf_resultSlot,
@@ -1606,9 +1621,21 @@ ExecConstraints(const char *caller, ResultRelInfo *resultRelInfo,
  * under READ COMMITTED rules.
  *
  * See backend/executor/README for some info about how this works.
+ *
+ * estate - executor state data
+ * rti - rangetable index of table containing tuple
+ * *tid - t_ctid from the outdated tuple (ie, next updated version)
+ * priorXmax - t_xmax from the outdated tuple
+ *
+ * *tid is also an output parameter: it's modified to hold the TID of the
+ * latest version of the tuple (note this may be changed even on failure)
+ *
+ * Returns a slot containing the new candidate update/delete tuple, or
+ * NULL if we determine we shouldn't process the row.
  */
 TupleTableSlot *
-EvalPlanQual(EState *estate, Index rti, ItemPointer tid)
+EvalPlanQual(EState *estate, Index rti,
+            ItemPointer tid, TransactionId priorXmax)
 {
    evalPlanQual *epq;
    EState     *epqstate;
@@ -1653,10 +1680,24 @@ EvalPlanQual(EState *estate, Index rti, ItemPointer tid)
    {
        Buffer      buffer;
 
-       if (heap_fetch(relation, SnapshotDirty, &tuple, &buffer, false, NULL))
+       if (heap_fetch(relation, SnapshotDirty, &tuple, &buffer, true, NULL))
        {
-           TransactionId xwait = SnapshotDirty->xmax;
+           /*
+            * If xmin isn't what we're expecting, the slot must have been
+            * recycled and reused for an unrelated tuple.  This implies
+            * that the latest version of the row was deleted, so we need
+            * do nothing.  (Should be safe to examine xmin without getting
+            * buffer's content lock, since xmin never changes in an existing
+            * tuple.)
+            */
+           if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple.t_data),
+                                    priorXmax))
+           {
+               ReleaseBuffer(buffer);
+               return NULL;
+           }
 
+           /* otherwise xmin should not be dirty... */
            if (TransactionIdIsValid(SnapshotDirty->xmin))
                elog(ERROR, "EvalPlanQual: t_xmin is uncommitted ?!");
 
@@ -1664,11 +1705,11 @@ EvalPlanQual(EState *estate, Index rti, ItemPointer tid)
             * If tuple is being updated by other transaction then we have
             * to wait for its commit/abort.
             */
-           if (TransactionIdIsValid(xwait))
+           if (TransactionIdIsValid(SnapshotDirty->xmax))
            {
                ReleaseBuffer(buffer);
-               XactLockTableWait(xwait);
-               continue;
+               XactLockTableWait(SnapshotDirty->xmax);
+               continue;       /* loop back to repeat heap_fetch */
            }
 
            /*
@@ -1680,22 +1721,50 @@ EvalPlanQual(EState *estate, Index rti, ItemPointer tid)
        }
 
        /*
-        * Oops! Invalid tuple. Have to check is it updated or deleted.
-        * Note that it's possible to get invalid SnapshotDirty->tid if
-        * tuple updated by this transaction. Have we to check this ?
+        * If the referenced slot was actually empty, the latest version
+        * of the row must have been deleted, so we need do nothing.
+        */
+       if (tuple.t_data == NULL)
+       {
+           ReleaseBuffer(buffer);
+           return NULL;
+       }
+
+       /*
+        * As above, if xmin isn't what we're expecting, do nothing.
         */
-       if (ItemPointerIsValid(&(SnapshotDirty->tid)) &&
-           !(ItemPointerEquals(&(tuple.t_self), &(SnapshotDirty->tid))))
+       if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple.t_data),
+                                priorXmax))
        {
-           /* updated, so look at the updated copy */
-           tuple.t_self = SnapshotDirty->tid;
-           continue;
+           ReleaseBuffer(buffer);
+           return NULL;
        }
 
        /*
-        * Deleted or updated by this transaction; forget it.
+        * If we get here, the tuple was found but failed SnapshotDirty.
+        * Assuming the xmin is either a committed xact or our own xact
+        * (as it certainly should be if we're trying to modify the tuple),
+        * this must mean that the row was updated or deleted by either
+        * a committed xact or our own xact.  If it was deleted, we can
+        * ignore it; if it was updated then chain up to the next version
+        * and repeat the whole test.
+        *
+        * As above, it should be safe to examine xmax and t_ctid without
+        * the buffer content lock, because they can't be changing.
         */
-       return NULL;
+       if (ItemPointerEquals(&tuple.t_self, &tuple.t_data->t_ctid))
+       {
+           /* deleted, so forget about it */
+           ReleaseBuffer(buffer);
+           return NULL;
+       }
+
+       /* updated, so look at the updated row */
+       tuple.t_self = tuple.t_data->t_ctid;
+       /* updated row should have xmin matching this xmax */
+       priorXmax = HeapTupleHeaderGetXmax(tuple.t_data);
+       ReleaseBuffer(buffer);
+       /* loop back to fetch next in chain */
    }
 
    /*
index fb9323d9e86f121441b1b7681eb80f674b0e385c..25e7af0872fff415211e06de523a3a233b40917b 100644 (file)
@@ -30,7 +30,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/utils/time/tqual.c,v 1.61.2.1 2005/05/07 21:23:24 tgl Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/utils/time/tqual.c,v 1.61.2.2 2005/08/26 20:07:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -931,10 +931,13 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin)
                            HeapTupleHeaderGetXmax(tuple)))
    {
        /*
-        * inserter also deleted it, so it was never visible to anyone
-        * else
+        * Inserter also deleted it, so it was never visible to anyone
+        * else.  However, we can only remove it early if it's not an
+        * updated tuple; else its parent tuple is linking to it via t_ctid,
+        * and this tuple mustn't go away before the parent does.
         */
-       return HEAPTUPLE_DEAD;
+       if (!(tuple->t_infomask & HEAP_UPDATED))
+           return HEAPTUPLE_DEAD;
    }
 
    if (!TransactionIdPrecedes(HeapTupleHeaderGetXmax(tuple), OldestXmin))
index 61c95268087104165face6dfef898e44ae8e6e65..9241afff8a88ee07a082acd627090e1c419f615f 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: heapam.h,v 1.79 2002/09/04 20:31:36 momjian Exp $
+ * $Id: heapam.h,v 1.79.2.1 2005/08/26 20:07:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -156,17 +156,21 @@ extern bool heap_fetch(Relation relation, Snapshot snapshot,
           HeapTuple tuple, Buffer *userbuf, bool keep_buf,
           PgStat_Info *pgstat_info);
 
-extern ItemPointer heap_get_latest_tid(Relation relation, Snapshot snapshot,
+extern void heap_get_latest_tid(Relation relation, Snapshot snapshot,
                    ItemPointer tid);
 extern void setLastTid(const ItemPointer tid);
 
 extern Oid heap_insert(Relation relation, HeapTuple tup, CommandId cid);
-extern int heap_delete(Relation relation, ItemPointer tid, ItemPointer ctid,
-           CommandId cid);
-extern int heap_update(Relation relation, ItemPointer otid, HeapTuple tup,
-           ItemPointer ctid, CommandId cid);
-extern int heap_mark4update(Relation relation, HeapTuple tup,
-                Buffer *userbuf, CommandId cid);
+extern int heap_delete(Relation relation, ItemPointer tid,
+                      ItemPointer ctid, TransactionId *update_xmax,
+                      CommandId cid);
+extern int heap_update(Relation relation, ItemPointer otid,
+                      HeapTuple newtup,
+                      ItemPointer ctid, TransactionId *update_xmax,
+                      CommandId cid);
+extern int heap_mark4update(Relation relation, HeapTuple tuple,
+                           Buffer *buffer, ItemPointer ctid,
+                           TransactionId *update_xmax, CommandId cid);
 
 extern Oid simple_heap_insert(Relation relation, HeapTuple tup);
 extern void simple_heap_delete(Relation relation, ItemPointer tid);
index a12c31bff7c4f6be7e63cb5a777056d5575f4237..734edc1cedac1ddb22efa1d206698074de828e98 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: executor.h,v 1.78 2002/09/04 20:31:42 momjian Exp $
+ * $Id: executor.h,v 1.78.2.1 2005/08/26 20:07:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -55,7 +55,7 @@ extern void ExecutorEnd(QueryDesc *queryDesc, EState *estate);
 extern void ExecConstraints(const char *caller, ResultRelInfo *resultRelInfo,
                TupleTableSlot *slot, EState *estate);
 extern TupleTableSlot *EvalPlanQual(EState *estate, Index rti,
-            ItemPointer tid);
+                                   ItemPointer tid, TransactionId priorXmax);
 
 /*
  * prototypes from functions in execProcnode.c