Back-patch fixes for problems with VACUUM destroying t_ctid chains too soon, ...

author Tom Lane <[email protected]>

Thu, 25 Aug 2005 19:45:06 +0000 (19:45 +0000)

committer Tom Lane <[email protected]>

Thu, 25 Aug 2005 19:45:06 +0000 (19:45 +0000)
author Tom Lane <[email protected]>
Thu, 25 Aug 2005 19:45:06 +0000 (19:45 +0000)
committer Tom Lane <[email protected]>
Thu, 25 Aug 2005 19:45:06 +0000 (19:45 +0000)
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c

index e26e17956217e9af61d0659df7759307f2da8d61..e242739b9a82f7af87fcb0e702cf35c83ce2bc66 100644 (file)
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1015,83 +1015,130 @@ heap_release_fetch(Relation relation,
  
  /*
   *     heap_get_latest_tid -  get the latest tid of a specified tuple
+ *
+ * Actually, this gets the latest version that is visible according to
+ * the passed snapshot.  You can pass SnapshotDirty to get the very latest,
+ * possibly uncommitted version.
+ *
+ * *tid is both an input and an output parameter: it is updated to
+ * show the latest version of the row.  Note that it will not be changed
+ * if no version of the row passes the snapshot test.
   */
-ItemPointer
+void
  heap_get_latest_tid(Relation relation,
                                         Snapshot snapshot,
                                         ItemPointer tid)
  {
-       ItemId          lp = NULL;
-       Buffer          buffer;
-       PageHeader      dp;
-       OffsetNumber offnum;
-       HeapTupleData tp;
-       HeapTupleHeader t_data;
+       BlockNumber     blk;
         ItemPointerData ctid;
-       bool            invalidBlock,
-                               linkend,
-                               valid;
+       TransactionId priorXmax;
  
-       /*
-        * get the buffer from the relation descriptor Note that this does a
-        * buffer pin.
-        */
-       buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
-       LockBuffer(buffer, BUFFER_LOCK_SHARE);
+       /* this is to avoid Assert failures on bad input */
+       if (!ItemPointerIsValid(tid))
+               return;
  
         /*
-        * get the item line pointer corresponding to the requested tid
+        * Since this can be called with user-supplied TID, don't trust the
+        * input too much.  (RelationGetNumberOfBlocks is an expensive check,
+        * so we don't check t_ctid links again this way.  Note that it would
+        * not do to call it just once and save the result, either.)
          */
-       dp = (PageHeader) BufferGetPage(buffer);
-       offnum = ItemPointerGetOffsetNumber(tid);
-       invalidBlock = true;
-       if (!PageIsNew(dp))
-       {
-               lp = PageGetItemId(dp, offnum);
-               if (ItemIdIsUsed(lp))
-                       invalidBlock = false;
-       }
-       if (invalidBlock)
-       {
-               LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
-               ReleaseBuffer(buffer);
-               return NULL;
-       }
+       blk = ItemPointerGetBlockNumber(tid);
+       if (blk >= RelationGetNumberOfBlocks(relation))
+               elog(ERROR, "block number %u is out of range for relation \"%s\"",
+                        blk, RelationGetRelationName(relation));
  
         /*
-        * more sanity checks
+        * Loop to chase down t_ctid links.  At top of loop, ctid is the
+        * tuple we need to examine, and *tid is the TID we will return if
+        * ctid turns out to be bogus.
+        *
+        * Note that we will loop until we reach the end of the t_ctid chain.
+        * Depending on the snapshot passed, there might be at most one visible
+        * version of the row, but we don't try to optimize for that.
          */
+       ctid = *tid;
+       priorXmax = InvalidTransactionId;       /* cannot check first XMIN */
+       for (;;)
+       {
+               Buffer          buffer;
+               PageHeader      dp;
+               OffsetNumber offnum;
+               ItemId          lp;
+               HeapTupleData tp;
+               bool            valid;
  
-       tp.t_datamcxt = NULL;
-       t_data = tp.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
-       tp.t_len = ItemIdGetLength(lp);
-       tp.t_self = *tid;
-       ctid = tp.t_data->t_ctid;
+               /*
+                * Read, pin, and lock the page.
+                */
+               buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
+               LockBuffer(buffer, BUFFER_LOCK_SHARE);
+               dp = (PageHeader) BufferGetPage(buffer);
  
-       /*
-        * check time qualification of tid
-        */
+               /*
+                * Check for bogus item number.  This is not treated as an error
+                * condition because it can happen while following a t_ctid link.
+                * We just assume that the prior tid is OK and return it unchanged.
+                */
+               offnum = ItemPointerGetOffsetNumber(&ctid);
+               if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
+               {
+                       LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+                       ReleaseBuffer(buffer);
+                       break;
+               }
+               lp = PageGetItemId(dp, offnum);
+               if (!ItemIdIsUsed(lp))
+               {
+                       LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+                       ReleaseBuffer(buffer);
+                       break;
+               }
  
-       HeapTupleSatisfies(&tp, relation, buffer, dp,
-                                          snapshot, 0, NULL, valid);
+               /* OK to access the tuple */
+               tp.t_self = ctid;
+               tp.t_datamcxt = NULL;
+               tp.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
+               tp.t_len = ItemIdGetLength(lp);
  
-       linkend = true;
-       if ((t_data->t_infomask & HEAP_XMIN_COMMITTED) != 0 &&
-               !ItemPointerEquals(tid, &ctid))
-               linkend = false;
+               /*
+                * After following a t_ctid link, we might arrive at an unrelated
+                * tuple.  Check for XMIN match.
+                */
+               if (TransactionIdIsValid(priorXmax) &&
+                       !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
+               {
+                       LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+                       ReleaseBuffer(buffer);
+                       break;
+               }
  
-       LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
-       ReleaseBuffer(buffer);
+               /*
+                * Check time qualification of tuple; if visible, set it as the new
+                * result candidate.
+                */
+               HeapTupleSatisfies(&tp, relation, buffer, dp,
+                                                  snapshot, 0, NULL, valid);
+               if (valid)
+                       *tid = ctid;
  
-       if (!valid)
-       {
-               if (linkend)
-                       return NULL;
-               heap_get_latest_tid(relation, snapshot, &ctid);
-               *tid = ctid;
-       }
+               /*
+                * If there's a valid t_ctid link, follow it, else we're done.
+                */
+               if ((tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
+                                                                         HEAP_MARKED_FOR_UPDATE)) ||
+                       ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
+               {
+                       LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+                       ReleaseBuffer(buffer);
+                       break;
+               }
  
-       return tid;
+               ctid = tp.t_data->t_ctid;
+               priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
+               LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+               ReleaseBuffer(buffer);
+       }                               /* end of loop */
  }
  
  /*
@@ -1250,29 +1297,34 @@ simple_heap_insert(Relation relation, HeapTuple tup)
  }
  
  /*
- *     heap_delete             - delete a tuple
+ *     heap_delete - delete a tuple
   *
   * NB: do not call this directly unless you are prepared to deal with
   * concurrent-update conditions.  Use simple_heap_delete instead.
   *
- *     relation - table to be modified
+ *     relation - table to be modified (caller must hold suitable lock)
   *     tid - TID of tuple to be deleted
   *     ctid - output parameter, used only for failure case (see below)
- *     cid - delete command ID to use in verifying tuple visibility
+ *     update_xmax - output parameter, used only for failure case (see below)
+ *     cid - delete command ID (used for visibility test, and stored into
+ *             cmax if successful)
   *     crosscheck - if not InvalidSnapshot, also check tuple against this
   *     wait - true if should wait for any conflicting update to commit/abort
   *
   * Normal, successful return value is HeapTupleMayBeUpdated, which
   * actually means we did delete it.  Failure return codes are
   * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
- * (the last only possible if wait == false).  On a failure return,
- * *ctid is set to the ctid link of the target tuple (possibly a later
- * version of the row).
+ * (the last only possible if wait == false).
+ *
+ * In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
+ * If t_ctid is the same as tid, the tuple was deleted; if different, the
+ * tuple was updated, and t_ctid is the location of the replacement tuple.
+ * (t_xmax is needed to verify that the replacement tuple matches.)
   */
  int
  heap_delete(Relation relation, ItemPointer tid,
-                       ItemPointer ctid, CommandId cid,
-                       Snapshot crosscheck, bool wait)
+                       ItemPointer ctid, TransactionId *update_xmax,
+                       CommandId cid, Snapshot crosscheck, bool wait)
  {
         TransactionId xid = GetCurrentTransactionId();
         ItemId          lp;
@@ -1288,11 +1340,11 @@ heap_delete(Relation relation, ItemPointer tid,
  
         dp = (PageHeader) BufferGetPage(buffer);
         lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid));
+
         tp.t_datamcxt = NULL;
-       tp.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
+       tp.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
         tp.t_len = ItemIdGetLength(lp);
         tp.t_self = *tid;
-       tp.t_tableOid = relation->rd_id;
  
  l1:
         result = HeapTupleSatisfiesUpdate(tp.t_data, cid, buffer);
@@ -1346,7 +1398,9 @@ l1:
                 Assert(result == HeapTupleSelfUpdated ||
                            result == HeapTupleUpdated ||
                            result == HeapTupleBeingUpdated);
+               Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
                 *ctid = tp.t_data->t_ctid;
+               *update_xmax = HeapTupleHeaderGetXmax(tp.t_data);
                 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                 ReleaseBuffer(buffer);
                 return result;
@@ -1433,11 +1487,12 @@ l1:
  void
  simple_heap_delete(Relation relation, ItemPointer tid)
  {
-       ItemPointerData ctid;
         int                     result;
+       ItemPointerData update_ctid;
+       TransactionId update_xmax;
  
         result = heap_delete(relation, tid,
-                                                &ctid,
+                                                &update_ctid, &update_xmax,
                                                  GetCurrentCommandId(), InvalidSnapshot,
                                                  true /* wait for commit */ );
         switch (result)
@@ -1467,27 +1522,33 @@ simple_heap_delete(Relation relation, ItemPointer tid)
   * NB: do not call this directly unless you are prepared to deal with
   * concurrent-update conditions.  Use simple_heap_update instead.
   *
- *     relation - table to be modified
+ *     relation - table to be modified (caller must hold suitable lock)
   *     otid - TID of old tuple to be replaced
   *     newtup - newly constructed tuple data to store
   *     ctid - output parameter, used only for failure case (see below)
- *     cid - update command ID to use in verifying old tuple visibility
+ *     update_xmax - output parameter, used only for failure case (see below)
+ *     cid - update command ID (used for visibility test, and stored into
+ *             cmax/cmin if successful)
   *     crosscheck - if not InvalidSnapshot, also check old tuple against this
   *     wait - true if should wait for any conflicting update to commit/abort
   *
   * Normal, successful return value is HeapTupleMayBeUpdated, which
   * actually means we *did* update it.  Failure return codes are
   * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
- * (the last only possible if wait == false).  On a failure return,
- * *ctid is set to the ctid link of the old tuple (possibly a later
- * version of the row).
+ * (the last only possible if wait == false).
+ *
   * On success, newtup->t_self is set to the TID where the new tuple
   * was inserted.
+ *
+ * In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
+ * If t_ctid is the same as otid, the tuple was deleted; if different, the
+ * tuple was updated, and t_ctid is the location of the replacement tuple.
+ * (t_xmax is needed to verify that the replacement tuple matches.)
   */
  int
  heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
-                       ItemPointer ctid, CommandId cid,
-                       Snapshot crosscheck, bool wait)
+                       ItemPointer ctid, TransactionId *update_xmax,
+                       CommandId cid, Snapshot crosscheck, bool wait)
  {
         TransactionId xid = GetCurrentTransactionId();
         ItemId          lp;
@@ -1573,7 +1634,9 @@ l2:
                 Assert(result == HeapTupleSelfUpdated ||
                            result == HeapTupleUpdated ||
                            result == HeapTupleBeingUpdated);
+               Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
                 *ctid = oldtup.t_data->t_ctid;
+               *update_xmax = HeapTupleHeaderGetXmax(oldtup.t_data);
                 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                 ReleaseBuffer(buffer);
                 return result;
@@ -1795,11 +1858,12 @@ l2:
  void
  simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
  {
-       ItemPointerData ctid;
         int                     result;
+       ItemPointerData update_ctid;
+       TransactionId update_xmax;
  
         result = heap_update(relation, otid, tup,
-                                                &ctid,
+                                                &update_ctid, &update_xmax,
                                                  GetCurrentCommandId(), InvalidSnapshot,
                                                  true /* wait for commit */ );
         switch (result)
@@ -1825,9 +1889,34 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
  
  /*
   *     heap_mark4update                - mark a tuple for update
+ *
+ * Note that this acquires a buffer pin, which the caller must release.
+ *
+ * Input parameters:
+ *     relation: relation containing tuple (caller must hold suitable lock)
+ *     tuple->t_self: TID of tuple to lock (rest of struct need not be valid)
+ *     cid: current command ID (used for visibility test, and stored into
+ *             tuple's cmax if lock is successful)
+ *
+ * Output parameters:
+ *     *tuple: all fields filled in
+ *     *buffer: set to buffer holding tuple (pinned but not locked at exit)
+ *     *ctid: set to tuple's t_ctid, but only in failure cases
+ *     *update_xmax: set to tuple's xmax, but only in failure cases
+ *
+ * Function result may be:
+ *     HeapTupleMayBeUpdated: lock was successfully acquired
+ *     HeapTupleSelfUpdated: lock failed because tuple updated by self
+ *     HeapTupleUpdated: lock failed because tuple updated by other xact
+ *
+ * In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
+ * If t_ctid is the same as t_self, the tuple was deleted; if different, the
+ * tuple was updated, and t_ctid is the location of the replacement tuple.
+ * (t_xmax is needed to verify that the replacement tuple matches.)
   */
  int
  heap_mark4update(Relation relation, HeapTuple tuple, Buffer *buffer,
+                                ItemPointer ctid, TransactionId *update_xmax,
                                  CommandId cid)
  {
         TransactionId xid = GetCurrentTransactionId();
@@ -1841,9 +1930,12 @@ heap_mark4update(Relation relation, HeapTuple tuple, Buffer *buffer,
  
         dp = (PageHeader) BufferGetPage(*buffer);
         lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid));
+       Assert(ItemIdIsUsed(lp));
+
         tuple->t_datamcxt = NULL;
         tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
         tuple->t_len = ItemIdGetLength(lp);
+       tuple->t_tableOid = RelationGetRelid(relation);
  
  l3:
         result = HeapTupleSatisfiesUpdate(tuple->t_data, cid, *buffer);
@@ -1887,7 +1979,9 @@ l3:
         if (result != HeapTupleMayBeUpdated)
         {
                 Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated);
-               tuple->t_self = tuple->t_data->t_ctid;
+               Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
+               *ctid = tuple->t_data->t_ctid;
+               *update_xmax = HeapTupleHeaderGetXmax(tuple->t_data);
                 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
                 return result;
         }
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c

index 33ee9d21b93e073df13fcc26895c5f2017e516f4..ebbf870cc5f72bff23b490716d889a1642751ef4 100644 (file)
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -520,8 +520,9 @@ AtCommit_Notify(void)
                         }
                         else if (listener->notification == 0)
                         {
-                               ItemPointerData ctid;
                                 int                     result;
+                               ItemPointerData update_ctid;
+                               TransactionId update_xmax;
  
                                 rTuple = heap_modifytuple(lTuple, lRel,
                                                                                   value, nulls, repl);
@@ -543,7 +544,7 @@ AtCommit_Notify(void)
                                  * heap_update calls.
                                  */
                                 result = heap_update(lRel, &lTuple->t_self, rTuple,
-                                                                        &ctid,
+                                                                        &update_ctid, &update_xmax,
                                                                          GetCurrentCommandId(), InvalidSnapshot,
                                                                          false /* no wait for commit */ );
                                 switch (result)
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c

index bc1bd11d6a3948652938cb74301616cf6c57cef7..ee2f5b85c15a379b0db30b1ddb25775abeef3614 100644 (file)
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -1567,14 +1567,18 @@ GetTupleForTrigger(EState *estate, ResultRelInfo *relinfo,
         if (newSlot != NULL)
         {
                 int                     test;
+               ItemPointerData update_ctid;
+               TransactionId update_xmax;
+
+               *newSlot = NULL;
  
                 /*
                  * mark tuple for update
                  */
-               *newSlot = NULL;
-               tuple.t_self = *tid;
  ltrmark:;
-               test = heap_mark4update(relation, &tuple, &buffer, cid);
+               tuple.t_self = *tid;
+               test = heap_mark4update(relation, &tuple, &buffer,
+                                                               &update_ctid, &update_xmax, cid);
                 switch (test)
                 {
                         case HeapTupleSelfUpdated:
@@ -1591,15 +1595,18 @@ ltrmark:;
                                         ereport(ERROR,
                                                         (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
                                                          errmsg("could not serialize access due to concurrent update")));
-                               else if (!(ItemPointerEquals(&(tuple.t_self), tid)))
+                               else if (!ItemPointerEquals(&update_ctid, &tuple.t_self))
                                 {
-                                       TupleTableSlot *epqslot = EvalPlanQual(estate,
-                                                                                        relinfo->ri_RangeTableIndex,
-                                                                                                               &(tuple.t_self));
-
-                                       if (!(TupIsNull(epqslot)))
+                                       /* it was updated, so look at the updated version */
+                                       TupleTableSlot *epqslot;
+
+                                       epqslot = EvalPlanQual(estate,
+                                                                                  relinfo->ri_RangeTableIndex,
+                                                                                  &update_ctid,
+                                                                                  update_xmax);
+                                       if (!TupIsNull(epqslot))
                                         {
-                                               *tid = tuple.t_self;
+                                               *tid = update_ctid;
                                                 *newSlot = epqslot;
                                                 goto ltrmark;
                                         }
@@ -1634,6 +1641,7 @@ ltrmark:;
                 tuple.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
                 tuple.t_len = ItemIdGetLength(lp);
                 tuple.t_self = *tid;
+               tuple.t_tableOid = RelationGetRelid(relation);
         }
  
         result = heap_copytuple(&tuple);
@@ -2347,14 +2355,18 @@ AfterTriggerEndQuery(void)
  
  
  /* ----------
- * AfterTriggerEndXact()
+ * AfterTriggerFireDeferred()
   *
   *     Called just before the current transaction is committed. At this
- *     time we invoke all DEFERRED triggers and tidy up.
+ *     time we invoke all pending DEFERRED triggers.
+ *
+ *     It is possible for other modules to queue additional deferred triggers
+ *     during pre-commit processing; therefore xact.c may have to call this
+ *     multiple times.
   * ----------
   */
  void
-AfterTriggerEndXact(void)
+AfterTriggerFireDeferred(void)
  {
         AfterTriggerEventList *events;
  
@@ -2369,14 +2381,14 @@ AfterTriggerEndXact(void)
          * for them to use.  (Since PortalRunUtility doesn't set a snap for
          * COMMIT, we can't assume ActiveSnapshot is valid on entry.)
          */
-       if (afterTriggers->events.head != NULL)
+       events = &afterTriggers->events;
+       if (events->head != NULL)
                 ActiveSnapshot = CopySnapshot(GetTransactionSnapshot());
  
         /*
          * Run all the remaining triggers.  Loop until they are all gone,
          * just in case some trigger queues more for us to do.
          */
-       events = &afterTriggers->events;
         while (afterTriggerMarkEvents(events, NULL, false))
         {
                 CommandId               firing_id = afterTriggers->firing_counter++;
@@ -2384,34 +2396,26 @@ AfterTriggerEndXact(void)
                 afterTriggerInvokeEvents(events, firing_id, true);
         }
  
-       /*
-        * Forget everything we know about AFTER triggers.
-        *
-        * Since all the info is in TopTransactionContext or children thereof, we
-        * need do nothing special to reclaim memory.
-        */
-       afterTriggers = NULL;
+       Assert(events->head == NULL);
  }
  
  
  /* ----------
- * AfterTriggerAbortXact()
+ * AfterTriggerEndXact()
+ *
+ *     The current transaction is finishing.
   *
- *     The current transaction has entered the abort state.
- *     All outstanding triggers are canceled so we simply throw
+ *     Any unfired triggers are canceled so we simply throw
   *     away anything we know.
+ *
+ *     Note: it is possible for this to be called repeatedly in case of
+ *     error during transaction abort; therefore, do not complain if
+ *     already closed down.
   * ----------
   */
  void
-AfterTriggerAbortXact(void)
+AfterTriggerEndXact(bool isCommit)
  {
-       /*
-        * Ignore call if we aren't in a transaction.  (Need this to survive
-        * repeat call in case of error during transaction abort.)
-        */
-       if (afterTriggers == NULL)
-               return;
-
         /*
          * Forget everything we know about AFTER triggers.
          *
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c

index 108a73ce8ddf789b7be92c8c14e6818561f3532b..a9d1cd3f7331d2cbd4b29350f0db6a2d2ceb388a 100644 (file)
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -1817,72 +1817,85 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                         break;          /* out of walk-along-page loop */
                                 }
  
-                               vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData));
-                               num_vtmove = 0;
-                               free_vtmove = 100;
-
                                 /*
                                  * If this tuple is in the begin/middle of the chain then
-                                * we have to move to the end of chain.
+                                * we have to move to the end of chain.  As with any
+                                * t_ctid chase, we have to verify that each new tuple
+                                * is really the descendant of the tuple we came from.
                                  */
                                 while (!(tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
                                                                                           HEAP_MARKED_FOR_UPDATE)) &&
                                            !(ItemPointerEquals(&(tp.t_self),
                                                                                    &(tp.t_data->t_ctid))))
                                 {
-                                       Page            Cpage;
-                                       ItemId          Citemid;
-                                       ItemPointerData Ctid;
-
-                                       Ctid = tp.t_data->t_ctid;
-                                       if (freeCbuf)
-                                               ReleaseBuffer(Cbuf);
-                                       freeCbuf = true;
-                                       Cbuf = ReadBuffer(onerel,
-                                                                         ItemPointerGetBlockNumber(&Ctid));
-                                       Cpage = BufferGetPage(Cbuf);
-                                       Citemid = PageGetItemId(Cpage,
-                                                                         ItemPointerGetOffsetNumber(&Ctid));
-                                       if (!ItemIdIsUsed(Citemid))
+                                       ItemPointerData nextTid;
+                                       TransactionId priorXmax;
+                                       Buffer          nextBuf;
+                                       Page            nextPage;
+                                       OffsetNumber nextOffnum;
+                                       ItemId          nextItemid;
+                                       HeapTupleHeader nextTdata;
+
+                                       nextTid = tp.t_data->t_ctid;
+                                       priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
+                                       /* assume block# is OK (see heap_fetch comments) */
+                                       nextBuf = ReadBuffer(onerel,
+                                                                                ItemPointerGetBlockNumber(&nextTid));
+                                       nextPage = BufferGetPage(nextBuf);
+                                       /* If bogus or unused slot, assume tp is end of chain */
+                                       nextOffnum = ItemPointerGetOffsetNumber(&nextTid);
+                                       if (nextOffnum < FirstOffsetNumber ||
+                                               nextOffnum > PageGetMaxOffsetNumber(nextPage))
                                         {
-                                               /*
-                                                * This means that in the middle of chain there
-                                                * was tuple updated by older (than OldestXmin)
-                                                * xaction and this tuple is already deleted by
-                                                * me. Actually, upper part of chain should be
-                                                * removed and seems that this should be handled
-                                                * in scan_heap(), but it's not implemented at the
-                                                * moment and so we just stop shrinking here.
-                                                */
-                                               elog(DEBUG2, "child itemid in update-chain marked as unused --- can't continue repair_frag");
-                                               chain_move_failed = true;
-                                               break;  /* out of loop to move to chain end */
+                                               ReleaseBuffer(nextBuf);
+                                               break;
                                         }
+                                       nextItemid = PageGetItemId(nextPage, nextOffnum);
+                                       if (!ItemIdIsUsed(nextItemid))
+                                       {
+                                               ReleaseBuffer(nextBuf);
+                                               break;
+                                       }
+                                       /* if not matching XMIN, assume tp is end of chain */
+                                       nextTdata = (HeapTupleHeader) PageGetItem(nextPage,
+                                                                                                                         nextItemid);
+                                       if (!TransactionIdEquals(HeapTupleHeaderGetXmin(nextTdata),
+                                                                                        priorXmax))
+                                       {
+                                               ReleaseBuffer(nextBuf);
+                                               break;
+                                       }
+                                       /* OK, switch our attention to the next tuple in chain */
                                         tp.t_datamcxt = NULL;
-                                       tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
-                                       tp.t_self = Ctid;
-                                       tlen = tp.t_len = ItemIdGetLength(Citemid);
-                               }
-                               if (chain_move_failed)
-                               {
+                                       tp.t_data = nextTdata;
+                                       tp.t_self = nextTid;
+                                       tlen = tp.t_len = ItemIdGetLength(nextItemid);
                                         if (freeCbuf)
                                                 ReleaseBuffer(Cbuf);
-                                       pfree(vtmove);
-                                       break;          /* out of walk-along-page loop */
+                                       Cbuf = nextBuf;
+                                       freeCbuf = true;
                                 }
  
+                               /* Set up workspace for planning the chain move */
+                               vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData));
+                               num_vtmove = 0;
+                               free_vtmove = 100;
+
                                 /*
-                                * Check if all items in chain can be moved
+                                * Now, walk backwards up the chain (towards older tuples)
+                                * and check if all items in chain can be moved.  We record
+                                * all the moves that need to be made in the vtmove array.
                                  */
                                 for (;;)
                                 {
                                         Buffer          Pbuf;
                                         Page            Ppage;
                                         ItemId          Pitemid;
-                                       HeapTupleData Ptp;
+                                       HeapTupleHeader PTdata;
                                         VTupleLinkData vtld,
                                                            *vtlp;
  
+                                       /* Identify a target page to move this tuple to */
                                         if (to_vacpage == NULL ||
                                                 !enough_space(to_vacpage, tlen))
                                         {
@@ -1952,18 +1965,17 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                         /* this can't happen since we saw tuple earlier: */
                                         if (!ItemIdIsUsed(Pitemid))
                                                 elog(ERROR, "parent itemid marked as unused");
-                                       Ptp.t_datamcxt = NULL;
-                                       Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
+                                       PTdata = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
  
                                         /* ctid should not have changed since we saved it */
                                         Assert(ItemPointerEquals(&(vtld.new_tid),
-                                                                                        &(Ptp.t_data->t_ctid)));
+                                                                                        &(PTdata->t_ctid)));
  
                                         /*
-                                        * Read above about cases when !ItemIdIsUsed(Citemid)
+                                        * Read above about cases when !ItemIdIsUsed(nextItemid)
                                          * (child item is removed)... Due to the fact that at
                                          * the moment we don't remove unuseful part of
-                                        * update-chain, it's possible to get too old parent
+                                        * update-chain, it's possible to get non-matching parent
                                          * row here. Like as in the case which caused this
                                          * problem, we stop shrinking here. I could try to
                                          * find real parent row but want not to do it because
@@ -1971,7 +1983,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                          * and we are too close to 6.5 release. - vadim
                                          * 06/11/99
                                          */
-                                       if (!(TransactionIdEquals(HeapTupleHeaderGetXmax(Ptp.t_data),
+                                       if (!(TransactionIdEquals(HeapTupleHeaderGetXmax(PTdata),
                                                                          HeapTupleHeaderGetXmin(tp.t_data))))
                                         {
                                                 ReleaseBuffer(Pbuf);
@@ -1979,8 +1991,8 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                                 chain_move_failed = true;
                                                 break;  /* out of check-all-items loop */
                                         }
-                                       tp.t_datamcxt = Ptp.t_datamcxt;
-                                       tp.t_data = Ptp.t_data;
+                                       tp.t_datamcxt = NULL;
+                                       tp.t_data = PTdata;
                                         tlen = tp.t_len = ItemIdGetLength(Pitemid);
                                         if (freeCbuf)
                                                 ReleaseBuffer(Cbuf);
@@ -2499,16 +2511,27 @@ move_chain_tuple(Relation rel,
         newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
                                                  InvalidOffsetNumber, LP_USED);
         if (newoff == InvalidOffsetNumber)
-       {
                 elog(PANIC, "failed to add item with len = %lu to page %u while moving tuple chain",
                          (unsigned long) tuple_len, dst_vacpage->blkno);
-       }
         newitemid = PageGetItemId(dst_page, newoff);
+       /* drop temporary copy, and point to the version on the dest page */
         pfree(newtup.t_data);
         newtup.t_datamcxt = NULL;
         newtup.t_data = (HeapTupleHeader) PageGetItem(dst_page, newitemid);
+
         ItemPointerSet(&(newtup.t_self), dst_vacpage->blkno, newoff);
  
+       /*
+        * Make the new tuple's t_ctid point to itself if it is the last tuple
+        * in the chain, and to the next tuple in the chain otherwise.  (We move
+        * the chain in reverse order, so "next" is the previously moved tuple.)
+        */
+       if (!ItemPointerIsValid(ctid))
+               newtup.t_data->t_ctid = newtup.t_self;
+       else
+               newtup.t_data->t_ctid = *ctid;
+       *ctid = newtup.t_self;
+
         /* XLOG stuff */
         if (!rel->rd_istemp)
         {
@@ -2533,17 +2556,6 @@ move_chain_tuple(Relation rel,
  
         END_CRIT_SECTION();
  
-       /*
-        * Set new tuple's t_ctid pointing to itself for last tuple in chain,
-        * and to next tuple in chain otherwise.
-        */
-       /* Is this ok after log_heap_move() and END_CRIT_SECTION()? */
-       if (!ItemPointerIsValid(ctid))
-               newtup.t_data->t_ctid = newtup.t_self;
-       else
-               newtup.t_data->t_ctid = *ctid;
-       *ctid = newtup.t_self;
-
         LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK);
         if (dst_buf != old_buf)
                 LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c

index 28f6851b39b5bf877067235c382be6eadbf0be26..be00dffbc40edb77f6a29529fd89b440f87d276f 100644 (file)
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -1114,8 +1114,10 @@ lnext:   ;
                                 foreach(l, estate->es_rowMark)
                                 {
                                         execRowMark *erm = lfirst(l);
-                                       Buffer          buffer;
                                         HeapTupleData tuple;
+                                       Buffer          buffer;
+                                       ItemPointerData update_ctid;
+                                       TransactionId update_xmax;
                                         TupleTableSlot *newSlot;
                                         int                     test;
  
@@ -1133,6 +1135,7 @@ lnext:    ;
  
                                         tuple.t_self = *((ItemPointer) DatumGetPointer(datum));
                                         test = heap_mark4update(erm->relation, &tuple, &buffer,
+                                                                                       &update_ctid, &update_xmax,
                                                                                         estate->es_snapshot->curcid);
                                         ReleaseBuffer(buffer);
                                         switch (test)
@@ -1149,11 +1152,15 @@ lnext:  ;
                                                                 ereport(ERROR,
                                                                                 (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
                                                                                  errmsg("could not serialize access due to concurrent update")));
-                                                       if (!(ItemPointerEquals(&(tuple.t_self),
-                                                                 (ItemPointer) DatumGetPointer(datum))))
+                                                       if (!ItemPointerEquals(&update_ctid,
+                                                                                                  &tuple.t_self))
                                                         {
-                                                               newSlot = EvalPlanQual(estate, erm->rti, &(tuple.t_self));
-                                                               if (!(TupIsNull(newSlot)))
+                                                               /* updated, so look at updated version */
+                                                               newSlot = EvalPlanQual(estate,
+                                                                                                          erm->rti,
+                                                                                                          &update_ctid,
+                                                                                                          update_xmax);
+                                                               if (!TupIsNull(newSlot))
                                                                 {
                                                                         slot = newSlot;
                                                                         estate->es_useEvalPlan = true;
@@ -1405,8 +1412,9 @@ ExecDelete(TupleTableSlot *slot,
  {
         ResultRelInfo *resultRelInfo;
         Relation        resultRelationDesc;
-       ItemPointerData ctid;
         int                     result;
+       ItemPointerData update_ctid;
+       TransactionId update_xmax;
  
         /*
          * get information on the (current) result relation
@@ -1437,7 +1445,7 @@ ExecDelete(TupleTableSlot *slot,
          */
  ldelete:;
         result = heap_delete(resultRelationDesc, tupleid,
-                                                &ctid,
+                                                &update_ctid, &update_xmax,
                                                  estate->es_snapshot->curcid,
                                                  estate->es_crosscheck_snapshot,
                                                  true /* wait for commit */ );
@@ -1455,14 +1463,17 @@ ldelete:;
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
                                                  errmsg("could not serialize access due to concurrent update")));
-                       else if (!(ItemPointerEquals(tupleid, &ctid)))
+                       else if (!ItemPointerEquals(tupleid, &update_ctid))
                         {
-                               TupleTableSlot *epqslot = EvalPlanQual(estate,
-                                                          resultRelInfo->ri_RangeTableIndex, &ctid);
+                               TupleTableSlot *epqslot;
  
+                               epqslot = EvalPlanQual(estate,
+                                                                          resultRelInfo->ri_RangeTableIndex,
+                                                                          &update_ctid,
+                                                                          update_xmax);
                                 if (!TupIsNull(epqslot))
                                 {
-                                       *tupleid = ctid;
+                                       *tupleid = update_ctid;
                                         goto ldelete;
                                 }
                         }
@@ -1509,8 +1520,9 @@ ExecUpdate(TupleTableSlot *slot,
         HeapTuple       tuple;
         ResultRelInfo *resultRelInfo;
         Relation        resultRelationDesc;
-       ItemPointerData ctid;
         int                     result;
+       ItemPointerData update_ctid;
+       TransactionId update_xmax;
         int                     numIndices;
  
         /*
@@ -1578,7 +1590,7 @@ lreplace:;
          * referential integrity updates in serializable transactions.
          */
         result = heap_update(resultRelationDesc, tupleid, tuple,
-                                                &ctid,
+                                                &update_ctid, &update_xmax,
                                                  estate->es_snapshot->curcid,
                                                  estate->es_crosscheck_snapshot,
                                                  true /* wait for commit */ );
@@ -1596,14 +1608,17 @@ lreplace:;
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
                                                  errmsg("could not serialize access due to concurrent update")));
-                       else if (!(ItemPointerEquals(tupleid, &ctid)))
+                       else if (!(ItemPointerEquals(tupleid, &update_ctid)))
                         {
-                               TupleTableSlot *epqslot = EvalPlanQual(estate,
-                                                          resultRelInfo->ri_RangeTableIndex, &ctid);
+                               TupleTableSlot *epqslot;
  
+                               epqslot = EvalPlanQual(estate,
+                                                                          resultRelInfo->ri_RangeTableIndex,
+                                                                          &update_ctid,
+                                                                          update_xmax);
                                 if (!TupIsNull(epqslot))
                                 {
-                                       *tupleid = ctid;
+                                       *tupleid = update_ctid;
                                         tuple = ExecRemoveJunk(estate->es_junkFilter, epqslot);
                                         slot = ExecStoreTuple(tuple,
                                                                         estate->es_junkFilter->jf_resultSlot,
@@ -1750,9 +1765,21 @@ ExecConstraints(ResultRelInfo *resultRelInfo,
   * under READ COMMITTED rules.
   *
   * See backend/executor/README for some info about how this works.
+ *
+ *     estate - executor state data
+ *     rti - rangetable index of table containing tuple
+ *     *tid - t_ctid from the outdated tuple (i.e., the next updated version)
+ *     priorXmax - t_xmax from the outdated tuple
+ *
+ * *tid is also an output parameter: it's modified to hold the TID of the
+ * latest version of the tuple (note this may be changed even on failure)
+ *
+ * Returns a slot containing the new candidate update/delete tuple, or
+ * NULL if we determine we shouldn't process the row.
   */
  TupleTableSlot *
-EvalPlanQual(EState *estate, Index rti, ItemPointer tid)
+EvalPlanQual(EState *estate, Index rti,
+                        ItemPointer tid, TransactionId priorXmax)
  {
         evalPlanQual *epq;
         EState     *epqstate;
@@ -1796,11 +1823,24 @@ EvalPlanQual(EState *estate, Index rti, ItemPointer tid)
         {
                 Buffer          buffer;
  
-               if (heap_fetch(relation, SnapshotDirty, &tuple, &buffer, false, NULL))
+               if (heap_fetch(relation, SnapshotDirty, &tuple, &buffer, true, NULL))
                 {
-                       TransactionId xwait = SnapshotDirty->xmax;
+                       /*
+                        * If xmin isn't what we're expecting, the slot must have been
+                        * recycled and reused for an unrelated tuple.  This implies
+                        * that the latest version of the row was deleted, so we need
+                        * do nothing.  (Should be safe to examine xmin without getting
+                        * buffer's content lock, since xmin never changes in an existing
+                        * tuple.)
+                        */
+                       if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple.t_data),
+                                                                        priorXmax))
+                       {
+                               ReleaseBuffer(buffer);
+                               return NULL;
+                       }
  
-                       /* xmin should not be dirty... */
+                       /* otherwise xmin should not be dirty... */
                         if (TransactionIdIsValid(SnapshotDirty->xmin))
                                 elog(ERROR, "t_xmin is uncommitted in tuple to be updated");
  
@@ -1808,11 +1848,11 @@ EvalPlanQual(EState *estate, Index rti, ItemPointer tid)
                          * If tuple is being updated by other transaction then we have
                          * to wait for its commit/abort.
                          */
-                       if (TransactionIdIsValid(xwait))
+                       if (TransactionIdIsValid(SnapshotDirty->xmax))
                         {
                                 ReleaseBuffer(buffer);
-                               XactLockTableWait(xwait);
-                               continue;
+                               XactLockTableWait(SnapshotDirty->xmax);
+                               continue;               /* loop back to repeat heap_fetch */
                         }
  
                         /*
@@ -1824,22 +1864,50 @@ EvalPlanQual(EState *estate, Index rti, ItemPointer tid)
                 }
  
                 /*
-                * Oops! Invalid tuple. Have to check is it updated or deleted.
-                * Note that it's possible to get invalid SnapshotDirty->tid if
-                * tuple updated by this transaction. Have we to check this ?
+                * If the referenced slot was actually empty, the latest version
+                * of the row must have been deleted, so we need do nothing.
                  */
-               if (ItemPointerIsValid(&(SnapshotDirty->tid)) &&
-                       !(ItemPointerEquals(&(tuple.t_self), &(SnapshotDirty->tid))))
+               if (tuple.t_data == NULL)
                 {
-                       /* updated, so look at the updated copy */
-                       tuple.t_self = SnapshotDirty->tid;
-                       continue;
+                       ReleaseBuffer(buffer);
+                       return NULL;
                 }
  
                 /*
-                * Deleted or updated by this transaction; forget it.
+                * As above, if xmin isn't what we're expecting, do nothing.
                  */
-               return NULL;
+               if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple.t_data),
+                                                                priorXmax))
+               {
+                       ReleaseBuffer(buffer);
+                       return NULL;
+               }
+
+               /*
+                * If we get here, the tuple was found but failed SnapshotDirty.
+                * Assuming the xmin is either a committed xact or our own xact
+                * (as it certainly should be if we're trying to modify the tuple),
+                * this must mean that the row was updated or deleted by either
+                * a committed xact or our own xact.  If it was deleted, we can
+                * ignore it; if it was updated then chain up to the next version
+                * and repeat the whole test.
+                *
+                * As above, it should be safe to examine xmax and t_ctid without
+                * the buffer content lock, because they can't be changing.
+                */
+               if (ItemPointerEquals(&tuple.t_self, &tuple.t_data->t_ctid))
+               {
+                       /* deleted, so forget about it */
+                       ReleaseBuffer(buffer);
+                       return NULL;
+               }
+
+               /* updated, so look at the updated row */
+               tuple.t_self = tuple.t_data->t_ctid;
+               /* updated row should have xmin matching this xmax */
+               priorXmax = HeapTupleHeaderGetXmax(tuple.t_data);
+               ReleaseBuffer(buffer);
+               /* loop back to fetch next in chain */
         }
  
         /*
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c

index b90a1dd1f1e843592fd1bf5f9ed16a08ea267c27..88b6c3690c38862c0cca96d3e13341f6f77c3ca5 100644 (file)
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -11,6 +11,22 @@
   * "hint" status bits if we see that the inserting or deleting transaction
   * has now committed or aborted.
   *
+ * NOTE: must check TransactionIdIsInProgress (which looks in PGPROC array)
+ * before TransactionIdDidCommit/TransactionIdDidAbort (which look in
+ * pg_clog).  Otherwise we have a race condition: we might decide that a
+ * just-committed transaction crashed, because none of the tests succeed.
+ * xact.c is careful to record commit/abort in pg_clog before it unsets
+ * MyProc->xid in PGPROC array.  That fixes that problem, but it also
+ * means there is a window where TransactionIdIsInProgress and
+ * TransactionIdDidCommit will both return true.  If we check only
+ * TransactionIdDidCommit, we could consider a tuple committed when a
+ * later GetSnapshotData call will still think the originating transaction
+ * is in progress, which leads to application-level inconsistency.  The
+ * upshot is that we must check TransactionIdIsInProgress first in all
+ * code paths, except for a few cases where we are looking at
+ * subtransactions of our own main transaction and so there can't be any
+ * race condition.
+ *
   *
   * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
@@ -144,19 +160,19 @@ HeapTupleSatisfiesItself(HeapTupleHeader tuple, Buffer buffer)
  
                         return false;
                 }
-               else if (!TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
-               {
-                       if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(tuple)))
-                       {
-                               tuple->t_infomask |= HEAP_XMIN_INVALID;
-                               SetBufferCommitInfoNeedsSave(buffer);
-                       }
+               else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
                         return false;
+               else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
+               {
+                       tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+                       SetBufferCommitInfoNeedsSave(buffer);
                 }
                 else
                 {
-                       tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+                       /* it must have aborted or crashed */
+                       tuple->t_infomask |= HEAP_XMIN_INVALID;
                         SetBufferCommitInfoNeedsSave(buffer);
+                       return false;
                 }
         }
  
@@ -179,13 +195,14 @@ HeapTupleSatisfiesItself(HeapTupleHeader tuple, Buffer buffer)
                 return false;
         }
  
+       if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
+               return true;
+
         if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
         {
-               if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
-               {
-                       tuple->t_infomask |= HEAP_XMAX_INVALID;
-                       SetBufferCommitInfoNeedsSave(buffer);
-               }
+               /* it must have aborted or crashed */
+               tuple->t_infomask |= HEAP_XMAX_INVALID;
+               SetBufferCommitInfoNeedsSave(buffer);
                 return true;
         }
  
@@ -318,19 +335,19 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Buffer buffer)
                         else
                                 return false;   /* deleted before scan started */
                 }
-               else if (!TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
-               {
-                       if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(tuple)))
-                       {
-                               tuple->t_infomask |= HEAP_XMIN_INVALID;
-                               SetBufferCommitInfoNeedsSave(buffer);
-                       }
+               else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
                         return false;
+               else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
+               {
+                       tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+                       SetBufferCommitInfoNeedsSave(buffer);
                 }
                 else
                 {
-                       tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+                       /* it must have aborted or crashed */
+                       tuple->t_infomask |= HEAP_XMIN_INVALID;
                         SetBufferCommitInfoNeedsSave(buffer);
+                       return false;
                 }
         }
  
@@ -356,13 +373,14 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Buffer buffer)
                         return false;           /* deleted before scan started */
         }
  
+       if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
+               return true;
+
         if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
         {
-               if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
-               {
-                       tuple->t_infomask |= HEAP_XMAX_INVALID;
-                       SetBufferCommitInfoNeedsSave(buffer);
-               }
+               /* it must have aborted or crashed */
+               tuple->t_infomask |= HEAP_XMAX_INVALID;
+               SetBufferCommitInfoNeedsSave(buffer);
                 return true;
         }
  
@@ -532,19 +550,19 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
                                 return HeapTupleInvisible;              /* updated before scan
                                                                                                  * started */
                 }
-               else if (!TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
-               {
-                       if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(tuple)))
-                       {
-                               tuple->t_infomask |= HEAP_XMIN_INVALID;
-                               SetBufferCommitInfoNeedsSave(buffer);
-                       }
+               else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
                         return HeapTupleInvisible;
+               else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
+               {
+                       tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+                       SetBufferCommitInfoNeedsSave(buffer);
                 }
                 else
                 {
-                       tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+                       /* it must have aborted or crashed */
+                       tuple->t_infomask |= HEAP_XMIN_INVALID;
                         SetBufferCommitInfoNeedsSave(buffer);
+                       return HeapTupleInvisible;
                 }
         }
  
@@ -571,16 +589,15 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
                         return HeapTupleInvisible;      /* updated before scan started */
         }
  
+       if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
+               return HeapTupleBeingUpdated;
+
         if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
         {
-               if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
-               {
-                       tuple->t_infomask |= HEAP_XMAX_INVALID;
-                       SetBufferCommitInfoNeedsSave(buffer);
-                       return HeapTupleMayBeUpdated;
-               }
-               /* running xact */
-               return HeapTupleBeingUpdated;   /* in updation by other */
+               /* it must have aborted or crashed */
+               tuple->t_infomask |= HEAP_XMAX_INVALID;
+               SetBufferCommitInfoNeedsSave(buffer);
+               return HeapTupleMayBeUpdated;
         }
  
         /* xmax transaction committed */
@@ -684,23 +701,24 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Buffer buffer)
  
                         return false;
                 }
-               else if (!TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
+               else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
                 {
-                       if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(tuple)))
-                       {
-                               tuple->t_infomask |= HEAP_XMIN_INVALID;
-                               SetBufferCommitInfoNeedsSave(buffer);
-                               return false;
-                       }
                         SnapshotDirty->xmin = HeapTupleHeaderGetXmin(tuple);
                         /* XXX shouldn't we fall through to look at xmax? */
                         return true;            /* in insertion by other */
                 }
-               else
+               else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
                 {
                         tuple->t_infomask |= HEAP_XMIN_COMMITTED;
                         SetBufferCommitInfoNeedsSave(buffer);
                 }
+               else
+               {
+                       /* it must have aborted or crashed */
+                       tuple->t_infomask |= HEAP_XMIN_INVALID;
+                       SetBufferCommitInfoNeedsSave(buffer);
+                       return false;
+               }
         }
  
         /* by here, the inserting transaction has committed */
@@ -723,17 +741,18 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Buffer buffer)
                 return false;
         }
  
-       if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
+       if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
         {
-               if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
-               {
-                       tuple->t_infomask |= HEAP_XMAX_INVALID;
-                       SetBufferCommitInfoNeedsSave(buffer);
-                       return true;
-               }
-               /* running xact */
                 SnapshotDirty->xmax = HeapTupleHeaderGetXmax(tuple);
-               return true;                    /* in updation by other */
+               return true;
+       }
+
+       if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
+       {
+               /* it must have aborted or crashed */
+               tuple->t_infomask |= HEAP_XMAX_INVALID;
+               SetBufferCommitInfoNeedsSave(buffer);
+               return true;
         }
  
         /* xmax transaction committed */
@@ -847,19 +866,19 @@ HeapTupleSatisfiesSnapshot(HeapTupleHeader tuple, Snapshot snapshot,
                         else
                                 return false;   /* deleted before scan started */
                 }
-               else if (!TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
-               {
-                       if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(tuple)))
-                       {
-                               tuple->t_infomask |= HEAP_XMIN_INVALID;
-                               SetBufferCommitInfoNeedsSave(buffer);
-                       }
+               else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
                         return false;
+               else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
+               {
+                       tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+                       SetBufferCommitInfoNeedsSave(buffer);
                 }
                 else
                 {
-                       tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+                       /* it must have aborted or crashed */
+                       tuple->t_infomask |= HEAP_XMIN_INVALID;
                         SetBufferCommitInfoNeedsSave(buffer);
+                       return false;
                 }
         }
  
@@ -915,13 +934,14 @@ HeapTupleSatisfiesSnapshot(HeapTupleHeader tuple, Snapshot snapshot,
                                 return false;   /* deleted before scan started */
                 }
  
+               if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
+                       return true;
+
                 if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
                 {
-                       if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
-                       {
-                               tuple->t_infomask |= HEAP_XMAX_INVALID;
-                               SetBufferCommitInfoNeedsSave(buffer);
-                       }
+                       /* it must have aborted or crashed */
+                       tuple->t_infomask |= HEAP_XMAX_INVALID;
+                       SetBufferCommitInfoNeedsSave(buffer);
                         return true;
                 }
  
@@ -985,13 +1005,6 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
          *
          * If the inserting transaction aborted, then the tuple was never visible
          * to any other transaction, so we can delete it immediately.
-        *
-        * NOTE: must check TransactionIdIsInProgress (which looks in PROC array)
-        * before TransactionIdDidCommit/TransactionIdDidAbort (which look in
-        * pg_clog).  Otherwise we have a race condition where we might decide
-        * that a just-committed transaction crashed, because none of the
-        * tests succeed.  xact.c is careful to record commit/abort in pg_clog
-        * before it unsets MyProc->xid in PROC array.
          */
         if (!(tuple->t_infomask & HEAP_XMIN_COMMITTED))
         {
@@ -1124,10 +1137,13 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
                                                         HeapTupleHeaderGetXmax(tuple)))
         {
                 /*
-                * inserter also deleted it, so it was never visible to anyone
-                * else
+                * Inserter also deleted it, so it was never visible to anyone
+                * else.  However, we can only remove it early if it's not an
+                * updated tuple; else its parent tuple is linking to it via t_ctid,
+                * and this tuple mustn't go away before the parent does.
                  */
-               return HEAPTUPLE_DEAD;
+               if (!(tuple->t_infomask & HEAP_UPDATED))
+                       return HEAPTUPLE_DEAD;
         }
  
         if (!TransactionIdPrecedes(HeapTupleHeaderGetXmax(tuple), OldestXmin))
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h

index cc4e31732f6f32b2642925ef004df801666b3118..1c1fdf8e16ec7852a7730bde06bace28a8eb20fe 100644 (file)
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -154,17 +154,21 @@ extern bool heap_release_fetch(Relation relation, Snapshot snapshot,
                                    HeapTuple tuple, Buffer *userbuf, bool keep_buf,
                                    PgStat_Info *pgstat_info);
  
-extern ItemPointer heap_get_latest_tid(Relation relation, Snapshot snapshot,
+extern void heap_get_latest_tid(Relation relation, Snapshot snapshot,
                                         ItemPointer tid);
  extern void setLastTid(const ItemPointer tid);
  
  extern Oid     heap_insert(Relation relation, HeapTuple tup, CommandId cid);
-extern int heap_delete(Relation relation, ItemPointer tid, ItemPointer ctid,
-                       CommandId cid, Snapshot crosscheck, bool wait);
-extern int heap_update(Relation relation, ItemPointer otid, HeapTuple tup,
-               ItemPointer ctid, CommandId cid, Snapshot crosscheck, bool wait);
-extern int heap_mark4update(Relation relation, HeapTuple tup,
-                                Buffer *userbuf, CommandId cid);
+extern int heap_delete(Relation relation, ItemPointer tid,
+                                          ItemPointer ctid, TransactionId *update_xmax,
+                                          CommandId cid, Snapshot crosscheck, bool wait);
+extern int heap_update(Relation relation, ItemPointer otid,
+                                          HeapTuple newtup,
+                                          ItemPointer ctid, TransactionId *update_xmax,
+                                          CommandId cid, Snapshot crosscheck, bool wait);
+extern int heap_mark4update(Relation relation, HeapTuple tuple,
+                                                       Buffer *buffer, ItemPointer ctid,
+                                                       TransactionId *update_xmax, CommandId cid);
  
  extern Oid     simple_heap_insert(Relation relation, HeapTuple tup);
  extern void simple_heap_delete(Relation relation, ItemPointer tid);
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h

index 592541debba9e3cf150b6343184f4c5c965f8b4d..385fc600f370311e791d89df9b1c573d85579f5b 100644 (file)
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -109,7 +109,7 @@ extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids);
  extern void ExecConstraints(ResultRelInfo *resultRelInfo,
                                 TupleTableSlot *slot, EState *estate);
  extern TupleTableSlot *EvalPlanQual(EState *estate, Index rti,
-                        ItemPointer tid);
+                                                                       ItemPointer tid, TransactionId priorXmax);
  
  /*
   * prototypes from functions in execProcnode.c
author	Tom Lane <[email protected]>
	Thu, 25 Aug 2005 19:45:06 +0000 (19:45 +0000)
committer	Tom Lane <[email protected]>
	Thu, 25 Aug 2005 19:45:06 +0000 (19:45 +0000)
src/backend/access/heap/heapam.c		patch | blob | blame | history
src/backend/commands/async.c		patch | blob | blame | history
src/backend/commands/trigger.c		patch | blob | blame | history
src/backend/commands/vacuum.c		patch | blob | blame | history
src/backend/executor/execMain.c		patch | blob | blame | history
src/backend/utils/time/tqual.c		patch | blob | blame | history
src/include/access/heapam.h		patch | blob | blame | history
src/include/executor/executor.h		patch | blob | blame | history