From: Heikki Linnakangas Date: Tue, 23 Dec 2025 11:37:16 +0000 (+0200) Subject: Fix bug in following update chain when locking a heap tuple X-Git-Url: http://git.postgresql.org/gitweb/?a=commitdiff_plain;h=refs%2Fremotes%2Fgithub%2Fmaster;p=postgresql.git Fix bug in following update chain when locking a heap tuple After waiting for a concurrent updater to finish, heap_lock_tuple() followed the update chain to lock all tuple versions. However, when stepping from the initial tuple to the next one, it failed to check that the next tuple's XMIN matches the initial tuple's XMAX. That's an important check whenever following an update chain, and the recursive part that follows the chain did it, but the initial step missed it. Without the check, if the updating transaction aborts, the updated tuple is vacuumed away and replaced by an unrelated tuple, the unrelated tuple might get incorrectly locked. Author: Jasper Smit Discussion: https://www.postgresql.org/message-id/CAOG+RQ74x0q=kgBBQ=mezuvOeZBfSxM1qu_o0V28bwDz3dHxLw@mail.gmail.com Backpatch-through: 14 --- diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 6daf4a87dec..98759e3bf70 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -85,8 +85,11 @@ static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, LockTupleMode mode, bool is_update, TransactionId *result_xmax, uint16 *result_infomask, uint16 *result_infomask2); -static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple, - const ItemPointerData *ctid, TransactionId xid, +static TM_Result heap_lock_updated_tuple(Relation rel, + uint16 prior_infomask, + TransactionId prior_raw_xmax, + const ItemPointerData *prior_ctid, + TransactionId xid, LockTupleMode mode); static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, uint16 
*new_infomask2); @@ -4816,11 +4819,13 @@ l3: * If there are updates, follow the update chain; bail out if * that cannot be done. */ - if (follow_updates && updated) + if (follow_updates && updated && + !ItemPointerEquals(&tuple->t_self, &t_ctid)) { TM_Result res; - res = heap_lock_updated_tuple(relation, tuple, &t_ctid, + res = heap_lock_updated_tuple(relation, + infomask, xwait, &t_ctid, GetCurrentTransactionId(), mode); if (res != TM_Ok) @@ -5063,11 +5068,13 @@ l3: } /* if there are updates, follow the update chain */ - if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask)) + if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask) && + !ItemPointerEquals(&tuple->t_self, &t_ctid)) { TM_Result res; - res = heap_lock_updated_tuple(relation, tuple, &t_ctid, + res = heap_lock_updated_tuple(relation, + infomask, xwait, &t_ctid, GetCurrentTransactionId(), mode); if (res != TM_Ok) @@ -5721,7 +5728,8 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, * version as well. */ static TM_Result -heap_lock_updated_tuple_rec(Relation rel, const ItemPointerData *tid, TransactionId xid, +heap_lock_updated_tuple_rec(Relation rel, TransactionId priorXmax, + const ItemPointerData *tid, TransactionId xid, LockTupleMode mode) { TM_Result result; @@ -5734,7 +5742,6 @@ heap_lock_updated_tuple_rec(Relation rel, const ItemPointerData *tid, Transactio old_infomask2; TransactionId xmax, new_xmax; - TransactionId priorXmax = InvalidTransactionId; bool cleared_all_frozen = false; bool pinned_desired_page; Buffer vmbuffer = InvalidBuffer; @@ -6048,7 +6055,10 @@ out_unlocked: * Follow update chain when locking an updated tuple, acquiring locks (row * marks) on the updated versions. * - * The initial tuple is assumed to be already locked. + * 'prior_infomask', 'prior_raw_xmax' and 'prior_ctid' are the corresponding + * fields from the initial tuple. We will lock the tuples starting from the + * one that 'prior_ctid' points to. 
Note: This function does not lock the + * initial tuple itself. * * This function doesn't check visibility, it just unconditionally marks the * tuple(s) as locked. If any tuple in the updated chain is being deleted @@ -6066,16 +6076,22 @@ out_unlocked: * levels, because that would lead to a serializability failure. */ static TM_Result -heap_lock_updated_tuple(Relation rel, HeapTuple tuple, const ItemPointerData *ctid, +heap_lock_updated_tuple(Relation rel, + uint16 prior_infomask, + TransactionId prior_raw_xmax, + const ItemPointerData *prior_ctid, TransactionId xid, LockTupleMode mode) { + INJECTION_POINT("heap_lock_updated_tuple", NULL); + /* - * If the tuple has not been updated, or has moved into another partition - * (effectively a delete) stop here. + * If the tuple has moved into another partition (effectively a delete) + * stop here. */ - if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) && - !ItemPointerEquals(&tuple->t_self, ctid)) + if (!ItemPointerIndicatesMovedPartitions(prior_ctid)) { + TransactionId prior_xmax; + /* * If this is the first possibly-multixact-able operation in the * current transaction, set my per-backend OldestMemberMXactId @@ -6087,7 +6103,9 @@ heap_lock_updated_tuple(Relation rel, HeapTuple tuple, const ItemPointerData *ct */ MultiXactIdSetOldestMember(); - return heap_lock_updated_tuple_rec(rel, ctid, xid, mode); + prior_xmax = (prior_infomask & HEAP_XMAX_IS_MULTI) ? 
+ MultiXactIdGetUpdateXid(prior_raw_xmax, prior_infomask) : prior_raw_xmax; + return heap_lock_updated_tuple_rec(rel, prior_xmax, prior_ctid, xid, mode); } /* nothing to lock */ diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile index bfdb3f53377..cc9be6dcdd2 100644 --- a/src/test/modules/injection_points/Makefile +++ b/src/test/modules/injection_points/Makefile @@ -14,7 +14,8 @@ REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress ISOLATION = basic \ inplace \ - syscache-update-pruned + syscache-update-pruned \ + heap_lock_update # Temporarily disabled because of flakiness #ISOLATION =+ diff --git a/src/test/modules/injection_points/expected/heap_lock_update.out b/src/test/modules/injection_points/expected/heap_lock_update.out new file mode 100644 index 00000000000..1ec8d876414 --- /dev/null +++ b/src/test/modules/injection_points/expected/heap_lock_update.out @@ -0,0 +1,83 @@ +Parsed test spec with 2 sessions + +starting permutation: s1begin s1update s2lock s1abort vacuum reinsert wake +step s1begin: BEGIN; +step s1update: UPDATE t SET id = 10000 WHERE id = 1 RETURNING ctid; +ctid +----- +(1,2) +(1 row) + +step s2lock: select * from t where id = 1 for update; +step s1abort: ABORT; +step vacuum: VACUUM t; +step reinsert: + INSERT INTO t VALUES (10001) RETURNING ctid; + UPDATE t SET id = 10002 WHERE id = 10001 RETURNING ctid; + +ctid +----- +(1,2) +(1 row) + +ctid +----- +(1,3) +(1 row) + +step wake: + SELECT FROM injection_points_detach('heap_lock_updated_tuple'); + SELECT FROM injection_points_wakeup('heap_lock_updated_tuple'); + +step s2lock: <... completed> +id +-- + 1 +(1 row) + +step wake: <... 
completed> + +starting permutation: s1begin s1update s2lock s1abort vacuum reinsert_and_lock wake +step s1begin: BEGIN; +step s1update: UPDATE t SET id = 10000 WHERE id = 1 RETURNING ctid; +ctid +----- +(1,2) +(1 row) + +step s2lock: select * from t where id = 1 for update; +step s1abort: ABORT; +step vacuum: VACUUM t; +step reinsert_and_lock: + BEGIN; + INSERT INTO t VALUES (10001) RETURNING ctid; + SELECT ctid, * FROM t WHERE id = 1 FOR UPDATE; + COMMIT; + UPDATE t SET id = 10002 WHERE id = 10001 returning ctid; + +ctid +----- +(1,2) +(1 row) + +ctid |id +-----+-- +(0,1)| 1 +(1 row) + +ctid +----- +(1,3) +(1 row) + +step wake: + SELECT FROM injection_points_detach('heap_lock_updated_tuple'); + SELECT FROM injection_points_wakeup('heap_lock_updated_tuple'); + +step s2lock: <... completed> +id +-- + 1 +(1 row) + +step wake: <... completed> diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build index 493e11053dc..c9186ae04d1 100644 --- a/src/test/modules/injection_points/meson.build +++ b/src/test/modules/injection_points/meson.build @@ -46,6 +46,7 @@ tests += { 'basic', 'inplace', 'syscache-update-pruned', + 'heap_lock_update', # temporarily disabled because of flakiness # 'index-concurrently-upsert', # 'index-concurrently-upsert-predicate', diff --git a/src/test/modules/injection_points/specs/heap_lock_update.spec b/src/test/modules/injection_points/specs/heap_lock_update.spec new file mode 100644 index 00000000000..b3992a1eb7a --- /dev/null +++ b/src/test/modules/injection_points/specs/heap_lock_update.spec @@ -0,0 +1,117 @@ +# Test race condition in tuple locking +# +# This is a reproducer for the bug reported at: +# https://www.postgresql.org/message-id/CAOG%2BRQ74x0q%3DkgBBQ%3DmezuvOeZBfSxM1qu_o0V28bwDz3dHxLw%40mail.gmail.com +# +# The bug was that when following an update chain when locking tuples, +# we sometimes failed to check that the xmin on the next tuple 
matched +# the prior's xmax. If the updated tuple version was vacuumed away and +# the slot was reused for an unrelated tuple, we'd incorrectly follow +# and lock the unrelated tuple. + + +# Set up a test table with enough rows to fill a page. We need the +# UPDATE used in the test to put the new tuple on a different page, +# because otherwise the VACUUM cannot remove the aborted tuple because +# we hold a pin on the first page. +# +# The exact number of rows inserted doesn't otherwise matter, but we +# arrange things in a deterministic fashion so that the last inserted +# tuple goes to (1,1), and the updated and aborted tuple goes to +# (1,2). That way we can just memorize those ctids in the expected +# output, to verify that the test exercises the scenario we want. +setup +{ + CREATE EXTENSION injection_points; + + CREATE TABLE t (id int PRIMARY KEY); + do $$ + DECLARE + i int; + tid tid; + BEGIN + FOR i IN 1..5000 LOOP + INSERT INTO t VALUES (i) RETURNING ctid INTO tid; + IF tid = '(1,1)' THEN + RETURN; + END IF; + END LOOP; + RAISE 'expected to insert tuple to (1,1)'; + END; + $$; +} +teardown +{ + DROP TABLE t; + DROP EXTENSION injection_points; +} + +session s1 +step s1begin { BEGIN; } +step s1update { UPDATE t SET id = 10000 WHERE id = 1 RETURNING ctid; } +step s1abort { ABORT; } +step vacuum { VACUUM t; } + +# Insert a new tuple, and update it. +step reinsert { + INSERT INTO t VALUES (10001) RETURNING ctid; + UPDATE t SET id = 10002 WHERE id = 10001 RETURNING ctid; +} + +# Same as the 'reinsert' step, but for extra confusion, we also stamp +# the original tuple with the same 'xmax' as the re-inserted one. 
+step reinsert_and_lock { + BEGIN; + INSERT INTO t VALUES (10001) RETURNING ctid; + SELECT ctid, * FROM t WHERE id = 1 FOR UPDATE; + COMMIT; + UPDATE t SET id = 10002 WHERE id = 10001 returning ctid; +} + +step wake { + SELECT FROM injection_points_detach('heap_lock_updated_tuple'); + SELECT FROM injection_points_wakeup('heap_lock_updated_tuple'); +} + +session s2 +setup { + SELECT FROM injection_points_set_local(); + SELECT FROM injection_points_attach('heap_lock_updated_tuple', 'wait'); +} +step s2lock { select * from t where id = 1 for update; } + +permutation + # Begin transaction, update a row. Because of how we set up the + # test table, the updated tuple lands at (1,2) + s1begin + s1update + + # While the updating transaction is open, start a new session that + # tries to lock the row. This blocks on the open transaction. + s2lock + + # Abort the updating transaction. This unblocks session 2, but it + # will immediately hit the injection point and block on that. + s1abort + # Vacuum away the updated, aborted tuple. + vacuum + + # Insert a new tuple. It lands at the same location where the + # updated tuple was. + reinsert + + # Let the locking transaction continue. It should lock the + # original tuple, ignoring the re-inserted tuple. + wake(s2lock) + +# Variant where the re-inserted tuple is also locked by the inserting +# transaction. This failed an earlier version of the fix during +# development. +permutation + s1begin + s1update + s2lock + s1abort + vacuum + reinsert_and_lock + wake(s2lock)