From: Heikki Linnakangas Date: Wed, 3 Dec 2025 17:15:08 +0000 (+0200) Subject: Set next multixid's offset when creating a new multixid X-Git-Url: http://git.postgresql.org/gitweb/static/gitweb.js?a=commitdiff_plain;h=8cfb174a61ebc9316f8d5c4e8c9ad941acdb3849;p=postgresql.git Set next multixid's offset when creating a new multixid With this commit, the next multixid's offset will always be set on the offsets page, by the time that a backend might try to read it, so we no longer need the waiting mechanism with the condition variable. In other words, this eliminates "corner case 2" mentioned in the comments. The waiting mechanism was broken in a few scenarios: - When nextMulti was advanced without WAL-logging the next multixid. For example, if a later multixid was already assigned and WAL-logged before the previous one was WAL-logged, and then the server crashed. In that case the next offset would never be set in the offsets SLRU, and a query trying to read it would get stuck waiting for it. Same thing could happen if pg_resetwal was used to forcibly advance nextMulti. - In hot standby mode, a deadlock could happen where one backend waits for the next multixid assignment record, but WAL replay is not advancing because of a recovery conflict with the waiting backend. The old TAP test used carefully placed injection points to exercise the old waiting code, but now that the waiting code is gone, much of the old test is no longer relevant. Rewrite the test to reproduce the IPC/MultixactCreation hang after crash recovery instead, and to verify that previously recorded multixids stay readable. Backpatch to all supported versions. In back-branches, we still need to be able to read WAL that was generated before this fix, so in the back-branches this includes a hack to initialize the next offsets page when replaying XLOG_MULTIXACT_CREATE_ID for the last multixid on a page. 
On 'master', bump XLOG_PAGE_MAGIC instead to indicate that the WAL is not compatible. Author: Andrey Borodin Reviewed-by: Dmitry Yurichev Reviewed-by: Álvaro Herrera Reviewed-by: Kirill Reshke Reviewed-by: Ivan Bykov Reviewed-by: Chao Li Discussion: https://www.postgresql.org/message-id/172e5723-d65f-4eec-b512-14beacb326ce@yandex.ru Backpatch-through: 14 --- diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 136065125ea..1caba9a140d 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -337,6 +337,9 @@ static MemoryContext MXactContext = NULL; #define debug_elog6(a,b,c,d,e,f) #endif +/* hack to deal with WAL generated with older minor versions */ +static int pre_initialized_offsets_page = -1; + /* internal MultiXactId management */ static void MultiXactIdSetOldestVisible(void); static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, @@ -868,13 +871,61 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int entryno; int slotno; MultiXactOffset *offptr; - int i; + MultiXactId next; + int next_pageno; + int next_entryno; + MultiXactOffset *next_offptr; LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + /* position of this multixid in the offsets SLRU area */ pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); + /* position of the next multixid */ + next = multi + 1; + if (next < FirstMultiXactId) + next = FirstMultiXactId; + next_pageno = MultiXactIdToOffsetPage(next); + next_entryno = MultiXactIdToOffsetEntry(next); + + /* + * Older minor versions didn't set the next multixid's offset in this + * function, and therefore didn't initialize the next page until the next + * multixid was assigned. If we're replaying WAL that was generated by + * such a version, the next page might not be initialized yet. Initialize + * it now. 
+ */ + if (InRecovery && + next_pageno != pageno && + MultiXactOffsetCtl->shared->latest_page_number == pageno) + { + elog(DEBUG1, "next offsets page is not initialized, initializing it now"); + + /* Create and zero the page */ + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno); + + /* Make sure it's written out */ + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); + + /* + * Remember that we initialized the page, so that we don't zero it + * again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record. + */ + pre_initialized_offsets_page = next_pageno; + } + + /* + * Set the starting offset of this multixid's members. + * + * In the common case, it was already set by the previous + * RecordNewMultiXact call, as this was the next multixid of the previous + * multixid. But if multiple backends are generating multixids + * concurrently, we might race ahead and get called before the previous + * multixid. + */ + /* * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction" * to complain about if there's any I/O error. This is kinda bogus, but @@ -886,9 +937,37 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; - *offptr = offset; + if (*offptr != offset) + { + /* should already be set to the correct value, or not at all */ + Assert(*offptr == 0); + *offptr = offset; + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + } - MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + /* + * Set the next multixid's offset to the end of this multixid's members. 
+ */ + if (next_pageno == pageno) + { + next_offptr = offptr + 1; + } + else + { + /* must be the first entry on the page */ + Assert(next_entryno == 0 || next == FirstMultiXactId); + slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next); + next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + next_offptr += next_entryno; + } + + if (*next_offptr != offset + nmembers) + { + /* should already be set to the correct value, or not at all */ + Assert(*next_offptr == 0); + *next_offptr = offset + nmembers; + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + } /* Exchange our lock */ LWLockRelease(MultiXactOffsetSLRULock); @@ -897,7 +976,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, prev_pageno = -1; - for (i = 0; i < nmembers; i++, offset++) + for (int i = 0; i < nmembers; i++, offset++) { TransactionId *memberptr; uint32 *flagsptr; @@ -1072,8 +1151,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) result = FirstMultiXactId; } - /* Make sure there is room for the MXID in the file. */ - ExtendMultiXactOffset(result); + /* + * Make sure there is room for the next MXID in the file. Assigning this + * MXID sets the next MXID's offset already. + */ + ExtendMultiXactOffset(result + 1); /* * Reserve the members space, similarly to above. Also, be careful not to @@ -1314,21 +1396,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * one's. However, there are some corner cases to worry about: * * 1. This multixact may be the latest one created, in which case there is - * no next one to look at. In this case the nextOffset value we just - * saved is the correct endpoint. - * - * 2. The next multixact may still be in process of being filled in: that - * is, another process may have done GetNewMultiXactId but not yet written - * the offset entry for that ID. 
In that scenario, it is guaranteed that - * the offset entry for that multixact exists (because GetNewMultiXactId - * won't release MultiXactGenLock until it does) but contains zero - * (because we are careful to pre-zero offset pages). Because - * GetNewMultiXactId will never return zero as the starting offset for a - * multixact, when we read zero as the next multixact's offset, we know we - * have this case. We sleep for a bit and try again. + * no next one to look at. The next multixact's offset should be set + * already, as we set it in RecordNewMultiXact(), but we used to not do + * that in older minor versions. To cope with that case, if this + * multixact is the latest one created, use the nextOffset value we read + * above as the endpoint. * - * 3. Because GetNewMultiXactId increments offset zero to offset one to - * handle case #2, there is an ambiguity near the point of offset + * 2. Because GetNewMultiXactId skips over offset zero, to reserve zero + * to mean "unset", there is an ambiguity near the point of offset * wraparound. If we see next multixact's offset is one, is that our * multixact's actual endpoint, or did it end at zero with a subsequent * increment? We handle this using the knowledge that if the zero'th @@ -1340,7 +1415,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * cases, so it seems better than holding the MultiXactGenLock for a long * time on every multixact creation. 
*/ -retry: LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); pageno = MultiXactIdToOffsetPage(multi); @@ -1385,13 +1459,10 @@ retry: nextMXOffset = *offptr; if (nextMXOffset == 0) - { - /* Corner case 2: next multixact is still being filled in */ - LWLockRelease(MultiXactOffsetSLRULock); - CHECK_FOR_INTERRUPTS(); - pg_usleep(1000L); - goto retry; - } + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %u has invalid next offset", + multi))); length = nextMXOffset - offset; } @@ -1427,7 +1498,7 @@ retry: if (!TransactionIdIsValid(*xactptr)) { - /* Corner case 3: we must be looking at unused slot zero */ + /* Corner case 2: we must be looking at unused slot zero */ Assert(offset == 0); continue; } @@ -2056,24 +2127,32 @@ TrimMultiXact(void) MultiXactOffsetCtl->shared->latest_page_number = pageno; /* - * Zero out the remainder of the current offsets page. See notes in - * TrimCLOG() for background. Unlike CLOG, some WAL record covers every - * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL - * rule "write xlog before data," nextMXact successors may carry obsolete, - * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers() - * operates normally. + * Set the offset of nextMXact on the offsets page. This is normally done + * in RecordNewMultiXact() of the previous multixact, but we used to not + * do that in older minor versions. To ensure that the next offset is set + * if the binary was just upgraded from an older minor version, do it now. + * + * Zero out the remainder of the page. See notes in TrimCLOG() for + * background. Unlike CLOG, some WAL record covers every pg_multixact + * SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write + * xlog before data," nextMXact successors may carry obsolete, nonzero + * offset values. 
*/ entryno = MultiXactIdToOffsetEntry(nextMXact); - if (entryno != 0) { int slotno; MultiXactOffset *offptr; - slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); + if (entryno == 0) + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + else + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; - MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); + *offptr = offset; + if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ) + MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset)); MultiXactOffsetCtl->shared->page_dirty[slotno] = true; } @@ -3255,13 +3334,21 @@ multixact_redo(XLogReaderState *record) memcpy(&pageno, XLogRecGetData(record), sizeof(int)); - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); - - slotno = ZeroMultiXactOffsetPage(pageno, false); - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - - LWLockRelease(MultiXactOffsetSLRULock); + /* + * Skip the record if we already initialized the page at the previous + * XLOG_MULTIXACT_CREATE_ID record. See RecordNewMultiXact(). 
+ */ + if (pre_initialized_offsets_page != pageno) + { + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + slotno = ZeroMultiXactOffsetPage(pageno, false); + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); + LWLockRelease(MultiXactOffsetSLRULock); + } + else + elog(DEBUG1, "skipping initialization of offsets page %d because it was already initialized on multixid creation", pageno); + pre_initialized_offsets_page = -1; } else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) { @@ -3285,6 +3372,22 @@ multixact_redo(XLogReaderState *record) TransactionId max_xid; int i; + if (pre_initialized_offsets_page != -1) + { + /* + * If we implicitly initialized the next offsets page while + * replaying an XLOG_MULTIXACT_CREATE_ID record that was generated + * with an older minor version, we still expect to see an + * XLOG_MULTIXACT_ZERO_OFF_PAGE record for it before any other + * XLOG_MULTIXACT_CREATE_ID records. Therefore this case should + * not happen. If it does, we'll continue with the replay, but + * log a message to note that something's funny. + */ + elog(LOG, "expected to see an XLOG_MULTIXACT_ZERO_OFF_PAGE record for page %d that was implicitly initialized earlier", + pre_initialized_offsets_page); + pre_initialized_offsets_page = -1; + } + /* Store the data back into the SLRU files */ RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers, xlrec->members);