From 28b7382958272e399f5f611bf0d399e6778c9a59 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 7 Apr 2014 14:51:45 +0300 Subject: [PATCH] Write buffer image before and after every WAL record. --- contrib/page_image_logging/Makefile | 20 + contrib/page_image_logging/README | 50 ++ .../page_image_logging/postprocess-images.c | 624 ++++++++++++++++++ src/backend/access/transam/xloginsert.c | 6 + src/backend/storage/buffer/bufmgr.c | 210 +++++- src/backend/storage/lmgr/lwlock.c | 22 + src/include/miscadmin.h | 10 + src/include/storage/lwlock.h | 1 + 8 files changed, 942 insertions(+), 1 deletion(-) create mode 100644 contrib/page_image_logging/Makefile create mode 100644 contrib/page_image_logging/README create mode 100644 contrib/page_image_logging/postprocess-images.c diff --git a/contrib/page_image_logging/Makefile b/contrib/page_image_logging/Makefile new file mode 100644 index 0000000000..9c68bbcc32 --- /dev/null +++ b/contrib/page_image_logging/Makefile @@ -0,0 +1,20 @@ +# contrib/page_image_logging/Makefile + +PGFILEDESC = "postprocess-images - " + +PROGRAM = postprocess-images +OBJS = postprocess-images.o + +PG_CPPFLAGS = -I$(libpq_srcdir) +PG_LIBS = $(libpq_pgport) + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/postprocess-images +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/page_image_logging/README b/contrib/page_image_logging/README new file mode 100644 index 0000000000..e2f8225aa1 --- /dev/null +++ b/contrib/page_image_logging/README @@ -0,0 +1,50 @@ +Usage +----- + +1. Apply the patch + +2. Set up a master and standby. + +3. stop master, then standby. + +4. Remove $PGDATA/buffer-images from both servers. + +5. Start master and standby + +6. Run "make installcheck", or whatever you want to test + +7. Stop master, then standby + +8. compare the logged page images using the postprocessing tool: + +./postprocess-images ~/data-master/buffer-images ~/data-standby/buffer-images > differences + +9. The 'differences' file should be empty. If not, investigate. + +Tips +---- + +The page images take up a lot of disk space! The PostgreSQL regression +suite generates about 11GB - double that when the same is generated also +in a standby. + +Always stop the master first, then standby. Otherwise, when you restart +the standby, it will start WAL replay from the previous checkpoint, and +log some page images already. Stopping the master creates a checkpoint +record, avoiding the problem. + +If you get errors like this from postprocess-images: + + could not reorder line XXX + +It can be caused by an all-zeros page being logged with XLOG HEAP_NEWPAGE +records. Look at the line in the buffer-image file, see if it's all-zeros. +This can happen e.g when you change the tablespace of a table. See +log_newpage() in xlog.c. + +You can use pg_xlogdump to see which WAL record a page image corresponds +to. But beware that the LSN in the page image points to the *end* of the +WAL record, while the LSN that pg_xlogdump prints is the *beginning* of +the WAL record. So to find which WAL record a page image corresponds to, +find the LSN from the page image in pg_xlogdump output, and back off one +record. (you can't just grep for the line containing the LSN). diff --git a/contrib/page_image_logging/postprocess-images.c b/contrib/page_image_logging/postprocess-images.c new file mode 100644 index 0000000000..f440cfd539 --- /dev/null +++ b/contrib/page_image_logging/postprocess-images.c @@ -0,0 +1,624 @@ +#include "postgres_fe.h" + +typedef uintptr_t Datum; +typedef struct MemoryContextData *MemoryContext; + +#include "access/gin_private.h" +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "storage/bufpage.h" + +#define LINESZ (BLCKSZ*2 + 100) + +/* ---------------------------------------------------------------- + * Masking functions. + * + * Most pages cannot be compared directly, because some parts of the + * page are not expected to be byte-by-byte identical. For example, + * hint bits. Our strategy is to normalize all pages by creating a + * mask of those bits that are not expected to match. + */ + +/* + * Build a mask that covers unused space between pd_lower and pd_upper. + */ +static void +mask_unused_space(char *page, uint8 *mask) +{ + int pd_lower = ((PageHeader) page)->pd_lower; + int pd_upper = ((PageHeader) page)->pd_upper; + int pd_special = ((PageHeader) page)->pd_upper; + + if (pd_lower > pd_upper || pd_special < pd_upper || + pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ) + { + fprintf(stderr, "invalid page at %X/%08X\n", + ((PageHeader) page)->pd_lsn.xlogid, + ((PageHeader) page)->pd_lsn.xrecoff); + } + + memset(mask + pd_lower, 0xFF, pd_upper - pd_lower); +} + +static void +build_heap_mask(char *page, uint8 *mask) +{ + OffsetNumber off; + PageHeader mask_phdr = (PageHeader) mask; + + mask_unused_space(page, mask); + + /* Ignore prune_xid (it's like a hint-bit) */ + mask_phdr->pd_prune_xid = 0xFFFFFFFF; + + /* Ignore PD_PAGE_FULL and PD_HAS_FREE_LINES flags, they are just hints */ + mask_phdr->pd_flags |= PD_PAGE_FULL | PD_HAS_FREE_LINES; + + /* + * Also mask the all-visible flag. + * + * XXX: It is unfortunate that we have to do this. If the flag is set + * incorrectly, that's serious, and we would like to catch it. If the flag + * is cleared incorrectly, that's serious too. But redo of HEAP_CLEAN + * records don't currently set the flag, even though it is set in the + * master, so we must silence failures that that causes. + */ + mask_phdr->pd_flags |= PD_ALL_VISIBLE; + + for (off = 1; off <= PageGetMaxOffsetNumber(page); off++) + { + ItemId iid = PageGetItemId(page, off); + Item item = PageGetItem(page, iid); + char *mask_item; + + mask_item = (char *) (mask + ItemIdGetOffset(iid)); + + if (ItemIdIsNormal(iid)) + { + HeapTupleHeader mask_htup = (HeapTupleHeader) mask_item; + + /* + * Ignore hint bits and command ID. + */ + mask_htup->t_infomask = + HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | + HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID; + mask_htup->t_infomask |= HEAP_COMBOCID; + mask_htup->t_choice.t_heap.t_field3.t_cid = 0xFFFFFFFF; + + /* + * Ignore speculative insertion token on speculatively + * inserted tuples. + */ + + if (HeapTupleHeaderIsSpeculative((HeapTupleHeader) item)) + { + mask_htup->t_ctid.ip_blkid.bi_hi = 0xFFFF; + mask_htup->t_ctid.ip_blkid.bi_lo = 0xFFFF; + } + } + + /* + * Ignore any padding bytes after the tuple, when the length of + * the item is not MAXALIGNed. + */ + if (ItemIdHasStorage(iid)) + { + int len = ItemIdGetLength(iid); + int padlen = MAXALIGN(len) - len; + + if (padlen > 0) + memset(mask_item + len, 0xFF, padlen); + } + } +} + +static void +build_spgist_mask(char *page, uint8 *mask) +{ + mask_unused_space(page, mask); +} + +static void +build_gist_mask(char *page, uint8 *mask) +{ + mask_unused_space(page, mask); +} + +static void +build_gin_mask(BlockNumber blkno, char *page, uint8 *mask) +{ + /* GIN metapage doesn't use pd_lower/pd_upper. Other page types do. */ + if (blkno != 0) + { + /* + * Mask everything on a DELETED page. + */ + if (GinPageIsDeleted(page)) + { + /* page content, between standard page header and opaque struct */ + memset(mask + SizeOfPageHeaderData, 0xFF, BLCKSZ - MAXALIGN(sizeof(GinPageOpaqueData))); + + /* pd_lower and upper */ + memset(&((PageHeader) mask)->pd_lower, 0xFF, sizeof(uint16)); + memset(&((PageHeader) mask)->pd_upper, 0xFF, sizeof(uint16)); + } + else + mask_unused_space(page, mask); + } +} + +static void +build_brin_mask(char *page, uint8 *mask) +{ + OffsetNumber off; + PageHeader mask_phdr = (PageHeader) mask; + + mask_unused_space(page, mask); + + /* Ignore PD_HAS_FREE_LINES flag, it is just a hint */ + mask_phdr->pd_flags |= PD_HAS_FREE_LINES; +} + +static void +build_sequence_mask(char *page, uint8 *mask) +{ + /* + * FIXME: currently, we just ignore sequence records altogether. nextval + * records a different value in the WAL record than it writes to the + * buffer. Ideally we would only mask out the value in the tuple. + */ + memset(mask, 0xFF, BLCKSZ); +} + +static void +build_btree_mask(char *page, uint8 *mask) +{ + OffsetNumber off; + OffsetNumber maxoff; + + mask_unused_space(page, mask); + + /* + * Mask everything on a DELETED page. + */ + if (((BTPageOpaque) PageGetSpecialPointer(page))->btpo_flags & BTP_DELETED) + { + /* page content, between standard page header and opaque struct */ + memset(mask + SizeOfPageHeaderData, 0xFF, BLCKSZ - MAXALIGN(sizeof(BTPageOpaqueData))); + + /* pd_lower and upper */ + memset(&((PageHeader) mask)->pd_lower, 0xFF, sizeof(uint16)); + memset(&((PageHeader) mask)->pd_upper, 0xFF, sizeof(uint16)); + } + else + { + /* Mask DEAD line pointer bits */ + maxoff = PageGetMaxOffsetNumber(page); + for (off = 1; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + ItemIdData m; + + if (ItemIdIsDead(iid)) + { + memset(&m, 0, sizeof(ItemIdData)); + m.lp_flags = 2; + + memcpy((char *) mask + (((char *) iid) - page), &m, sizeof(ItemIdData)); + } + } + } + + /* Mask BTP_HAS_GARBAGE flag and cycle-ID in the opaque struct */ + { + BTPageOpaque maskopaq = (BTPageOpaque) (((char *) mask) + ((PageHeader) page)->pd_special); + + maskopaq->btpo_flags |= BTP_HAS_GARBAGE | BTP_SPLIT_END; + + maskopaq->btpo_cycleid = 0xFFFF; + } +} + +static inline unsigned char +parsehex(char digit, bool *success) +{ + if (digit >= '0' && digit <= '9') + { + *success = true; + return (unsigned char) (digit - '0'); + } + if (digit >= 'a' && digit <= 'f') + { + *success = true; + return (unsigned char) (digit - 'a' + 10); + } + if (digit >= 'A' && digit <= 'F') + { + *success = true; + return (unsigned char) (digit - 'A' + 10); + } + *success = false; + return 0; +} + +static inline void +tohex(uint8 byte, char *out) +{ + const char *digits = "0123456789ABCDEF"; + + out[0] = digits[byte >> 4]; + out[1] = digits[byte & 0x0F]; +} + +/* + * Mask any known changing parts, like hint bits, from the line. The line + * is modified in place. Full nibbles to be ignored are set to 'X' in the + * hex output, and individiual bits are set to 0. + */ +static void +maskline(char *line) +{ + char page[BLCKSZ]; + uint8 mask[BLCKSZ]; + int i; + uint16 tail; + char *pstart; + char *p; + BlockNumber blkno; + + /* Parse the line */ + p = strstr(line, " blk: "); + if (p == NULL) + return; + + sscanf(p, " blk: %u", &blkno); + + pstart = strstr(line, "after: "); + if (pstart == NULL) + return; + pstart += strlen("after: "); + + /* Decode the hex-encoded page back to raw bytes */ + p = pstart; + for (i = 0; i < BLCKSZ; i++) + { + bool success; + unsigned char c; + + c = parsehex(*(p++), &success) << 4; + if (!success) + return; + c |= parsehex(*(p++), &success); + if (!success) + return; + + page[i] = (char) c; + } + + /* + * Ok, we now have the original block contents in 'page'. Look at the + * size of the special area, and the last two bytes in it, to detect + * what kind of a page it is. Call the appropriate masking function. + */ + + /* begin with an empty mask */ + memset(mask, 0, BLCKSZ); + + memcpy(&tail, &page[BLCKSZ - 2], 2); + + /* Try to detect what kind of a page it is */ + if (PageGetSpecialSize(page) == 0) + { + build_heap_mask(page, mask); + } + else if (PageGetSpecialSize(page) == 16) + { + if (tail == 0xFF81) + build_gist_mask(page, mask); + else if (tail <= 0xFF7F) + build_btree_mask(page, mask); + } + else if (PageGetSpecialSize(page) == 8) + { + if (tail == 0xFF82) + build_spgist_mask(page, mask); + else if (tail == 0xF093) + build_brin_mask(page, mask); + else if (*((uint32 *) (page + BLCKSZ - MAXALIGN(sizeof(uint32)))) == 0x1717) + build_sequence_mask(page, mask); + else + build_gin_mask(blkno, page, mask); + } + + /* Apply the mask, replacing masked nibbles with # */ + for (i = 0; i < BLCKSZ; i++) + { + uint8 c; + + if (mask[i] == 0) + continue; + + c = ((uint8) page[i]) & ~mask[i]; + + tohex(c, &pstart[2 * i]); + + if ((mask[i] & 0xF0) == 0xF0) + pstart[2 * i] = '#'; + if ((mask[i] & 0x0F) == 0x0F) + pstart[2 * i + 1] = '#'; + } +} + + + +/* ---------------------------------------------------------------- + * Line reordering + * + * When the page images are logged in master and standby, they are + * not necessarily written out in the same order. For example, if a + * single WAL-logged operation modifies multiple pages, like an index + * page split, the standby might release the locks in different order + * than the master. Another cause is concurrent operations; writing + * the page images is not atomic with WAL insertion, so if two + * backends are running concurrently, their modifications in the + * image log can be interleaved in different order than their WAL + * records. + * + * To fix that, the lines are read into a reorder buffer, and sorted + * there. Sorting the whole file would be overkill, as the lines are + * mostly in order already. The fixed-size reorder buffer works as + * long as the lines are not out-of-order by more than + * REORDER_BUFFER_SIZE lines. + */ + +#define REORDER_BUFFER_SIZE 1000 + +typedef struct +{ + char *lines[REORDER_BUFFER_SIZE]; + int nlines; /* number of lines currently in buffer */ + + FILE *fp; + int lineno; /* current input line number (for debugging) */ + bool eof; /* have we reached EOF from this source? */ +} reorder_buffer; + +/* + * Read lines from the file into the reorder buffer, until the buffer is full. + */ +static void +fill_reorder_buffer(reorder_buffer *buf) +{ + if (buf->eof) + return; + + while (buf->nlines < REORDER_BUFFER_SIZE) + { + char *linebuf = pg_malloc(LINESZ); + + if (fgets(linebuf, LINESZ, buf->fp) == NULL) + { + buf->eof = true; + pg_free(linebuf); + break; + } + buf->lineno++; + + /* common case: the new line goes to the end */ + if (buf->nlines == 0 || + strcmp(linebuf, buf->lines[buf->nlines - 1]) >= 0) + { + buf->lines[buf->nlines] = linebuf; + } + else + { + /* find the right place in the queue */ + int i; + + for (i = buf->nlines - 2; i >= 0; i--) + { + if (strcmp(linebuf, buf->lines[i]) >= 0) + break; + } + if (i < 0) + { + fprintf(stderr, "could not reorder line %d\n", buf->lineno); + pg_free(linebuf); + continue; + } + i++; + memmove(&buf->lines[i + 1], &buf->lines[i], + (buf->nlines - i) * sizeof(char *)); + buf->lines[i] = linebuf; + } + buf->nlines++; + } +} + +static reorder_buffer * +init_reorder_buffer(FILE *fp) +{ + reorder_buffer *buf; + + buf = pg_malloc(sizeof(reorder_buffer)); + buf->fp = fp; + buf->eof = false; + buf->lineno = 0; + buf->nlines = 0; + + fill_reorder_buffer(buf); + + return buf; +} + +/* + * Read all the lines that belong to the next WAL record from the reorder + * buffer. + */ +static int +readrecord(reorder_buffer *buf, char **lines, uint64 *lsn) +{ + int nlines; + uint32 line_xlogid; + uint32 line_xrecoff; + uint64 line_lsn; + uint64 rec_lsn; + + /* Get all the lines with the same LSN */ + for (nlines = 0; nlines < buf->nlines; nlines++) + { + sscanf(buf->lines[nlines], "LSN: %X/%08X", &line_xlogid, &line_xrecoff); + line_lsn = ((uint64) line_xlogid) << 32 | (uint64) line_xrecoff; + + if (nlines == 0) + *lsn = rec_lsn = line_lsn; + else + { + if (line_lsn != rec_lsn) + break; + } + } + if (nlines == buf->nlines) + { + if (!buf->eof) + { + fprintf(stderr, "max number of lines in record reached, LSN: %X/%08X\n", + line_xlogid, line_xrecoff); + exit(1); + } + } + + /* consume the lines from the reorder buffer */ + memcpy(lines, buf->lines, sizeof(char *) * nlines); + memmove(&buf->lines[0], &buf->lines[nlines], + sizeof(char *) * (buf->nlines - nlines)); + buf->nlines -= nlines; + + fill_reorder_buffer(buf); + + return nlines; +} + +static void +freerecord(char **lines, int nlines) +{ + int i; + + for (i = 0; i < nlines; i++) + pg_free(lines[i]); +} + +static void +printrecord(char **lines, int nlines) +{ + int i; + + for (i = 0; i < nlines; i++) + printf("%s", lines[i]); +} + +static bool +diffrecords(char **lines_a, int nlines_a, char **lines_b, int nlines_b) +{ + int i; + + if (nlines_a != nlines_b) + return true; + for (i = 0; i < nlines_a; i++) + { + /* First try a straight byte-per-byte comparison. */ + if (strcmp(lines_a[i], lines_b[i]) != 0) + { + /* They were not byte-per-byte identical. Try again after masking */ + maskline(lines_a[i]); + maskline(lines_b[i]); + if (strcmp(lines_a[i], lines_b[i]) != 0) + return true; + } + } + + return false; +} + +static void +usage(void) +{ + printf("usage: postprocess-images \n"); +} + +int +main(int argc, char **argv) +{ + char *lines_a[REORDER_BUFFER_SIZE]; + int nlines_a; + char *lines_b[REORDER_BUFFER_SIZE]; + int nlines_b; + FILE *fp_a; + FILE *fp_b; + uint64 lsn_a; + uint64 lsn_b; + reorder_buffer *buf_a; + reorder_buffer *buf_b; + + if (argc != 3) + { + usage(); + exit(1); + } + + fp_a = fopen(argv[1], "rb"); + if (fp_a == NULL) + { + fprintf(stderr, "could not open file \"%s\"\n", argv[1]); + exit(1); + } + fp_b = fopen(argv[2], "rb"); + if (fp_b == NULL) + { + fprintf(stderr, "could not open file \"%s\"\n", argv[2]); + exit(1); + } + + buf_a = init_reorder_buffer(fp_a); + buf_b = init_reorder_buffer(fp_b); + + /* read first record from both */ + nlines_a = readrecord(buf_a, lines_a, &lsn_a); + nlines_b = readrecord(buf_b, lines_b, &lsn_b); + + while (nlines_a > 0 || nlines_b > 0) + { + /* compare the records */ + if (lsn_a < lsn_b || nlines_b == 0) + { + printf("Only in A:\n"); + printrecord(lines_a, nlines_a); + freerecord(lines_a, nlines_a); + nlines_a = readrecord(buf_a, lines_a, &lsn_a); + } + else if (lsn_a > lsn_b || nlines_a == 0) + { + printf("Only in B:\n"); + printrecord(lines_b, nlines_b); + freerecord(lines_b, nlines_b); + nlines_b = readrecord(buf_b, lines_b, &lsn_b); + } + else if (lsn_a == lsn_b) + { + if (diffrecords(lines_a, nlines_a, lines_b, nlines_b)) + { + printf("lines differ, A:\n"); + printrecord(lines_a, nlines_a); + printf("B:\n"); + printrecord(lines_b, nlines_b); + } + freerecord(lines_a, nlines_a); + freerecord(lines_b, nlines_b); + nlines_a = readrecord(buf_a, lines_a, &lsn_a); + nlines_b = readrecord(buf_b, lines_b, &lsn_b); + } + } + + return 0; +} diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index abd8420da6..fa82f30e5d 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -957,6 +957,12 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, PageSetLSN(page, recptr); } + /* + * the normal mechanism that hooks into LockBuffer doesn't work for this, + * because we're bypassing buffer manager. + */ + log_page_change(page, rnode, forkNum, blkno); + return recptr; } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index cd3aaad610..b06973f9e1 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -414,6 +414,154 @@ static void AtProcExit_Buffers(int code, Datum arg); static void CheckForBufferLeaks(void); static int rnode_comparator(const void *p1, const void *p2); +/* Support for capturing changes to pages */ +typedef struct +{ + Buffer buffer; + char content[BLCKSZ]; +} BufferImage; + +#define MAX_BEFORE_IMAGES 100 +static BufferImage *before_images[MAX_BEFORE_IMAGES]; +int before_images_cnt = 0; + +static FILE *imagefp; + +static bool +log_before_after_images(char *msg, BufferImage *img) +{ + Page newcontent = BufferGetPage(img->buffer); + Page oldcontent = (Page) img->content; + XLogRecPtr oldlsn; + XLogRecPtr newlsn; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + + oldlsn = PageGetLSN(oldcontent); + newlsn = PageGetLSN(newcontent); + + if (oldlsn == newlsn) + { + /* no change */ + return false; + } + + BufferGetTag(img->buffer, &rnode, &forknum, &blkno); + + log_page_change(newcontent, &rnode, forknum, blkno); + + return true; +} + +void +log_page_change(char *newcontent, RelFileNode *rnode, int forknum, uint32 blkno) +{ + XLogRecPtr newlsn = PageGetLSN((Page) newcontent); + int i; + + /* + * We need a lock to make sure that only one backend writes to the file + * at a time. Abuse SyncScanLock for that - it happens to never be used + * while a buffer is locked/unlocked. + */ + LWLockAcquire(SyncScanLock, LW_EXCLUSIVE); + + fprintf(imagefp, "LSN: %X/%08X, rel: %u/%u/%u, blk: %u; ", + (uint32) (newlsn >> 32), (uint32) newlsn, + rnode->spcNode, rnode->dbNode, rnode->relNode, + blkno); + if (forknum != MAIN_FORKNUM) + fprintf(imagefp, "forknum: %u; ", forknum); + + /* write the page contents, in hex */ + { + char buf[BLCKSZ * 2]; + int j = 0; + + fprintf(imagefp, "after: "); + for (i = 0; i < BLCKSZ; i++) + { + const char *digits = "0123456789ABCDEF"; + uint8 byte = (uint8) newcontent[i]; + + buf[j++] = digits[byte >> 4]; + buf[j++] = digits[byte & 0x0F]; + } + fwrite(buf, BLCKSZ * 2, 1, imagefp); + } + + fprintf(imagefp, "\n"); + fflush(imagefp); + + LWLockRelease(SyncScanLock); +} + +static void +remember_before_image(Buffer buffer) +{ + BufferImage *img; + + Assert(before_images_cnt < MAX_BEFORE_IMAGES); + + img = before_images[before_images_cnt]; + img->buffer = buffer; + memcpy (img->content, BufferGetPage(buffer), BLCKSZ); + before_images_cnt++; +} + +/* + * Forget a buffer image. If the page was modified, log the new contents. + */ +static void +forget_image(Buffer buffer) +{ + int i; + + for (i = 0; i < before_images_cnt; i++) + { + BufferImage *img = before_images[i]; + + if (img->buffer == buffer) + { + + log_before_after_images("forget", before_images[i]); + if (i != before_images_cnt) + { + /* swap the last still-used slot with this one */ + before_images[i] = before_images[before_images_cnt - 1]; + before_images[before_images_cnt - 1] = img; + } + before_images_cnt--; + + return; + } + } + elog(LOG, "could not find image for buffer %u", buffer); +} + +/* + * See if any of the buffers we've memorized have changed. + */ +void +log_page_changes(char *msg) +{ + int i; + + for (i = 0; i < before_images_cnt; i++) + { + BufferImage *img = before_images[i]; + + /* + * Print the contents of the page if it was changed. Remember the + * new contents as the current image. + */ + if (log_before_after_images(msg, img)) + { + memcpy(img->content, BufferGetPage(img->buffer), BLCKSZ); + } + } +} /* * PrefetchBuffer -- initiate asynchronous read of a block of a relation @@ -677,7 +825,10 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (!isLocalBuf) { if (mode == RBM_ZERO_AND_LOCK) + { LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE); + remember_before_image(BufferDescriptorGetBuffer(bufHdr)); + } else if (mode == RBM_ZERO_AND_CLEANUP_LOCK) LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr)); } @@ -819,6 +970,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, !isLocalBuf) { LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE); + remember_before_image(BufferDescriptorGetBuffer(bufHdr)); } if (isLocalBuf) @@ -2077,6 +2229,10 @@ AtEOXact_Buffers(bool isCommit) { CheckForBufferLeaks(); + if (before_images_cnt > 0) + elog(LOG, "released all page-images (AtEOXact_Buffers)"); + before_images_cnt = 0; + AtEOXact_LocalBuffers(isCommit); Assert(PrivateRefCountOverflowed == 0); @@ -2121,7 +2277,18 @@ InitBufferPoolAccess(void) void InitBufferPoolBackend(void) { + int i; + BufferImage *images; + on_shmem_exit(AtProcExit_Buffers, 0); + + /* Initialize page image capturing */ + images = palloc(MAX_BEFORE_IMAGES * sizeof(BufferImage)); + + for (i = 0; i < MAX_BEFORE_IMAGES; i++) + before_images[i] = &images[i]; + + imagefp = fopen("buffer-images", "ab"); } /* @@ -3107,7 +3274,30 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) * sometime later in this checkpoint cycle. */ if (!XLogRecPtrIsInvalid(lsn)) + { PageSetLSN(page, lsn); + + /* + * the normal mechanism that hooks into LockBuffer doesn't work + * for this, if we're only holding a SHARE lock on the buffer. + */ + if (!LWLockHeldByMeInExclusiveMode(bufHdr->content_lock)) + { + RelFileNode rnode; + ForkNumber forkno; + BlockNumber blkno; + + /* + * XXX: we pass the original page to log_page_change, + * ignoring the possibility that the buffer is modified + * concurrently by another hint bit update. That's OK, + * the comparison tool ignores differences in hint bits + * anyway. + */ + BufferGetTag(buffer, &rnode, &forkno, &blkno); + log_page_change(BufferGetPage(buffer), &rnode, forkno, blkno); + } + } } bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); UnlockBufHdr(bufHdr); @@ -3171,6 +3361,16 @@ LockBuffer(Buffer buffer, int mode) buf = GetBufferDescriptor(buffer - 1); + /* + * If we were holding it in exclusive-mode, log any changes we made to + * it while holding the log in the page-image log. + */ + if (mode == BUFFER_LOCK_UNLOCK && + LWLockHeldByMeInExclusiveMode(buf->content_lock)) + { + forget_image(buffer); + } + if (mode == BUFFER_LOCK_UNLOCK) LWLockRelease(buf->content_lock); else if (mode == BUFFER_LOCK_SHARE) @@ -3179,6 +3379,9 @@ LockBuffer(Buffer buffer, int mode) LWLockAcquire(buf->content_lock, LW_EXCLUSIVE); else elog(ERROR, "unrecognized buffer lock mode: %d", mode); + + if (mode == BUFFER_LOCK_EXCLUSIVE) + remember_before_image(buffer); } /* @@ -3190,6 +3393,7 @@ bool ConditionalLockBuffer(Buffer buffer) { volatile BufferDesc *buf; + bool res; Assert(BufferIsValid(buffer)); if (BufferIsLocal(buffer)) @@ -3197,7 +3401,11 @@ ConditionalLockBuffer(Buffer buffer) buf = GetBufferDescriptor(buffer - 1); - return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE); + res = LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE); + if (res) + remember_before_image(buffer); + + return res; } /* diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 687ed6399c..a98b35d5a1 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -1629,6 +1629,10 @@ LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val) void LWLockReleaseAll(void) { + if (before_images_cnt > 0) + elog(LOG, "released all page images"); + before_images_cnt = 0; + while (num_held_lwlocks > 0) { HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */ @@ -1656,3 +1660,21 @@ LWLockHeldByMe(LWLock *l) } return false; } + +bool +LWLockHeldByMeInExclusiveMode(LWLock *l) +{ + int i; + + for (i = 0; i < num_held_lwlocks; i++) + { + if (held_lwlocks[i].lock == l) + { + if (held_lwlocks[i].mode == LW_EXCLUSIVE) + return true; + else + return false; + } + } + return false; +} diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index e0cc69f27e..50692ea51a 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -91,6 +91,14 @@ extern PGDLLIMPORT volatile uint32 CritSectionCount; /* in tcop/postgres.c */ extern void ProcessInterrupts(void); +/* in bufmgr.c, related to capturing page images */ +extern void log_page_changes(char *msg); +struct RelFileNode; +extern void log_page_change(char *newcontent, struct RelFileNode *rnode, int forknum, uint32 blkno); + +extern int before_images_cnt; + + #ifndef WIN32 #define CHECK_FOR_INTERRUPTS() \ @@ -132,6 +140,8 @@ do { \ do { \ Assert(CritSectionCount > 0); \ CritSectionCount--; \ + if (CritSectionCount == 0) \ + log_page_changes("END_CRIT_SECTION"); \ } while(0) diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index f2ff6a06f0..6e9ad48dfa 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -189,6 +189,7 @@ extern void LWLockRelease(LWLock *lock); extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val); extern void LWLockReleaseAll(void); extern bool LWLockHeldByMe(LWLock *lock); +extern bool LWLockHeldByMeInExclusiveMode(LWLock *lock); extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval); extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value); -- 2.39.5