aio: --- BASE PATCH -- (to-be-split).
author    Andres Freund <[email protected]>
Thu, 29 Oct 2020 19:41:12 +0000 (12:41 -0700)
committer Andres Freund <[email protected]>
Mon, 11 Jan 2021 23:09:15 +0000 (15:09 -0800)
40 files changed:
contrib/pg_prewarm/pg_prewarm.c
src/backend/access/heap/hio.c
src/backend/access/transam/xact.c
src/backend/access/transam/xlog.c
src/backend/bootstrap/bootstrap.c
src/backend/catalog/system_views.sql
src/backend/postmaster/checkpointer.c
src/backend/postmaster/pgstat.c
src/backend/postmaster/postmaster.c
src/backend/storage/buffer/buf_init.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/buffer/localbuf.c
src/backend/storage/file/fd.c
src/backend/storage/ipc/Makefile
src/backend/storage/ipc/aio.c [new file with mode: 0644]
src/backend/storage/ipc/aio_util.c [new file with mode: 0644]
src/backend/storage/ipc/ipci.c
src/backend/storage/lmgr/lwlock.c
src/backend/storage/lmgr/lwlocknames.txt
src/backend/storage/lmgr/proc.c
src/backend/storage/page/bufpage.c
src/backend/storage/smgr/md.c
src/backend/storage/smgr/smgr.c
src/backend/tcop/postgres.c
src/backend/utils/init/miscinit.c
src/backend/utils/misc/guc.c
src/backend/utils/resowner/resowner.c
src/include/access/xlog_internal.h
src/include/catalog/pg_proc.dat
src/include/pgstat.h
src/include/storage/aio.h [new file with mode: 0644]
src/include/storage/buf_internals.h
src/include/storage/bufmgr.h
src/include/storage/bufpage.h
src/include/storage/fd.h
src/include/storage/lwlock.h
src/include/storage/md.h
src/include/storage/smgr.h
src/include/utils/resowner_private.h
src/test/regress/expected/rules.out

contrib/pg_prewarm/pg_prewarm.c
index a8554529361e64d108ed37c0f61c25c542e7560e..93af05cdf937840c2429f8be30b77a4da03b84c7 100644 (file)
@@ -18,7 +18,9 @@
 #include "access/relation.h"
 #include "fmgr.h"
 #include "miscadmin.h"
+#include "storage/aio.h"
 #include "storage/bufmgr.h"
+#include "storage/bufpage.h"
 #include "storage/smgr.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
@@ -33,11 +35,116 @@ typedef enum
 {
    PREWARM_PREFETCH,
    PREWARM_READ,
-   PREWARM_BUFFER
+   PREWARM_READ_AIO,
+   PREWARM_BUFFER,
+   PREWARM_BUFFER_AIO
 } PrewarmType;
 
 static PGAlignedBlock blockbuffer;
 
+typedef struct prefetch
+{
+   Relation rel;
+   ForkNumber forkNumber;
+   int64 curblock;
+   int64 lastblock;
+   List *bbs;
+} prefetch;
+
+static PgStreamingReadNextStatus
+prewarm_buffer_next(uintptr_t pgsr_private, PgAioInProgress *aio, uintptr_t *read_private)
+{
+   prefetch *p = (prefetch *) pgsr_private;
+   Buffer buf;
+   bool already_valid;
+   BlockNumber blockno;
+
+   if (p->curblock <= p->lastblock)
+       blockno = p->curblock++;
+   else
+       return PGSR_NEXT_END;
+
+   buf = ReadBufferAsync(p->rel, p->forkNumber, blockno,
+                         RBM_NORMAL, NULL, &already_valid,
+                         &aio);
+
+   *read_private = (uintptr_t) buf;
+
+   if (already_valid)
+   {
+       ereport(DEBUG3,
+               errmsg("pgsr %s: found block %d already in buf %d",
+                      NameStr(p->rel->rd_rel->relname),
+                      blockno, buf),
+               errhidestmt(true),
+               errhidecontext(true));
+       return PGSR_NEXT_NO_IO;
+   }
+   else
+   {
+       ereport(DEBUG3,
+               errmsg("pgsr %s: fetching block %d into buf %d",
+                      NameStr(p->rel->rd_rel->relname),
+                      blockno, buf),
+               errhidestmt(true),
+               errhidecontext(true));
+       return PGSR_NEXT_IO;
+   }
+}
+
+static void
+prewarm_buffer_release(uintptr_t pgsr_private, uintptr_t read_private)
+{
+   prefetch *p = (prefetch *) pgsr_private;
+   Buffer buf = (Buffer) read_private;
+
+   ereport(DEBUG2,
+           errmsg("pgsr %s: releasing buf %d",
+                  NameStr(p->rel->rd_rel->relname),
+                  buf),
+           errhidestmt(true),
+           errhidecontext(true));
+
+   Assert(BufferIsValid(buf));
+   ReleaseBuffer(buf);
+}
+
+
+static PgStreamingReadNextStatus
+prewarm_smgr_next(uintptr_t pgsr_private, PgAioInProgress *aio, uintptr_t *read_private)
+{
+   prefetch *p = (prefetch *) pgsr_private;
+   BlockNumber blockno;
+   PgAioBounceBuffer *bb;
+
+   if (p->curblock <= p->lastblock)
+       blockno = p->curblock++;
+   else
+       return PGSR_NEXT_END;
+
+   if (p->bbs != NIL)
+   {
+       bb = lfirst(list_tail(p->bbs));
+       p->bbs = list_delete_last(p->bbs);
+   }
+   else
+       bb = pgaio_bounce_buffer_get();
+
+   pgaio_assoc_bounce_buffer(aio, bb);
+
+   smgrstartread(aio, p->rel->rd_smgr, p->forkNumber, blockno,
+                 pgaio_bounce_buffer_buffer(bb), InvalidBuffer, 0);
+
+   *read_private = (uintptr_t) bb;
+
+   return PGSR_NEXT_IO;
+}
+
+static void
+prewarm_smgr_release(uintptr_t pgsr_private, uintptr_t read_private)
+{
+   /* nothing to do; the caller collects and releases the bounce buffers */
+}
+
 /*
  * pg_prewarm(regclass, mode text, fork text,
  *           first_block int8, last_block int8)
@@ -86,6 +193,10 @@ pg_prewarm(PG_FUNCTION_ARGS)
        ptype = PREWARM_READ;
    else if (strcmp(ttype, "buffer") == 0)
        ptype = PREWARM_BUFFER;
+   else if (strcmp(ttype, "buffer_aio") == 0)
+       ptype = PREWARM_BUFFER_AIO;
+   else if (strcmp(ttype, "read_aio") == 0)
+       ptype = PREWARM_READ_AIO;
    else
    {
        ereport(ERROR,
@@ -197,6 +308,82 @@ pg_prewarm(PG_FUNCTION_ARGS)
            ++blocks_done;
        }
    }
+   else if (ptype == PREWARM_BUFFER_AIO)
+   {
+       PgStreamingRead *pgsr;
+       prefetch p;
+
+       p.rel = rel;
+       p.forkNumber = forkNumber;
+       p.curblock = 0;
+       p.lastblock = last_block;
+       p.bbs = NIL;
+
+       pgsr = pg_streaming_read_alloc(512, (uintptr_t) &p,
+                                      prewarm_buffer_next,
+                                      prewarm_buffer_release);
+
+       for (block = first_block; block <= last_block; ++block)
+       {
+           Buffer      buf;
+
+           CHECK_FOR_INTERRUPTS();
+
+           buf = (Buffer) pg_streaming_read_get_next(pgsr);
+           if (BufferIsValid(buf))
+               ReleaseBuffer(buf);
+           else
+               elog(ERROR, "prefetch ended early");
+
+           ++blocks_done;
+       }
+
+       if (BufferIsValid(pg_streaming_read_get_next(pgsr)))
+           elog(ERROR, "unexpected additional buffer");
+
+       pg_streaming_read_free(pgsr);
+   }
+   else if (ptype == PREWARM_READ_AIO)
+   {
+       PgStreamingRead *pgsr;
+       prefetch p;
+       ListCell *lc;
+
+       p.rel = rel;
+       p.forkNumber = forkNumber;
+       p.curblock = 0;
+       p.lastblock = last_block;
+       p.bbs = NIL;
+
+       pgsr = pg_streaming_read_alloc(512, (uintptr_t) &p,
+                                      prewarm_smgr_next,
+                                      prewarm_smgr_release);
+
+       for (block = first_block; block <= last_block; ++block)
+       {
+           PgAioBounceBuffer *bb;
+
+           CHECK_FOR_INTERRUPTS();
+
+           bb = (PgAioBounceBuffer *) pg_streaming_read_get_next(pgsr);
+           if (bb == NULL)
+               elog(ERROR, "prefetch ended early");
+
+           p.bbs = lappend(p.bbs, (void *) bb);
+
+           ++blocks_done;
+       }
+
+       if (pg_streaming_read_get_next(pgsr) != 0)
+           elog(ERROR, "unexpected additional buffer");
+
+       pg_streaming_read_free(pgsr);
+
+       foreach(lc, p.bbs)
+       {
+           pgaio_bounce_buffer_release(lfirst(lc));
+       }
+   }
 
    /* Close relation, release lock. */
    relation_close(rel, AccessShareLock);
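
The hunks above are the first consumer of the streaming-read helpers added elsewhere in this patch (pg_streaming_read_alloc / pg_streaming_read_get_next / pg_streaming_read_free) together with ReadBufferAsync(). The following is a rough sketch, not part of the patch, of the callback contract they imply; the seq_state struct and the my_scan_* names are purely illustrative:

    #include "postgres.h"

    #include "storage/aio.h"
    #include "storage/bufmgr.h"
    #include "storage/smgr.h"
    #include "utils/rel.h"

    typedef struct seq_state
    {
        Relation    rel;
        BlockNumber next_block;
        BlockNumber nblocks;
    } seq_state;

    /* called whenever the streaming-read machinery wants another IO in flight */
    static PgStreamingReadNextStatus
    my_scan_next(uintptr_t pgsr_private, PgAioInProgress *aio,
                 uintptr_t *read_private)
    {
        seq_state  *s = (seq_state *) pgsr_private;
        bool        already_valid;
        Buffer      buf;

        if (s->next_block >= s->nblocks)
            return PGSR_NEXT_END;

        /* starts the IO (or finds the page cached) and returns a pinned buffer */
        buf = ReadBufferAsync(s->rel, MAIN_FORKNUM, s->next_block++,
                              RBM_NORMAL, NULL, &already_valid, &aio);
        *read_private = (uintptr_t) buf;

        return already_valid ? PGSR_NEXT_NO_IO : PGSR_NEXT_IO;
    }

    /* called for reads the consumer did not drain before pg_streaming_read_free() */
    static void
    my_scan_release(uintptr_t pgsr_private, uintptr_t read_private)
    {
        ReleaseBuffer((Buffer) read_private);
    }

    static void
    my_scan(Relation rel)
    {
        seq_state   s;
        PgStreamingRead *pgsr;
        Buffer      buf;

        RelationOpenSmgr(rel);
        s.rel = rel;
        s.next_block = 0;
        s.nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);

        /* 512 is the queue depth pg_prewarm uses above */
        pgsr = pg_streaming_read_alloc(512, (uintptr_t) &s,
                                       my_scan_next, my_scan_release);

        while ((buf = (Buffer) pg_streaming_read_get_next(pgsr)) != InvalidBuffer)
        {
            /* the buffer arrives pinned and valid; use the page, then unpin */
            ReleaseBuffer(buf);
        }

        pg_streaming_read_free(pgsr);
    }

The PREWARM_BUFFER_AIO branch above is essentially this pattern with a fixed iteration count instead of a while loop.
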
src/backend/access/heap/hio.c
index fac3b8e9ff28731e99f861f7048bc2718df96b26..69af7e8762876db7180278aeb972cfda47518539 100644 (file)
@@ -129,6 +129,75 @@ ReadBufferBI(Relation relation, BlockNumber targetBlock,
    return buffer;
 }
 
+static Buffer
+ExtendRelation(Relation relation, BulkInsertState bistate, bool use_fsm)
+{
+   Buffer buf;
+   Page        page;
+
+   /* FIXME: There should be a better approach to both of these exceptions */
+   if (RELATION_IS_LOCAL(relation) || !use_fsm)
+   {
+       LockRelationForExtension(relation, ExclusiveLock);
+       buf = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
+       UnlockRelationForExtension(relation, ExclusiveLock);
+   }
+   else
+   {
+       int extendby;
+       int newblockno;
+
+       /*
+        * Determine number of pages to extend relation by.
+        */
+       {
+           BlockNumber start_nblocks;
+
+           RelationOpenSmgr(relation);
+           start_nblocks = smgrnblocks(relation->rd_smgr, MAIN_FORKNUM);
+           extendby = Max(Min(start_nblocks / 16 * BLCKSZ, (16 * 1024 * 1024)) / BLCKSZ, 1);
+           extendby = Max(Min(extendby, NBuffers / 128), 1);
+       }
+
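+       /*
+        * Worked example (editorial illustration, not from the patch): with
+        * the default BLCKSZ of 8192 and the default shared_buffers of 16384
+        * buffers, a relation of 4096 blocks (32MB) yields 4096/16 = 256
+        * blocks (2MB) from the first clamp; the second clamp then limits
+        * that to 16384/128 = 128 blocks.  In short: extend by 1/16th of the
+        * current size, capped at 16MB and at 1/128th of shared_buffers, but
+        * always by at least one block.
+        */
+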
+       /*
+        * FIXME: There should probably be some chunking, either from here, or
+        * inside BulkExtendBuffered.
+        */
+       buf = BulkExtendBuffered(relation, MAIN_FORKNUM, extendby,
+                                bistate ? bistate->strategy : NULL);
+
+       newblockno = BufferGetBlockNumber(buf);
+
+       for (int i = newblockno + 1; i < newblockno + extendby; i++)
+       {
+           RecordPageWithFreeSpace(relation, i, BLCKSZ - SizeOfPageHeaderData);
+       }
+
+       if (use_fsm && extendby > 1)
+       {
+           FreeSpaceMapVacuumRange(relation, newblockno, newblockno + extendby);
+       }
+   }
+
+   /*
+    * We need to initialize the empty new page.  Double-check that it really
+    * is empty (this should never happen, but if it does we don't want to
+    * risk wiping out valid data).
+    */
+   page = BufferGetPage(buf);
+
+   if (!PageIsNew(page))
+       elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
+            BufferGetBlockNumber(buf),
+            RelationGetRelationName(relation));
+
+   PageInit(page, BufferGetPageSize(buf), 0);
+   MarkBufferDirty(buf);
+
+   return buf;
+}
+
+
 /*
  * For each heap page which is all-visible, acquire a pin on the appropriate
  * visibility map page, if we haven't already got one.
@@ -187,90 +256,6 @@ GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
    }
 }
 
-/*
- * Extend a relation by multiple blocks to avoid future contention on the
- * relation extension lock.  Our goal is to pre-extend the relation by an
- * amount which ramps up as the degree of contention ramps up, but limiting
- * the result to some sane overall value.
- */
-static void
-RelationAddExtraBlocks(Relation relation, BulkInsertState bistate)
-{
-   BlockNumber blockNum,
-               firstBlock = InvalidBlockNumber;
-   int         extraBlocks;
-   int         lockWaiters;
-
-   /* Use the length of the lock wait queue to judge how much to extend. */
-   lockWaiters = RelationExtensionLockWaiterCount(relation);
-   if (lockWaiters <= 0)
-       return;
-
-   /*
-    * It might seem like multiplying the number of lock waiters by as much as
-    * 20 is too aggressive, but benchmarking revealed that smaller numbers
-    * were insufficient.  512 is just an arbitrary cap to prevent
-    * pathological results.
-    */
-   extraBlocks = Min(512, lockWaiters * 20);
-
-   do
-   {
-       Buffer      buffer;
-       Page        page;
-       Size        freespace;
-
-       /*
-        * Extend by one page.  This should generally match the main-line
-        * extension code in RelationGetBufferForTuple, except that we hold
-        * the relation extension lock throughout, and we don't immediately
-        * initialize the page (see below).
-        */
-       buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
-       page = BufferGetPage(buffer);
-
-       if (!PageIsNew(page))
-           elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
-                BufferGetBlockNumber(buffer),
-                RelationGetRelationName(relation));
-
-       /*
-        * Add the page to the FSM without initializing. If we were to
-        * initialize here, the page would potentially get flushed out to disk
-        * before we add any useful content. There's no guarantee that that'd
-        * happen before a potential crash, so we need to deal with
-        * uninitialized pages anyway, thus avoid the potential for
-        * unnecessary writes.
-        */
-
-       /* we'll need this info below */
-       blockNum = BufferGetBlockNumber(buffer);
-       freespace = BufferGetPageSize(buffer) - SizeOfPageHeaderData;
-
-       UnlockReleaseBuffer(buffer);
-
-       /* Remember first block number thus added. */
-       if (firstBlock == InvalidBlockNumber)
-           firstBlock = blockNum;
-
-       /*
-        * Immediately update the bottom level of the FSM.  This has a good
-        * chance of making this page visible to other concurrently inserting
-        * backends, and we want that to happen without delay.
-        */
-       RecordPageWithFreeSpace(relation, blockNum, freespace);
-   }
-   while (--extraBlocks > 0);
-
-   /*
-    * Updating the upper levels of the free space map is too expensive to do
-    * for every block, but it's worth doing once at the end to make sure that
-    * subsequent insertion activity sees all of those nifty free pages we
-    * just inserted.
-    */
-   FreeSpaceMapVacuumRange(relation, firstBlock, blockNum + 1);
-}
-
 /*
  * RelationGetBufferForTuple
  *
@@ -340,7 +325,6 @@ RelationGetBufferForTuple(Relation relation, Size len,
                saveFreeSpace = 0;
    BlockNumber targetBlock,
                otherBlock;
-   bool        needLock;
 
    len = MAXALIGN(len);        /* be conservative */
 
@@ -533,6 +517,23 @@ loop:
            ReleaseBuffer(buffer);
        }
 
+       /*
+        * FIXME: definitely needs a better solution.
+        */
+       if (!use_fsm && bistate && bistate->current_buf != InvalidBuffer)
+       {
+           BlockNumber blocknum = BufferGetBlockNumber(bistate->current_buf) + 1;
+
+           RelationOpenSmgr(relation);
+
+           if (blocknum < smgrnblocks(relation->rd_smgr, MAIN_FORKNUM))
+           {
+               targetBlock = blocknum;
+
+               goto loop;
+           }
+       }
+
        /* Without FSM, always fall out of the loop and extend */
        if (!use_fsm)
            break;
@@ -547,85 +548,10 @@ loop:
                                                    len + saveFreeSpace);
    }
 
-   /*
-    * Have to extend the relation.
-    *
-    * We have to use a lock to ensure no one else is extending the rel at the
-    * same time, else we will both try to initialize the same new page.  We
-    * can skip locking for new or temp relations, however, since no one else
-    * could be accessing them.
-    */
-   needLock = !RELATION_IS_LOCAL(relation);
-
-   /*
-    * If we need the lock but are not able to acquire it immediately, we'll
-    * consider extending the relation by multiple blocks at a time to manage
-    * contention on the relation extension lock.  However, this only makes
-    * sense if we're using the FSM; otherwise, there's no point.
-    */
-   if (needLock)
-   {
-       if (!use_fsm)
-           LockRelationForExtension(relation, ExclusiveLock);
-       else if (!ConditionalLockRelationForExtension(relation, ExclusiveLock))
-       {
-           /* Couldn't get the lock immediately; wait for it. */
-           LockRelationForExtension(relation, ExclusiveLock);
-
-           /*
-            * Check if some other backend has extended a block for us while
-            * we were waiting on the lock.
-            */
-           targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
-
-           /*
-            * If some other waiter has already extended the relation, we
-            * don't need to do so; just use the existing freespace.
-            */
-           if (targetBlock != InvalidBlockNumber)
-           {
-               UnlockRelationForExtension(relation, ExclusiveLock);
-               goto loop;
-           }
-
-           /* Time to bulk-extend. */
-           RelationAddExtraBlocks(relation, bistate);
-       }
-   }
-
-   /*
-    * In addition to whatever extension we performed above, we always add at
-    * least one block to satisfy our own request.
-    *
-    * XXX This does an lseek - rather expensive - but at the moment it is the
-    * only way to accurately determine how many blocks are in a relation.  Is
-    * it worth keeping an accurate file length in shared memory someplace,
-    * rather than relying on the kernel to do it for us?
-    */
-   buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
+   buffer = ExtendRelation(relation, bistate, use_fsm);
 
-   /*
-    * We need to initialize the empty new page.  Double-check that it really
-    * is empty (this should never happen, but if it does we don't want to
-    * risk wiping out valid data).
-    */
    page = BufferGetPage(buffer);
 
-   if (!PageIsNew(page))
-       elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
-            BufferGetBlockNumber(buffer),
-            RelationGetRelationName(relation));
-
-   PageInit(page, BufferGetPageSize(buffer), 0);
-   MarkBufferDirty(buffer);
-
-   /*
-    * Release the file-extension lock; it's now OK for someone else to extend
-    * the relation some more.
-    */
-   if (needLock)
-       UnlockRelationForExtension(relation, ExclusiveLock);
-
    /*
     * Lock the other buffer. It's guaranteed to be of a lower page number
     * than the new page. To conform with the deadlock prevent rules, we ought
src/backend/access/transam/xact.c
index a2068e3fd45d83026b97496e1ba0c3139eec4524..326ec8f44a87862e4647846088c10ba5565e66e9 100644 (file)
@@ -34,6 +34,7 @@
 #include "catalog/namespace.h"
 #include "catalog/pg_enum.h"
 #include "catalog/storage.h"
+#include "storage/aio.h"
 #include "commands/async.h"
 #include "commands/tablecmds.h"
 #include "commands/trigger.h"
@@ -2211,6 +2212,8 @@ CommitTransaction(void)
     */
    ProcArrayEndTransaction(MyProc, latestXid);
 
+   pgaio_at_commit();
+
    /*
     * This is all post-commit cleanup.  Note that if an error is raised here,
     * it's too late to abort the transaction.  This should be just
@@ -2620,6 +2623,8 @@ AbortTransaction(void)
    AtAbort_Memory();
    AtAbort_ResourceOwner();
 
+   pgaio_at_abort();
+
    /*
     * Release any LW locks we might be holding as quickly as possible.
     * (Regular locks, however, must be held till we finish aborting.)
src/backend/access/transam/xlog.c
index ed921c49e438da764eda33b4ccc5abc03b1b1e4e..1d168ed9c80d1c956d2fff2fb98747a6e7fe49ba 100644 (file)
@@ -59,6 +59,7 @@
 #include "replication/snapbuild.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
+#include "storage/aio.h"
 #include "storage/bufmgr.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
@@ -2450,6 +2451,17 @@ XLogCheckpointNeeded(XLogSegNo new_segno)
    return false;
 }
 
+
+void
+XLogWriteComplete(PgAioInProgress *aio, uint32 write_no)
+{
+}
+
+void
+XLogFlushComplete(struct PgAioInProgress *aio, uint32 flush_no)
+{
+}
+
 /*
  * Write and/or fsync the log at least as far as WriteRqst indicates.
  *
@@ -3283,6 +3295,11 @@ XLogNeedsFlush(XLogRecPtr record)
    return true;
 }
 
+static void
+XLogFileInitComplete(void *pgsw_private, void *write_private)
+{
+}
+
 /*
  * Create a new XLOG file segment, or open a pre-existing one.
  *
@@ -3313,6 +3330,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
    XLogSegNo   max_segno;
    int         fd;
    int         save_errno;
+   int         open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
 
    XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);
 
@@ -3345,8 +3363,11 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 
    unlink(tmppath);
 
+   if (io_wal_init_direct)
+       open_flags |= PG_O_DIRECT;
+
    /* do not use get_sync_bit() here --- want to fsync only at end of fill */
-   fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+   fd = BasicOpenFile(tmppath, open_flags);
    if (fd < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
@@ -3929,7 +3950,7 @@ XLogFileClose(void)
     * use the cache to read the WAL segment.
     */
 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-   if (!XLogIsNeeded())
+   if (!XLogIsNeeded() && !io_wal_direct)
        (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
 #endif
 
@@ -10464,10 +10485,17 @@ get_sync_bit(int method)
 {
    int         o_direct_flag = 0;
 
+   /* make O_DIRECT setting only depend on GUC */
+   if (io_wal_direct)
+       o_direct_flag |= PG_O_DIRECT;
+
+#if 0
    /* If fsync is disabled, never open in sync mode */
    if (!enableFsync)
        return 0;
+#endif
 
+#if 0
    /*
     * Optimize writes by bypassing kernel cache with O_DIRECT when using
     * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
@@ -10476,14 +10504,19 @@ get_sync_bit(int method)
     * read if we bypassed the kernel cache. We also skip the
     * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
     * reason.
-    *
+    */
+   if (!XLogIsNeeded())
+       o_direct_flag = PG_O_DIRECT;
+#endif
+
+   /*
     * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
     * written by walreceiver is normally read by the startup process soon
     * after it's written. Also, walreceiver performs unaligned writes, which
     * don't work with O_DIRECT, so it is required for correctness too.
     */
-   if (!XLogIsNeeded() && !AmWalReceiverProcess())
-       o_direct_flag = PG_O_DIRECT;
+   if (AmWalReceiverProcess())
+       return 0;
 
    switch (method)
    {
@@ -10496,7 +10529,7 @@ get_sync_bit(int method)
        case SYNC_METHOD_FSYNC:
        case SYNC_METHOD_FSYNC_WRITETHROUGH:
        case SYNC_METHOD_FDATASYNC:
-           return 0;
+           return o_direct_flag;
 #ifdef OPEN_SYNC_FLAG
        case SYNC_METHOD_OPEN:
            return OPEN_SYNC_FLAG | o_direct_flag;
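
The changes above make PG_O_DIRECT for WAL depend only on the new io_wal_direct GUC and disable the old heuristics via #if 0. As an editorial summary, not text from the patch (the open_datasync case is outside this excerpt and assumed to keep mirroring open_sync):

    /*
     * get_sync_bit() as modified above:
     *
     *   walreceiver                        -> always 0
     *   fsync / fsync_writethrough /
     *   fdatasync                          -> io_wal_direct ? PG_O_DIRECT : 0
     *   open_sync                          -> OPEN_SYNC_FLAG,
     *                                         plus PG_O_DIRECT if io_wal_direct
     *
     * The old shortcuts (return 0 when fsync=off, and add O_DIRECT only when
     * !XLogIsNeeded()) are disabled with #if 0 rather than removed.
     */
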
src/backend/bootstrap/bootstrap.c
index 6f615e66220b36ae056d646a750f12f478808007..604a4f75ac8f48a80c42bac2014f37965dde77b9 100644 (file)
@@ -432,6 +432,7 @@ AuxiliaryProcessMain(int argc, char *argv[])
             */
            SetProcessingMode(BootstrapProcessing);
            bootstrap_signals();
+           InitProcess();
            BootStrapXLOG();
            BootstrapModeMain();
            proc_exit(1);       /* should never return */
@@ -502,11 +503,6 @@ BootstrapModeMain(void)
    if (pg_link_canary_is_frontend())
        elog(ERROR, "backend is incorrectly linked to frontend functions");
 
-   /*
-    * Do backend-like initialization for bootstrap mode
-    */
-   InitProcess();
-
    InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL, false);
 
    /* Initialize stuff for bootstrap-file processing */
src/backend/catalog/system_views.sql
index 5d89e77dbe2f56982dc382bfc20f397130e4c7a8..ceae1d79c502bf7e4b576048409ef1690054e4cc 100644 (file)
@@ -1000,6 +1000,16 @@ CREATE VIEW pg_stat_wal AS
         w.stats_reset
     FROM pg_stat_get_wal() w;
 
+CREATE VIEW pg_stat_aio_backends AS
+    /* FIXME: easier to maintain without column names during development */
+    SELECT s.*
+    FROM pg_stat_get_aio_backends() s;
+
+CREATE VIEW pg_stat_aios AS
+    /* FIXME: easier to maintain without column names during development */
+    SELECT s.*
+    FROM pg_stat_get_aios() s;
+
 CREATE VIEW pg_stat_progress_analyze AS
     SELECT
         S.pid AS pid, S.datid AS datid, D.datname AS datname,
src/backend/postmaster/checkpointer.c
index e208b5878dd367bf99358ec8e6b34346656c513c..bb17be74867ff878575975082ea8753f07f7dc16 100644 (file)
@@ -47,6 +47,7 @@
 #include "postmaster/bgwriter.h"
 #include "postmaster/interrupt.h"
 #include "replication/syncrep.h"
+#include "storage/aio.h"
 #include "storage/bufmgr.h"
 #include "storage/condition_variable.h"
 #include "storage/fd.h"
@@ -707,13 +708,20 @@ CheckpointWriteDelay(int flags, double progress)
         */
        pgstat_send_bgwriter();
 
+       /*
+        * Ensure all pending IO is submitted to avoid unnecessary delays
+        * for other processes.
+        */
+       pgaio_submit_pending(true);
+
        /*
         * This sleep used to be connected to bgwriter_delay, typically 200ms.
         * That resulted in more frequent wakeups if not much work to do.
         * Checkpointer and bgwriter are no longer related so take the Big
         * Sleep.
         */
-       pg_usleep(100000L);
+       if (IsCheckpointOnSchedule(progress))
+           pg_usleep(100000L);
    }
    else if (--absorb_counter <= 0)
    {
src/backend/postmaster/pgstat.c
index e1de64c60d095f7abc1ed0396f4b3fd803a0d16e..7a0947f10eb3963670caa89769e9832e66877580 100644 (file)
@@ -4308,9 +4308,15 @@ pgstat_get_wait_io(WaitEventIO w)
        case WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN:
            event_name = "WALSyncMethodAssign";
            break;
+       case WAIT_EVENT_WAL_WAIT_FLUSH:
+           event_name = "WALWaitFlush";
+           break;
        case WAIT_EVENT_WAL_WAIT_INSERT:
            event_name = "WALWaitInsert";
            break;
+       case WAIT_EVENT_WAL_WAIT_WRITE:
+           event_name = "WALWaitWrite";
+           break;
        case WAIT_EVENT_WAL_WRITE:
            event_name = "WALWrite";
            break;
@@ -4326,6 +4332,21 @@ pgstat_get_wait_io(WaitEventIO w)
        case WAIT_EVENT_LOGICAL_SUBXACT_WRITE:
            event_name = "LogicalSubxactWrite";
            break;
+       case WAIT_EVENT_AIO_SUBMIT:
+           event_name = "AIOSubmit";
+           break;
+       case WAIT_EVENT_AIO_IO_COMPLETE_ANY:
+           event_name = "AIOCompleteAny";
+           break;
+       case WAIT_EVENT_AIO_IO_COMPLETE_ONE:
+           event_name = "AIOCompleteOne";
+           break;
+       case WAIT_EVENT_AIO_REFS:
+           event_name = "AIORefs";
+           break;
+       case WAIT_EVENT_AIO_BACKPRESSURE:
+           event_name = "AIOBackpressure";
+           break;
 
            /* no default case, so that compiler will warn */
    }
src/backend/postmaster/postmaster.c
index 7de27ee4e0171863faca2f24d62488b773a7636e..5fe083518ea748ff07985e6cf6eb1af82ee16fbf 100644 (file)
 #include "postmaster/syslogger.h"
 #include "replication/logicallauncher.h"
 #include "replication/walsender.h"
+#include "storage/aio.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
@@ -1004,6 +1005,13 @@ PostmasterMain(int argc, char *argv[])
     */
    InitializeMaxBackends();
 
+   /*
+    * As AIO might create internal FDs and will trigger shared memory
+    * allocations, this needs to happen before reset_shared() and
+    * set_max_safe_fds().
+    */
+   pgaio_postmaster_init();
+
    /*
     * Set up shared memory and semaphores.
     */
src/backend/storage/buffer/buf_init.c
index d611dac26b87ea3bf02bc89b586b2ccf577c0565..97603a522c061e999b10f19523cd44459ebb359d 100644 (file)
@@ -123,6 +123,8 @@ InitBufferPool(void)
 
            buf->buf_id = i;
 
+           pgaio_io_ref_clear(&buf->io_in_progress);
+
            /*
             * Initially link all the buffers together as unused. Subsequent
             * management of this list is done by freelist.c.
src/backend/storage/buffer/bufmgr.c
index c5395c3061f4be00758fc9c22cf06e0ac7cb85bb..1b477d26cfe8e9eeb88ee0361821104fc5973f63 100644 (file)
 #include "pg_trace.h"
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
+#include "storage/aio.h"
+#include "storage/buf.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
+#include "storage/lmgr.h"
 #include "storage/proc.h"
 #include "storage/smgr.h"
 #include "storage/standby.h"
@@ -149,6 +152,18 @@ int            checkpoint_flush_after = 0;
 int            bgwriter_flush_after = 0;
 int            backend_flush_after = 0;
 
+
+/*
+ * GUC variables related to the AIO subsystem.
+ *
+ * XXX: It's not clear where these best belong; the WAL ones in particular
+ * probably should move.
+ */
+bool       io_data_direct = false;
+bool       io_data_force_async = true;
+bool       io_wal_direct = false;
+bool       io_wal_init_direct = false;
+
 /* local state for StartBufferIO and related functions */
 static BufferDesc *InProgressBuf = NULL;
 static bool IsForInput;
@@ -453,17 +468,24 @@ static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
                                ForkNumber forkNum, BlockNumber blockNum,
                                ReadBufferMode mode, BufferAccessStrategy strategy,
                                bool *hit);
+static BufferDesc *ReadBuffer_start(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+                BlockNumber blockNum, ReadBufferMode mode,
+                BufferAccessStrategy strategy, bool *hit, bool isLocalBuf);
 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
 static void PinBuffer_Locked(BufferDesc *buf);
 static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
 static void BufferSync(int flags);
 static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
-static int SyncOneBuffer(int buf_id, bool skip_recently_used,
-                         WritebackContext *wb_context);
+static int BgBufferSyncWriteOne(int buf_id, bool skip_recently_used,
+                                pg_streaming_write *pgsw);
 static void WaitIO(BufferDesc *buf);
 static bool StartBufferIO(BufferDesc *buf, bool forInput);
-static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
-                             uint32 set_flag_bits);
+static void TerminateBufferIO(BufferDesc *buf, bool local, bool syncio,
+                             bool clear_dirty, uint32 set_flag_bits);
+static void TerminateSharedBufferIO(BufferDesc *buf,
+                                   bool sync_io,
+                                   bool clear_dirty,
+                                   uint32 set_flag_bits);
 static void shared_buffer_write_error_callback(void *arg);
 static void local_buffer_write_error_callback(void *arg);
 static BufferDesc *BufferAlloc(SMgrRelation smgr,
@@ -473,6 +495,7 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
                               BufferAccessStrategy strategy,
                               bool *foundPtr);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
+static bool AsyncFlushBuffer(PgAioInProgress *aio, BufferDesc *buf, SMgrRelation reln);
 static void AtProcExit_Buffers(int code, Datum arg);
 static void CheckForBufferLeaks(void);
 static int rnode_comparator(const void *p1, const void *p2);
@@ -485,65 +508,30 @@ static int    ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
  * Implementation of PrefetchBuffer() for shared buffers.
  */
 PrefetchBufferResult
-PrefetchSharedBuffer(SMgrRelation smgr_reln,
+PrefetchSharedBuffer(Relation reln,
+                    SMgrRelation smgr_reln,
                     ForkNumber forkNum,
                     BlockNumber blockNum)
 {
    PrefetchBufferResult result = {InvalidBuffer, false};
-   BufferTag   newTag;         /* identity of requested block */
-   uint32      newHash;        /* hash value for newTag */
-   LWLock     *newPartitionLock;   /* buffer partition lock for it */
-   int         buf_id;
-
-   Assert(BlockNumberIsValid(blockNum));
-
-   /* create a tag so we can lookup the buffer */
-   INIT_BUFFERTAG(newTag, smgr_reln->smgr_rnode.node,
-                  forkNum, blockNum);
-
-   /* determine its hash code and partition lock ID */
-   newHash = BufTableHashCode(&newTag);
-   newPartitionLock = BufMappingPartitionLock(newHash);
-
-   /* see if the block is in the buffer pool already */
-   LWLockAcquire(newPartitionLock, LW_SHARED);
-   buf_id = BufTableLookup(&newTag, newHash);
-   LWLockRelease(newPartitionLock);
-
-   /* If not in buffers, initiate prefetch */
-   if (buf_id < 0)
-   {
-#ifdef USE_PREFETCH
-       /*
-        * Try to initiate an asynchronous read.  This returns false in
-        * recovery if the relation file doesn't exist.
-        */
-       if (smgrprefetch(smgr_reln, forkNum, blockNum))
-           result.initiated_io = true;
-#endif                         /* USE_PREFETCH */
-   }
-   else
-   {
-       /*
-        * Report the buffer it was in at that time.  The caller may be able
-        * to avoid a buffer table lookup, but it's not pinned and it must be
-        * rechecked!
-        */
-       result.recent_buffer = buf_id + 1;
-   }
+   bool already_valid;
 
    /*
-    * If the block *is* in buffers, we do nothing.  This is not really ideal:
-    * the block might be just about to be evicted, which would be stupid
-    * since we know we are going to need it soon.  But the only easy answer
-    * is to bump the usage_count, which does not seem like a great solution:
-    * when the caller does ultimately touch the block, usage_count would get
-    * bumped again, resulting in too much favoritism for blocks that are
-    * involved in a prefetch sequence. A real fix would involve some
-    * additional per-buffer state, and it's not clear that there's enough of
-    * a problem to justify that.
+    * Report the buffer it was in at that time.  The caller may be able
+    * to avoid a buffer table lookup, but it's not pinned and it must be
+    * rechecked!
     */
 
+   result.recent_buffer = ReadBufferAsync(reln, forkNum, blockNum, RBM_NORMAL,
+                                          NULL, &already_valid, NULL);
+   result.initiated_io = !already_valid;
+
+   if (already_valid)
+       ReleaseBuffer(result.recent_buffer);
+#if 0
+   else
+       pgaio_submit_pending(true);
+#endif
    return result;
 }
 
@@ -594,7 +582,7 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
    else
    {
        /* pass it to the shared buffer version */
-       return PrefetchSharedBuffer(reln->rd_smgr, forkNum, blockNum);
+       return PrefetchSharedBuffer(reln, reln->rd_smgr, forkNum, blockNum);
    }
 }
 
@@ -707,106 +695,173 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
                             mode, strategy, &hit);
 }
 
+static void
+ReadBufferInitRead(PgAioInProgress *aio,
+                  SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
+                  Buffer buf, BufferDesc *bufHdr, int mode)
+{
+   Block       bufBlock;
+
+   //bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
+   if (BufferIsLocal(buf))
+       bufBlock = LocalBufHdrGetBlock(bufHdr);
+   else
+       bufBlock = BufHdrGetBlock(bufHdr);
+
+   /*
+    * if we have gotten to this point, we have allocated a buffer for the
+    * page but its contents are not yet valid.  IO_IN_PROGRESS is set for it,
+    * if it's a shared buffer.
+    */
+   Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID));   /* spinlock not needed */
+
+   /* FIXME: improve */
+   InProgressBuf = NULL;
+
+   pgaio_io_ref(aio, &bufHdr->io_in_progress);
+
+   smgrstartread(aio, smgr, forkNum, blockNum,
+                 bufBlock, buf, mode);
+}
+
+Buffer
+ReadBufferAsync(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
+               ReadBufferMode mode, BufferAccessStrategy strategy,
+               bool *already_valid, PgAioInProgress **aiop)
+{
+   Buffer buf;
+   BufferDesc *bufHdr;
+   bool hit;
+   bool release_io;
+   PgAioInProgress *aio;
+
+   if (mode != RBM_NORMAL || blockNum == P_NEW)
+       elog(ERROR, "unsupported");
+
+   /*
+    * Don't support AIO for local buffers yet, so just fall back to operating
+    * synchronously. This is important because otherwise callers would all
+    * need to have a non-prefetching fallback implementation.
+    */
+   if (RelationUsesLocalBuffers(reln))
+   {
+       *already_valid = true;
+       return ReadBufferExtended(reln, forkNum, blockNum, mode, strategy);
+   }
+
+   /* Open it at the smgr level if not already done */
+   RelationOpenSmgr(reln);
+
+   pgstat_count_buffer_read(reln);
+
+   bufHdr = ReadBuffer_start(reln->rd_smgr, reln->rd_rel->relpersistence, forkNum,
+                             blockNum, mode, strategy, &hit, false);
+   buf = BufferDescriptorGetBuffer(bufHdr);
+
+   if (hit)
+   {
+       pgstat_count_buffer_hit(reln);
+
+       //Assert(BufferIsPinned(buf));
+       *already_valid = true;
+       return buf;
+   }
+
+   *already_valid = false;
+
+   if (aiop == NULL)
+   {
+       release_io = true;
+       aio = pgaio_io_get();
+   }
+   else if (*aiop == NULL)
+   {
+       release_io = false;
+       *aiop = aio = pgaio_io_get();
+   }
+   else
+   {
+       release_io = false;
+       aio = *aiop;
+   }
+
+   /*
+    * FIXME: Not accurate anymore.
+    * Decrement local pin, but keep shared pin. The latter will be released
+    * upon completion of the IO. Otherwise the buffer could be recycled while
+    * the IO is ongoing.
+    *
+    * FIXME: Make this optional? It's only useful for fire-and-forget style
+    * IO.
+    */
+   if (!release_io)
+   {
+       uint32      buf_state;
+
+       buf_state = LockBufHdr(bufHdr);
+       buf_state += BUF_REFCOUNT_ONE;
+       UnlockBufHdr(bufHdr, buf_state);
+   }
+   else
+   {
+       PrivateRefCountEntry *ref;
+
+       ref = GetPrivateRefCountEntry(buf, false);
+       Assert(ref != NULL);
+       Assert(ref->refcount > 0);
+
+       ResourceOwnerForgetBuffer(CurrentResourceOwner, buf);
+       ref->refcount--;
+       ForgetPrivateRefCountEntry(ref);
+   }
+
+   ReadBufferInitRead(aio, reln->rd_smgr, forkNum, blockNum, buf, bufHdr, mode);
+
+   if (release_io)
+       pgaio_io_release(aio);
+
+   return buf;
+}
 
-/*
- * ReadBuffer_common -- common logic for all ReadBuffer variants
- *
- * *hit is set to true if the request was satisfied from shared buffer cache.
- */
 static Buffer
-ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+ReadBuffer_extend(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                  BlockNumber blockNum, ReadBufferMode mode,
-                 BufferAccessStrategy strategy, bool *hit)
+                 BufferAccessStrategy strategy, bool *hit, bool isLocalBuf)
 {
+   bool        found;
    BufferDesc *bufHdr;
    Block       bufBlock;
-   bool        found;
-   bool        isExtend;
-   bool        isLocalBuf = SmgrIsTemp(smgr);
 
    *hit = false;
 
-   /* Make sure we will have room to remember the buffer pin */
-   ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
-
-   isExtend = (blockNum == P_NEW);
-
    TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
                                       smgr->smgr_rnode.node.spcNode,
                                       smgr->smgr_rnode.node.dbNode,
                                       smgr->smgr_rnode.node.relNode,
                                       smgr->smgr_rnode.backend,
-                                      isExtend);
+                                      true);
+
+   /* Make sure we will have room to remember the buffer pin */
+   ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
 
-   /* Substitute proper block number if caller asked for P_NEW */
-   if (isExtend)
-       blockNum = smgrnblocks(smgr, forkNum);
+   /* Substitute proper block number */
+   blockNum = smgrnblocks(smgr, forkNum);
 
    if (isLocalBuf)
    {
        bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
-       if (found)
-           pgBufferUsage.local_blks_hit++;
-       else if (isExtend)
-           pgBufferUsage.local_blks_written++;
-       else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
-                mode == RBM_ZERO_ON_ERROR)
-           pgBufferUsage.local_blks_read++;
+       pgBufferUsage.local_blks_written++;
    }
    else
    {
-       /*
-        * lookup the buffer.  IO_IN_PROGRESS is set if the requested block is
-        * not currently in memory.
-        */
        bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
                             strategy, &found);
-       if (found)
-           pgBufferUsage.shared_blks_hit++;
-       else if (isExtend)
-           pgBufferUsage.shared_blks_written++;
-       else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
-                mode == RBM_ZERO_ON_ERROR)
-           pgBufferUsage.shared_blks_read++;
+       pgBufferUsage.shared_blks_written++;
    }
 
-   /* At this point we do NOT hold any locks. */
-
-   /* if it was already in the buffer pool, we're done */
    if (found)
    {
-       if (!isExtend)
-       {
-           /* Just need to update stats before we exit */
-           *hit = true;
-           VacuumPageHit++;
-
-           if (VacuumCostActive)
-               VacuumCostBalance += VacuumCostPageHit;
-
-           TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
-                                             smgr->smgr_rnode.node.spcNode,
-                                             smgr->smgr_rnode.node.dbNode,
-                                             smgr->smgr_rnode.node.relNode,
-                                             smgr->smgr_rnode.backend,
-                                             isExtend,
-                                             found);
-
-           /*
-            * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
-            * locked on return.
-            */
-           if (!isLocalBuf)
-           {
-               if (mode == RBM_ZERO_AND_LOCK)
-                   LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
-                                 LW_EXCLUSIVE);
-               else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
-                   LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
-           }
-
-           return BufferDescriptorGetBuffer(bufHdr);
-       }
+       Block       bufBlock;
 
        /*
         * We get here only in the corner case where we are trying to extend
@@ -859,13 +914,10 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                UnlockBufHdr(bufHdr, buf_state);
            } while (!StartBufferIO(bufHdr, true));
        }
+
    }
 
    /*
-    * if we have gotten to this point, we have allocated a buffer for the
-    * page but its contents are not yet valid.  IO_IN_PROGRESS is set for it,
-    * if it's a shared buffer.
-    *
     * Note: if smgrextend fails, we will end up with a buffer that is
     * allocated but not marked BM_VALID.  P_NEW will still select the same
     * block number (because the relation didn't get any longer on disk) and
@@ -877,68 +929,17 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
    bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
 
-   if (isExtend)
-   {
-       /* new buffers are zero-filled */
-       MemSet((char *) bufBlock, 0, BLCKSZ);
-       /* don't set checksum for all-zero page */
-       smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
-
-       /*
-        * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
-        * although we're essentially performing a write. At least on linux
-        * doing so defeats the 'delayed allocation' mechanism, leading to
-        * increased file fragmentation.
-        */
-   }
-   else
-   {
-       /*
-        * Read in the page, unless the caller intends to overwrite it and
-        * just wants us to allocate a buffer.
-        */
-       if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
-           MemSet((char *) bufBlock, 0, BLCKSZ);
-       else
-       {
-           instr_time  io_start,
-                       io_time;
-
-           if (track_io_timing)
-               INSTR_TIME_SET_CURRENT(io_start);
-
-           smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
+   /* new buffers are zero-filled */
+   MemSet((char *) bufBlock, 0, BLCKSZ);
+   /* don't set checksum for all-zero page */
+   smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
 
-           if (track_io_timing)
-           {
-               INSTR_TIME_SET_CURRENT(io_time);
-               INSTR_TIME_SUBTRACT(io_time, io_start);
-               pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
-               INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
-           }
-
-           /* check for garbage data */
-           if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
-                                       PIV_LOG_WARNING | PIV_REPORT_STAT))
-           {
-               if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
-               {
-                   ereport(WARNING,
-                           (errcode(ERRCODE_DATA_CORRUPTED),
-                            errmsg("invalid page in block %u of relation %s; zeroing out page",
-                                   blockNum,
-                                   relpath(smgr->smgr_rnode, forkNum))));
-                   MemSet((char *) bufBlock, 0, BLCKSZ);
-               }
-               else
-                   ereport(ERROR,
-                           (errcode(ERRCODE_DATA_CORRUPTED),
-                            errmsg("invalid page in block %u of relation %s",
-                                   blockNum,
-                                   relpath(smgr->smgr_rnode, forkNum))));
-           }
-       }
-   }
+   /*
+    * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
+    * although we're essentially performing a write. At least on linux
+    * doing so defeats the 'delayed allocation' mechanism, leading to
+    * increased file fragmentation.
+    */
 
    /*
     * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
@@ -956,18 +957,226 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
        LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
    }
 
+
+   TerminateBufferIO(bufHdr, isLocalBuf,
+                     /* syncio = */ true, /* clear_dirty = */ false,
+                     BM_VALID);
+
+   return BufferDescriptorGetBuffer(bufHdr);
+}
+
+static BufferDesc *
+ReadBuffer_start(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+                BlockNumber blockNum, ReadBufferMode mode,
+                BufferAccessStrategy strategy, bool *hit, bool isLocalBuf)
+{
+   BufferDesc *bufHdr;
+   bool        found;
+
+   Assert(blockNum != P_NEW);
+
+   *hit = false;
+
+   /* Make sure we will have room to remember the buffer pin */
+   ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+   TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
+                                      smgr->smgr_rnode.node.spcNode,
+                                      smgr->smgr_rnode.node.dbNode,
+                                      smgr->smgr_rnode.node.relNode,
+                                      smgr->smgr_rnode.backend,
+                                      false);
+
    if (isLocalBuf)
    {
-       /* Only need to adjust flags */
-       uint32      buf_state = pg_atomic_read_u32(&bufHdr->state);
+       bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
+       if (found)
+           pgBufferUsage.local_blks_hit++;
+       else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
+                mode == RBM_ZERO_ON_ERROR)
+           pgBufferUsage.local_blks_read++;
+   }
+   else
+   {
+       /*
+        * lookup the buffer.  IO_IN_PROGRESS is set if the requested block is
+        * not currently in memory.
+        */
+       bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
+                            strategy, &found);
+       if (found)
+           pgBufferUsage.shared_blks_hit++;
+       else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
+                mode == RBM_ZERO_ON_ERROR)
+           pgBufferUsage.shared_blks_read++;
+   }
 
-       buf_state |= BM_VALID;
-       pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+
+   /* At this point we do NOT hold any locks. */
+
+   /* if it was already in the buffer pool, we're done */
+   if (found)
+   {
+       /* Just need to update stats before we exit */
+       *hit = true;
+       VacuumPageHit++;
+
+       if (VacuumCostActive)
+           VacuumCostBalance += VacuumCostPageHit;
+
+       TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
+                                         smgr->smgr_rnode.node.spcNode,
+                                         smgr->smgr_rnode.node.dbNode,
+                                         smgr->smgr_rnode.node.relNode,
+                                         smgr->smgr_rnode.backend,
+                                         false,
+                                         found);
+
+       /*
+        * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
+        * locked on return.
+        */
+       if (!isLocalBuf)
+       {
+           if (mode == RBM_ZERO_AND_LOCK)
+               LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+                             LW_EXCLUSIVE);
+           else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
+               LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
+       }
+
+       return bufHdr;
+   }
+   else if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+   {
+       /*
+        * The caller intends to overwrite the page and just wants us to
+        * allocate a buffer. Finish IO here, so sync/async don't have to
+        * duplicate the logic.
+        */
+
+       Block       bufBlock;
+
+       bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
+
+       MemSet((char *) bufBlock, 0, BLCKSZ);
+
+       /*
+        * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
+        * the page as valid, to make sure that no other backend sees the zeroed
+        * page before the caller has had a chance to initialize it.
+        *
+        * Since no-one else can be looking at the page contents yet, there is no
+        * difference between an exclusive lock and a cleanup-strength lock. (Note
+        * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
+        * they assert that the buffer is already valid.)
+        */
+       if (!isLocalBuf)
+       {
+           LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
+       }
+
+       TerminateBufferIO(bufHdr, isLocalBuf,
+                         /* syncio = */ true, /* clear_dirty = */ false,
+                         BM_VALID);
+
+       *hit = true;
+   }
+
+   return bufHdr;
+}
+
+/*
+ * ReadBuffer_common -- common logic for all ReadBuffer variants
+ *
+ * *hit is set to true if the request was satisfied from shared buffer cache.
+ */
+static Buffer
+ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+                 BlockNumber blockNum, ReadBufferMode mode,
+                 BufferAccessStrategy strategy, bool *hit)
+{
+   Block       bufBlock;
+   bool        isLocalBuf = SmgrIsTemp(smgr);
+   BufferDesc *bufHdr;
+   Buffer      buf;
+   instr_time  io_start, io_time;
+
+   if (blockNum == P_NEW)
+       return ReadBuffer_extend(smgr, relpersistence, forkNum,
+                                blockNum, mode, strategy,
+                                hit, isLocalBuf);
+
+
+   bufHdr = ReadBuffer_start(smgr, relpersistence, forkNum,
+                             blockNum, mode, strategy,
+                             hit, isLocalBuf);
+
+   if (*hit)
+       return BufferDescriptorGetBuffer(bufHdr);
+
+   /*
+    * if we have gotten to this point, we have allocated a buffer for the
+    * page but its contents are not yet valid.  IO_IN_PROGRESS is set for it,
+    * if it's a shared buffer.
+    */
+   Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID));   /* spinlock not needed */
+
+   bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
+   buf = BufferDescriptorGetBuffer(bufHdr);
+
+   if (isLocalBuf || !io_data_force_async)
+   {
+       if (track_io_timing)
+           INSTR_TIME_SET_CURRENT(io_start);
+
+       smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
+
+       if (track_io_timing)
+       {
+           INSTR_TIME_SET_CURRENT(io_time);
+           INSTR_TIME_SUBTRACT(io_time, io_start);
+           pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
+           INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
+       }
+
+       /* check for garbage data */
+       if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
+                                   PIV_LOG_WARNING | PIV_REPORT_STAT))
+       {
+           if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
+           {
+               ereport(WARNING,
+                       (errcode(ERRCODE_DATA_CORRUPTED),
+                        errmsg("invalid page in block %u of relation %s; zeroing out page",
+                               blockNum,
+                               relpath(smgr->smgr_rnode, forkNum))));
+               MemSet((char *) bufBlock, 0, BLCKSZ);
+           }
+           else
+               ereport(ERROR,
+                       (errcode(ERRCODE_DATA_CORRUPTED),
+                        errmsg("invalid page in block %u of relation %s",
+                               blockNum,
+                               relpath(smgr->smgr_rnode, forkNum))));
+       }
+
+       TerminateBufferIO(bufHdr, isLocalBuf,
+                         /* syncio = */ true, /* clear_dirty = */ false,
+                         BM_VALID);
    }
    else
    {
-       /* Set BM_VALID, terminate IO, and wake up any waiters */
-       TerminateBufferIO(bufHdr, false, BM_VALID);
+       PgAioInProgress* aio = pgaio_io_get();
+       uint32      buf_state;
+
+       buf_state = LockBufHdr(bufHdr);
+       buf_state += BUF_REFCOUNT_ONE;
+       UnlockBufHdr(bufHdr, buf_state);
+
+       ReadBufferInitRead(aio, smgr, forkNum, blockNum, buf, bufHdr, mode);
+       pgaio_io_wait(aio);
+       pgaio_io_release(aio);
    }
 
    VacuumPageMiss++;
@@ -979,12 +1188,91 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                                      smgr->smgr_rnode.node.dbNode,
                                      smgr->smgr_rnode.node.relNode,
                                      smgr->smgr_rnode.backend,
-                                     isExtend,
-                                     found);
+                                     false,
+                                     false);
 
    return BufferDescriptorGetBuffer(bufHdr);
 }
 
+void
+ReadBufferCompleteRead(Buffer buffer, const AioBufferTag *tag, char *bufdata, int mode, bool failed)
+{
+   /* FIXME: implement track_io_timing */
+
+   if (!failed)
+   {
+       Block       bufBlock = (Block) bufdata;
+       BlockNumber blockNum = tag->blockNum;
+
+       /* check for garbage data */
+       if (!PageIsVerified((Page) bufdata, blockNum))
+       {
+           RelFileNode rnode = tag->rnode.node;
+           BlockNumber forkNum = tag->forkNum;
+
+           failed = true;
+
+           if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
+           {
+               ereport(WARNING,
+                       (errcode(ERRCODE_DATA_CORRUPTED),
+                        errmsg("invalid page in block %u of relation %s; zeroing out page",
+                               blockNum,
+                               relpathperm(rnode, forkNum))));
+               MemSet((char *) bufBlock, 0, BLCKSZ);
+           }
+           else
+           {
+               ereport(ERROR,
+                       (errcode(ERRCODE_DATA_CORRUPTED),
+                        errmsg("invalid page in block %u of relation %s",
+                               blockNum,
+                               relpathperm(rnode, forkNum))));
+           }
+       }
+   }
+
+   if (BufferIsValid(buffer))
+   {
+       bool        islocal = BufferIsLocal(buffer);
+       BufferDesc *bufHdr;
+
+       if (islocal)
+           bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+       else
+           bufHdr = GetBufferDescriptor(buffer - 1);
+
+       TerminateBufferIO(bufHdr, islocal,
+                         /* syncio = */ false, /* clear_dirty = */ false,
+                         failed ? BM_IO_ERROR : BM_VALID);
+   }
+}
+
+void
+ReadBufferCompleteWrite(Buffer buffer, bool failed, bool release_lock)
+{
+   BufferDesc *bufHdr;
+   bool        islocal = BufferIsLocal(buffer);
+
+   if (islocal)
+       bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+   else
+       bufHdr = GetBufferDescriptor(buffer - 1);
+
+   /* FIXME: implement track_io_timing */
+
+   TerminateBufferIO(bufHdr, islocal,
+                     /* syncio = */ false, /* clear_dirty = */ true,
+                     failed ? BM_IO_ERROR : 0);
+
+   /*
+    * The initiator of IO is not managing the lock (i.e. called
+    * LWLockReleaseOwnership()), we are.
+    */
+   if (release_lock)
+       LWLockReleaseUnowned(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+}
+
 /*
  * BufferAlloc -- subroutine for ReadBuffer.  Handles lookup of a shared
  *     buffer.  If no buffer exists already, selects a replacement
@@ -1022,6 +1310,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
    bool        valid;
    uint32      buf_state;
 
+   Assert(blockNum != P_NEW);
+
    /* create a tag so we can lookup the buffer */
    INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
 
@@ -1432,30 +1722,402 @@ retry:
        goto retry;
    }
 
-   /*
-    * Clear out the buffer's tag and flags.  We must do this to ensure that
-    * linear scans of the buffer array don't think the buffer is valid.
-    */
-   oldFlags = buf_state & BUF_FLAG_MASK;
-   CLEAR_BUFFERTAG(buf->tag);
-   buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
-   UnlockBufHdr(buf, buf_state);
+   /*
+    * Clear out the buffer's tag and flags.  We must do this to ensure that
+    * linear scans of the buffer array don't think the buffer is valid.
+    */
+   oldFlags = buf_state & BUF_FLAG_MASK;
+   CLEAR_BUFFERTAG(buf->tag);
+   buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+   UnlockBufHdr(buf, buf_state);
+
+   /*
+    * Remove the buffer from the lookup hashtable, if it was in there.
+    */
+   if (oldFlags & BM_TAG_VALID)
+       BufTableDelete(&oldTag, oldHash);
+
+   /*
+    * Done with mapping lock.
+    */
+   LWLockRelease(oldPartitionLock);
+
+   /*
+    * Insert the buffer at the head of the list of free buffers.
+    */
+   StrategyFreeBuffer(buf);
+}
+
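+/*
+ * Try to strip a pinned, clean victim buffer of its old identity so that it
+ * can be reused for relation extension. Returns false if the buffer has been
+ * pinned by somebody else or dirtied in the meantime.
+ */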
+static bool
+bulk_extend_buffer_inval(BufferDesc *buf_hdr)
+{
+   uint32      buf_state = pg_atomic_read_u32(&buf_hdr->state);
+
+   Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+   Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) > 0);
+
+   /* can't change while we're holding the pin */
+   if (buf_state & BM_TAG_VALID)
+   {
+       uint32      hash;
+       LWLock     *partition_lock;
+       BufferTag   tag;
+       uint32      old_flags;
+
+       /* have buffer pinned, so it's safe to read tag without lock */
+       tag = buf_hdr->tag;
+       hash = BufTableHashCode(&tag);
+       partition_lock = BufMappingPartitionLock(hash);
+
+       LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+
+       /* lock the buffer header */
+       buf_state = LockBufHdr(buf_hdr);
+       old_flags = buf_state & BUF_FLAG_MASK;
+
+       /*
+        * If somebody else pinned the buffer since, or even worse, dirtied it,
+        * give up on this buffer: It's clearly in use.
+        */
+       if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
+       {
+           Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+
+           UnlockBufHdr(buf_hdr, buf_state);
+           LWLockRelease(partition_lock);
+
+           return false;
+       }
+
+       /*
+        * Clear out the buffer's tag and flags.  We must do this to ensure that
+        * linear scans of the buffer array don't think the buffer is valid.
+        */
+       CLEAR_BUFFERTAG(buf_hdr->tag);
+       buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+       UnlockBufHdr(buf_hdr, buf_state);
+
+       if (old_flags & BM_TAG_VALID)
+           BufTableDelete(&tag, hash);
+
+       LWLockRelease(partition_lock);
+   }
+
+   return true;
+}
+
+
+typedef struct BulkExtendOneBuffer
+{
+   BufferDesc *buf_hdr;
+   dlist_node node;
+} BulkExtendOneBuffer;
+
+typedef struct BulkExtendBufferedState
+{
+   int acquired_buffers_count;
+   int pending_buffers_count;
+
+   dlist_head acquired_buffers;
+   dlist_head pending_buffers;
+
+   dlist_head allocated_buffers;
+
+   pg_streaming_write *pgsw;
+
+   BulkExtendOneBuffer ios[];
+} BulkExtendBufferedState;
+
+static void
+bulk_extend_undirty_complete(void *pgsw_private, void *write_private)
+{
+   BulkExtendBufferedState *be_state = (BulkExtendBufferedState * ) pgsw_private;
+   BulkExtendOneBuffer *ex_buf = (BulkExtendOneBuffer *) write_private;
+   BufferTag   tag;
+
+   /* the buffer lock has already been released by ReadBufferCompleteWrite */
+
+   tag = ex_buf->buf_hdr->tag;
+   be_state->pending_buffers_count--;
+   dlist_delete_from(&be_state->pending_buffers, &ex_buf->node);
+
+   if (bulk_extend_buffer_inval(ex_buf->buf_hdr))
+   {
+       dlist_push_head(&be_state->acquired_buffers, &ex_buf->node);
+       be_state->acquired_buffers_count++;
+   }
+   else
+   {
+       dlist_push_tail(&be_state->allocated_buffers, &ex_buf->node);
+       UnpinBuffer(ex_buf->buf_hdr, true);
+   }
+
+   ScheduleBufferTagForWriteback(&BackendWritebackContext, &tag);
+}
+
+/*
+ * WIP interface to more efficient relation extension.
+ *
+ * Todo:
+ *
+ * - Write initialized buffers - otherwise we'll waste a lot of time doing
+ *   another set of memsets at PageInit(), as well as making PageIsVerified()
+ *   a lot more expensive (verifying all-zeroes).
+ * - De-duplication of work between concurrent extensions?
+ * - Chunking, to avoid pinning quite as many buffers at once
+ * - Strategy integration
+ * - cleanup
+ *
+ */
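+/*
+ * Illustrative sketch only (not prescriptive): a caller could use this
+ * roughly as follows, relying on the returned buffer being pinned and
+ * exclusively locked (see the end of this function); the extension size of
+ * 64 is an arbitrary example:
+ *
+ *     Buffer  buf = BulkExtendBuffered(rel, MAIN_FORKNUM, 64, NULL);
+ *     Page    page = BufferGetPage(buf);
+ *
+ *     PageInit(page, BufferGetPageSize(buf), 0);
+ *     ... place data on the page ...
+ *     MarkBufferDirty(buf);
+ *     UnlockReleaseBuffer(buf);
+ */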
+extern Buffer
+BulkExtendBuffered(Relation relation, ForkNumber forkNum, int extendby, BufferAccessStrategy strategy)
+{
+   BulkExtendBufferedState *be_state;
+   bool        need_extension_lock = !RELATION_IS_LOCAL(relation);
+   BlockNumber start_nblocks;
+   SMgrRelation smgr;
+   BufferDesc *return_buf_hdr = NULL;
+   char relpersistence = relation->rd_rel->relpersistence;
+   dlist_iter iter;
+   BlockNumber extendto;
+   bool first;
+
+   RelationOpenSmgr(relation);
+   smgr = relation->rd_smgr;
+
+   be_state = palloc0(offsetof(BulkExtendBufferedState, ios) + sizeof(BulkExtendOneBuffer) * extendby);
+
+   dlist_init(&be_state->acquired_buffers);
+   dlist_init(&be_state->pending_buffers);
+   dlist_init(&be_state->allocated_buffers);
+
+   for (int i = 0; i < extendby; i++)
+   {
+       dlist_push_tail(&be_state->allocated_buffers, &be_state->ios[i].node);
+   }
+
+   be_state->pgsw = pg_streaming_write_alloc(128, be_state, bulk_extend_undirty_complete);
+
+   while (be_state->acquired_buffers_count < extendby)
+   {
+       uint32 cur_old_flags;
+       uint32 cur_buf_state;
+       BufferDesc *cur_buf_hdr;
+       BulkExtendOneBuffer *cur_ex_buf = NULL;
+       bool buffer_usable;
+       bool buffer_io;
+
+       /*
+        * If there are buffers being written out that may or may not become
+        * available (depending on whether they have been pinned concurrently),
+        * wait for all of those to finish, and check again.
+        */
+       if ((be_state->acquired_buffers_count + be_state->pending_buffers_count) >= extendby)
+       {
+           pg_streaming_write_wait_all(be_state->pgsw);
+           continue;
+       }
+
+       Assert(!dlist_is_empty(&be_state->allocated_buffers));
+       cur_ex_buf = dlist_container(BulkExtendOneBuffer, node,
+                                    dlist_pop_head_node(&be_state->allocated_buffers));
+
+       ReservePrivateRefCountEntry();
+       ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+       cur_buf_hdr = StrategyGetBuffer(NULL, &cur_buf_state);
+       cur_ex_buf->buf_hdr = cur_buf_hdr;
+
+       Assert(BUF_STATE_GET_REFCOUNT(cur_buf_state) == 0);
+
+       /* Must copy buffer flags while we still hold the spinlock */
+       cur_old_flags = cur_buf_state & BUF_FLAG_MASK;
+
+       /* Pin the buffer and then release the buffer spinlock */
+       PinBuffer_Locked(cur_buf_hdr);
+
+       if (cur_old_flags & BM_DIRTY)
+       {
+           LWLock *content_lock;
+           PgAioInProgress *aio;
+
+           content_lock = BufferDescriptorGetContentLock(cur_buf_hdr);
+
+           /*
+            * NB: this protects against deadlocks due to holding multiple
+            * buffer locks, and also avoids unnecessary blocking (see
+            * BufferAlloc() for the latter).
+            */
+           if (LWLockConditionalAcquire(content_lock, LW_SHARED))
+           {
+               aio = pg_streaming_write_get_io(be_state->pgsw);
+
+               // XXX: could use strategy reject logic here too
+               if (AsyncFlushBuffer(aio, cur_buf_hdr, NULL))
+               {
+                   buffer_usable = true;
+                   buffer_io = true;
+
+                   pg_streaming_write_write(be_state->pgsw, aio, cur_ex_buf);
+               }
+               else
+               {
+                   buffer_io = false;
+
+                   if (!bulk_extend_buffer_inval(cur_buf_hdr))
+                       buffer_usable = false;
+                   else
+                       buffer_usable = true;
+
+                   LWLockRelease(content_lock);
+               }
+           }
+           else
+           {
+               /*
+                * Someone else has locked the buffer, so give it up and loop
+                * back to get another one.
+                */
+               buffer_usable = false;
+               buffer_io = false;
+           }
+       }
+       else
+       {
+           /*
+            * This buffer can be used, unless it's getting pinned
+            * concurrently.
+            */
+           buffer_io = false;
+
+           if (!bulk_extend_buffer_inval(cur_buf_hdr))
+               buffer_usable = false;
+           else
+               buffer_usable = true;
+       }
+
+       if (buffer_io)
+       {
+           dlist_push_tail(&be_state->pending_buffers, &cur_ex_buf->node);
+           be_state->pending_buffers_count++;
+       }
+       else if (buffer_usable)
+       {
+           dlist_push_head(&be_state->acquired_buffers, &cur_ex_buf->node);
+           be_state->acquired_buffers_count++;
+       }
+       else
+       {
+           UnpinBuffer(cur_ex_buf->buf_hdr, true);
+           dlist_push_tail(&be_state->allocated_buffers, &cur_ex_buf->node);
+       }
+   }
+
+   // FIXME: shouldn't be needed
+   pg_streaming_write_wait_all(be_state->pgsw);
+   pg_streaming_write_free(be_state->pgsw);
 
    /*
-    * Remove the buffer from the lookup hashtable, if it was in there.
+    * We now have our hands on N buffers that are guaranteed to be clean
+    * (since they are pinned they cannot be reused by other backends).
+    *
+    * Next, acquire the extension lock, extend the relation, and point the
+    * victim buffers acquired above at the newly extended part of the relation.
     */
-   if (oldFlags & BM_TAG_VALID)
-       BufTableDelete(&oldTag, oldHash);
+   if (need_extension_lock)
+       LockRelationForExtension(relation, ExclusiveLock);
 
-   /*
-    * Done with mapping lock.
-    */
-   LWLockRelease(oldPartitionLock);
+   start_nblocks = smgrnblocks(relation->rd_smgr, MAIN_FORKNUM);
+   extendto = start_nblocks;
 
    /*
-    * Insert the buffer at the head of the list of free buffers.
+    * Set up the identities of all the new buffers first, so that there is no
+    * window in which another backend could find and lock the returned page
+    * before we do.
     */
-   StrategyFreeBuffer(buf);
+   first = true;
+   dlist_foreach(iter, &be_state->acquired_buffers)
+   {
+       BulkExtendOneBuffer *ex_buf = dlist_container(BulkExtendOneBuffer, node, iter.cur);
+       BufferDesc *new_buf_hdr = ex_buf->buf_hdr;
+       BufferTag   new_tag;
+       uint32      new_hash;
+       int         existing_buf;
+       uint32      buf_state;
+       LWLock     *partition_lock;
+
+       Assert(extendto < start_nblocks + extendby);
+       Assert(ex_buf->buf_hdr);
+
+       INIT_BUFFERTAG(new_tag, smgr->smgr_rnode.node, forkNum, extendto);
+       new_hash = BufTableHashCode(&new_tag);
+
+       partition_lock = BufMappingPartitionLock(new_hash);
+       LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+
+       existing_buf = BufTableInsert(&new_tag, new_hash, new_buf_hdr->buf_id);
+       if (existing_buf >= 0)
+       {
+           /* FIXME: This is probably possible when extension fails due to ENOSPC or such */
+           elog(ERROR, "buffer beyond EOF");
+       }
+
+       /* lock to install new identity */
+       buf_state = LockBufHdr(new_buf_hdr);
+
+       buf_state |= BM_TAG_VALID | BM_IO_IN_PROGRESS | BUF_USAGECOUNT_ONE * BM_MAX_USAGE_COUNT;
+
+       if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
+           buf_state |= BM_PERMANENT;
+
+       new_buf_hdr->tag = new_tag;
+       UnlockBufHdr(new_buf_hdr, buf_state);
+
+       LWLockRelease(partition_lock);
+
+       if (first)
+       {
+           return_buf_hdr = ex_buf->buf_hdr;
+           first = false;
+       }
+
+       extendto++;
+   }
+   Assert(extendto == start_nblocks + extendby);
+
+   /* finally extend the relation */
+   smgrzeroextend(relation->rd_smgr, forkNum, start_nblocks,
+                  extendby, false);
+
+   /* Ensure that the returned buffer cannot be reached by another backend first */
+   LWLockAcquire(BufferDescriptorGetContentLock(return_buf_hdr), LW_EXCLUSIVE);
+
+   /* Mark all buffers as having completed */
+   dlist_foreach(iter, &be_state->acquired_buffers)
+   {
+       BulkExtendOneBuffer *ex_buf = dlist_container(BulkExtendOneBuffer, node, iter.cur);
+       BufferDesc *new_buf_hdr = ex_buf->buf_hdr;
+       Block       new_buf_block;
+       uint32      buf_state;
+
+       new_buf_block = BufHdrGetBlock(new_buf_hdr);
+
+       /* new buffers are zero-filled */
+       memset((char *) __builtin_assume_aligned(new_buf_block, 4096), 0, BLCKSZ);
+
+       buf_state = LockBufHdr(new_buf_hdr);
+       buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
+       buf_state |= BM_VALID;
+       UnlockBufHdr(new_buf_hdr, buf_state);
+       ConditionVariableBroadcast(BufferDescriptorGetIOCV(new_buf_hdr));
+
+       if (new_buf_hdr != return_buf_hdr)
+           UnpinBuffer(new_buf_hdr, true);
+   }
+
+   if (need_extension_lock)
+       UnlockRelationForExtension(relation, ExclusiveLock);
+
+   return BufferDescriptorGetBuffer(return_buf_hdr);
 }
 
 /*
@@ -1822,6 +2484,79 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
    }
 }
 
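+/*
+ * Streaming-write completion callback for BufferSync() / BgBufferSync():
+ * drop the pin taken for the IO and schedule the buffer for writeback.
+ */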
+static void
+buffer_sync_complete(void *pgsw_private, void *write_private)
+{
+   WritebackContext *wb_context = (WritebackContext *) pgsw_private;
+   BufferDesc *bufHdr = (BufferDesc *) write_private;
+   BufferTag   tag;
+
+   /* the buffer lock has already been released by ReadBufferCompleteWrite */
+
+   tag = bufHdr->tag;
+   UnpinBuffer(bufHdr, true);
+
+   if (wb_context)
+       ScheduleBufferTagForWriteback(wb_context, &tag);
+}
+
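+/*
+ * Write out a single buffer for BufferSync(), using the streaming write
+ * machinery. Returns true iff a write was started for the buffer.
+ */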
+static bool
+BufferSyncWriteOne(pg_streaming_write *pgsw, BufferDesc *bufHdr)
+{
+   uint32 buf_state;
+   bool did_write = false;
+
+   ReservePrivateRefCountEntry();
+   ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+   buf_state = LockBufHdr(bufHdr);
+
+   if ((buf_state & BM_VALID) && (buf_state & BM_DIRTY))
+   {
+       LWLock *content_lock;
+       PgAioInProgress *aio;
+
+       PinBuffer_Locked(bufHdr);
+
+       content_lock = BufferDescriptorGetContentLock(bufHdr);
+
+       aio = pg_streaming_write_get_io(pgsw);
+
+       /*
+        * If there are pre-existing IOs in flight, we can't block on the
+        * content lock, as that could lead to a deadlock. Try to acquire the
+        * lock conditionally; if that fails, first wait for the outstanding
+        * IO, and only then block on acquiring the lock.
+        */
+       if (pg_streaming_write_inflight(pgsw) > 0 &&
+           LWLockConditionalAcquire(content_lock, LW_SHARED))
+       {
+       }
+       else
+       {
+           pg_streaming_write_wait_all(pgsw);
+           LWLockAcquire(content_lock, LW_SHARED);
+       }
+
+       if (AsyncFlushBuffer(aio, bufHdr, NULL))
+       {
+           pg_streaming_write_write(pgsw, aio, bufHdr);
+           BgWriterStats.m_buf_written_checkpoints++;
+           did_write = true;
+       }
+       else
+       {
+           LWLockRelease(content_lock);
+           UnpinBuffer(bufHdr, true);
+       }
+   }
+   else
+   {
+       UnlockBufHdr(bufHdr, buf_state);
+   }
+
+   return did_write;
+}
+
 /*
  * BufferSync -- Write out all dirty buffers in the pool.
  *
@@ -1846,6 +2581,7 @@ BufferSync(int flags)
    binaryheap *ts_heap;
    int         i;
    int         mask = BM_DIRTY;
+   pg_streaming_write *pgsw;
    WritebackContext wb_context;
 
    /* Make sure we can handle the pin inside SyncOneBuffer */
@@ -1860,6 +2596,8 @@ BufferSync(int flags)
                    CHECKPOINT_FLUSH_ALL))))
        mask |= BM_PERMANENT;
 
+   elog(DEBUG1, "checkpoint looking at buffers");
+
    /*
     * Loop over all buffers, and mark the ones that need to be written with
     * BM_CHECKPOINT_NEEDED.  Count them as we go (num_to_scan), so that we
@@ -1913,8 +2651,12 @@ BufferSync(int flags)
 
    WritebackContextInit(&wb_context, &checkpoint_flush_after);
 
+   pgsw = pg_streaming_write_alloc(128, &wb_context, buffer_sync_complete);
+
    TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
 
+   elog(DEBUG1, "checkpoint predicts to write %u buffers", num_to_scan);
+
    /*
     * Sort buffers that need to be written to reduce the likelihood of random
     * IO. The sorting is also important for the implementation of balancing
@@ -1927,6 +2669,8 @@ BufferSync(int flags)
 
    num_spaces = 0;
 
+   elog(DEBUG1, "checkpoint done sorting");
+
    /*
     * Allocate progress status for each tablespace with buffers that need to
     * be flushed. This requires the to-be-flushed array to be sorted.
@@ -2012,6 +2756,8 @@ BufferSync(int flags)
 
    binaryheap_build(ts_heap);
 
+   elog(DEBUG1, "checkpoint done heaping");
+
    /*
     * Iterate through to-be-checkpointed buffers and write the ones (still)
     * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
@@ -2047,7 +2793,7 @@ BufferSync(int flags)
         */
        if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
        {
-           if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
+           if (BufferSyncWriteOne(pgsw, bufHdr))
            {
                TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
                BgWriterStats.m_buf_written_checkpoints++;
@@ -2082,6 +2828,9 @@ BufferSync(int flags)
        CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
    }
 
+   pg_streaming_write_wait_all(pgsw);
+   pg_streaming_write_free(pgsw);
+
    /* issue all pending flushes */
    IssuePendingWritebacks(&wb_context);
 
@@ -2153,6 +2902,8 @@ BgBufferSync(WritebackContext *wb_context)
    long        new_strategy_delta;
    uint32      new_recent_alloc;
 
+   pg_streaming_write *pgsw;
+
    /*
     * Find out where the freelist clock sweep currently is, and how many
     * buffer allocations have happened since our last call.
@@ -2318,6 +3069,8 @@ BgBufferSync(WritebackContext *wb_context)
        upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
    }
 
+   pgsw = pg_streaming_write_alloc(128, wb_context, buffer_sync_complete);
+
    /*
     * Now write out dirty reusable buffers, working forward from the
     * next_to_clean point, until we have lapped the strategy scan, or cleaned
@@ -2335,8 +3088,8 @@ BgBufferSync(WritebackContext *wb_context)
    /* Execute the LRU scan */
    while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
    {
-       int         sync_state = SyncOneBuffer(next_to_clean, true,
-                                              wb_context);
+       int         sync_state =
+           BgBufferSyncWriteOne(next_to_clean, true, pgsw);
 
        if (++next_to_clean >= NBuffers)
        {
@@ -2358,6 +3111,9 @@ BgBufferSync(WritebackContext *wb_context)
            reusable_buffers++;
    }
 
+   pg_streaming_write_wait_all(pgsw);
+   pg_streaming_write_free(pgsw);
+
    BgWriterStats.m_buf_written_clean += num_written;
 
 #ifdef BGW_DEBUG
@@ -2413,14 +3169,17 @@ BgBufferSync(WritebackContext *wb_context)
  * Note: caller must have done ResourceOwnerEnlargeBuffers.
  */
 static int
-SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
+BgBufferSyncWriteOne(int buf_id, bool skip_recently_used,
+                    pg_streaming_write *pgsw)
 {
    BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
    int         result = 0;
    uint32      buf_state;
-   BufferTag   tag;
+   LWLock *content_lock;
+   PgAioInProgress *aio;
 
    ReservePrivateRefCountEntry();
+   ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
 
    /*
     * Check whether buffer needs writing.
@@ -2452,22 +3211,37 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
        return result;
    }
 
-   /*
-    * Pin it, share-lock it, write it.  (FlushBuffer will do nothing if the
-    * buffer is clean by the time we've locked it.)
-    */
    PinBuffer_Locked(bufHdr);
-   LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
-
-   FlushBuffer(bufHdr, NULL);
 
-   LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+   aio = pg_streaming_write_get_io(pgsw);
 
-   tag = bufHdr->tag;
+   content_lock = BufferDescriptorGetContentLock(bufHdr);
 
-   UnpinBuffer(bufHdr, true);
+   /*
+    * If there are pre-existing IOs in flight, we can't block on the
+    * content lock, as that could lead to a deadlock. Try to acquire the
+    * lock conditionally; if that fails, first wait for the outstanding
+    * IO, and only then block on acquiring the lock.
+    */
+   if (pg_streaming_write_inflight(pgsw) > 0 &&
+       LWLockConditionalAcquire(content_lock, LW_SHARED))
+   {
+   }
+   else
+   {
+       pg_streaming_write_wait_all(pgsw);
+       LWLockAcquire(content_lock, LW_SHARED);
+   }
 
-   ScheduleBufferTagForWriteback(wb_context, &tag);
+   if (AsyncFlushBuffer(aio, bufHdr, NULL))
+   {
+       pg_streaming_write_write(pgsw, aio, bufHdr);
+       result |= BUF_WRITTEN;
+   }
+   else
+   {
+       LWLockRelease(content_lock);
+       UnpinBuffer(bufHdr, true);
+   }
 
-   return result | BUF_WRITTEN;
+   return result;
 }
@@ -2584,7 +3358,6 @@ CheckForBufferLeaks(void)
            PrintBufferLeakWarning(res->buffer);
            RefCountErrors++;
        }
-
    }
 
    Assert(RefCountErrors == 0);
@@ -2731,6 +3504,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
    Block       bufBlock;
    char       *bufToWrite;
    uint32      buf_state;
+   PgAioBounceBuffer *bb;
 
    /*
     * Try to start an I/O operation.  If StartBufferIO returns false, then
@@ -2788,6 +3562,8 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
    if (buf_state & BM_PERMANENT)
        XLogFlush(recptr);
 
+   pgBufferUsage.shared_blks_written++;
+
    /*
     * Now it's safe to write buffer to disk. Note that no one else should
     * have been able to write it while we were busy with log flushing because
@@ -2800,35 +3576,68 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
     * buffer, other processes might be updating hint bits in it, so we must
     * copy the page to private storage if we do checksumming.
     */
-   bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
+   bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum,
+                                    io_data_force_async ? &bb : NULL);
 
-   if (track_io_timing)
-       INSTR_TIME_SET_CURRENT(io_start);
+   if (!io_data_force_async)
+   {
+       if (track_io_timing)
+           INSTR_TIME_SET_CURRENT(io_start);
 
-   /*
-    * bufToWrite is either the shared buffer or a copy, as appropriate.
-    */
-   smgrwrite(reln,
-             buf->tag.forkNum,
-             buf->tag.blockNum,
-             bufToWrite,
-             false);
+       /*
+        * bufToWrite is either the shared buffer or a copy, as appropriate.
+        */
+       smgrwrite(reln,
+                 buf->tag.forkNum,
+                 buf->tag.blockNum,
+                 bufToWrite,
+                 false);
 
-   if (track_io_timing)
-   {
-       INSTR_TIME_SET_CURRENT(io_time);
-       INSTR_TIME_SUBTRACT(io_time, io_start);
-       pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
-       INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+       if (track_io_timing)
+       {
+           INSTR_TIME_SET_CURRENT(io_time);
+           INSTR_TIME_SUBTRACT(io_time, io_start);
+           pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+           INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+       }
+
+       /*
+        * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
+        * end the BM_IO_IN_PROGRESS state.
+        */
+       TerminateSharedBufferIO(buf, /* syncio = */ true, /* clear_dirty = */ true, 0);
    }
+   else
+   {
+       PgAioInProgress *aio = pgaio_io_get();
+       uint32      buf_state;
 
-   pgBufferUsage.shared_blks_written++;
+       buf_state = LockBufHdr(buf);
+       buf_state += BUF_REFCOUNT_ONE;
+       UnlockBufHdr(buf, buf_state);
 
-   /*
-    * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
-    * end the BM_IO_IN_PROGRESS state.
-    */
-   TerminateBufferIO(buf, true, 0);
+       /* FIXME: improve */
+       InProgressBuf = NULL;
+
+       if (bb)
+       {
+           pgaio_assoc_bounce_buffer(aio, bb);
+           pgaio_bounce_buffer_release(bb);
+       }
+
+       pgaio_io_ref(aio, &buf->io_in_progress);
+
+       smgrstartwrite(aio,
+                      reln,
+                      buf->tag.forkNum,
+                      buf->tag.blockNum,
+                      bufToWrite,
+                      BufferDescriptorGetBuffer(buf),
+                      /* skipFsync = */ false,
+                      /* release_lock = */ false);
+       pgaio_io_wait(aio);
+       pgaio_io_release(aio);
+   }
 
    TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
                                       buf->tag.blockNum,
@@ -2840,6 +3649,100 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
    error_context_stack = errcallback.previous;
 }
 
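+/*
+ * Asynchronous counterpart to FlushBuffer(): start writing out a shared
+ * buffer using the passed-in AIO handle. Returns false if somebody else
+ * flushed the buffer first (i.e. StartBufferIO() failed); on success the IO
+ * has been started and ownership of the content lock has been handed over to
+ * the AIO completion path.
+ */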
+static bool
+AsyncFlushBuffer(PgAioInProgress *aio, BufferDesc *buf, SMgrRelation reln)
+{
+   XLogRecPtr  recptr;
+   Block       bufBlock;
+   char       *bufToWrite;
+   uint32      buf_state;
+   PgAioBounceBuffer *bb;
+
+   Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), LW_SHARED));
+
+   /*
+    * Try to start an I/O operation.  If StartBufferIO returns false, then
+    * someone else flushed the buffer before we could, so we need not do
+    * anything.
+    */
+   if (!StartBufferIO(buf, false))
+       return false;
+
+   /* Find smgr relation for buffer */
+   if (reln == NULL)
+       reln = smgropen(buf->tag.rnode, InvalidBackendId);
+
+   buf_state = LockBufHdr(buf);
+
+   /*
+    * Run PageGetLSN while holding header lock, since we don't have the
+    * buffer locked exclusively in all cases.
+    */
+   recptr = BufferGetLSN(buf);
+
+   /* To check if block content changes while flushing. - vadim 01/17/97 */
+   buf_state &= ~BM_JUST_DIRTIED;
+
+   /* ownership while in AIO subsystem */
+   buf_state += BUF_REFCOUNT_ONE;
+
+   UnlockBufHdr(buf, buf_state);
+
+   if (buf_state & BM_PERMANENT)
+       XLogFlush(recptr);
+
+   bufBlock = BufHdrGetBlock(buf);
+
+   bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum, &bb);
+
+   /* FIXME: improve */
+   InProgressBuf = NULL;
+
+   pgBufferUsage.shared_blks_written++;
+
+   if (bb)
+   {
+       pgaio_assoc_bounce_buffer(aio, bb);
+       pgaio_bounce_buffer_release(bb);
+   }
+
+   pgaio_io_ref(aio, &buf->io_in_progress);
+
+   /*
+    * Ask the completion routine to release the lock for us. That's important
+    * for two reasons:
+    *
+    * 1) It allows other backends to move the block into a modifiable state
+    *    by completing the IO, avoiding some deadlock risks. Otherwise this
+    *    process would need to ensure the IO is completed before being
+    *    allowed to sleep.
+    *
+    * 2) For processes doing lots of streaming writes (e.g. checkpointer) the
+    *    lwlock ownership management turns out to be very expensive, because
+    *    lwlocks acquired earlier are also likely to be released earlier,
+    *    leading to held_lwlocks needing to be shifted around.
+    *
+    * This is safe because we only release the lock ownership once the AIO
+    * subsystem has successfully started tracking the IO.
+    */
+
+   smgrstartwrite(aio, reln,
+                  buf->tag.forkNum, buf->tag.blockNum,
+                  bufToWrite,
+                  BufferDescriptorGetBuffer(buf),
+                  /* skipFsync = */ false,
+                  /* release_lock = */ true);
+
+   /*
+    * XXX: The lock ownership release should probably be moved into the AIO
+    * layer, so we correctly handle errors happening during IO submission.
+    */
+   LWLockReleaseOwnership(BufferDescriptorGetContentLock(buf));
+   RESUME_INTERRUPTS();
+
+   return true;
+}
+
 /*
  * RelationGetNumberOfBlocksInFork
  *     Determines the current number of pages in the specified relation fork.
@@ -4104,6 +5007,7 @@ WaitIO(BufferDesc *buf)
    for (;;)
    {
        uint32      buf_state;
+       PgAioIoRef  aio_ref;
 
        /*
         * It may not be necessary to acquire the spinlock to check the flag
@@ -4111,10 +5015,19 @@ WaitIO(BufferDesc *buf)
         * play it safe.
         */
        buf_state = LockBufHdr(buf);
+       aio_ref = buf->io_in_progress;
        UnlockBufHdr(buf, buf_state);
 
        if (!(buf_state & BM_IO_IN_PROGRESS))
            break;
+
+       if (pgaio_io_ref_valid(&aio_ref))
+       {
+           pgaio_io_wait_ref(&aio_ref, true);
+           ConditionVariablePrepareToSleep(cv);
+           continue;
+       }
+
        ConditionVariableSleep(cv, WAIT_EVENT_BUFFILE_WAITIO);
    }
    ConditionVariableCancelSleep();
@@ -4173,8 +5086,26 @@ StartBufferIO(BufferDesc *buf, bool forInput)
    return true;
 }
 
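+/*
+ * Wrapper around TerminateSharedBufferIO() that also handles local
+ * (temporary-relation) buffers, which only need their flag bits adjusted.
+ */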
+static void
+TerminateBufferIO(BufferDesc *bufHdr, bool local, bool syncio,
+                 bool clear_dirty, uint32 set_flag_bits)
+{
+   if (local)
+   {
+       /* Only need to adjust flags */
+       uint32      buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+       buf_state |= set_flag_bits;
+       pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+   }
+   else
+   {
+       TerminateSharedBufferIO(bufHdr, syncio, clear_dirty, set_flag_bits);
+   }
+}
+
 /*
- * TerminateBufferIO: release a buffer we were doing I/O on
+ * TerminateSharedBufferIO: release a buffer we were doing I/O on
  * (Assumptions)
  * My process is executing IO for the buffer
  * BM_IO_IN_PROGRESS bit is set for the buffer
@@ -4190,11 +5121,12 @@ StartBufferIO(BufferDesc *buf, bool forInput)
  * be 0, or BM_VALID if we just finished reading in the page.
  */
 static void
-TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
+TerminateSharedBufferIO(BufferDesc *buf, bool syncio, bool clear_dirty, uint32 set_flag_bits)
 {
    uint32      buf_state;
 
-   Assert(buf == InProgressBuf);
+   if (syncio)
+       Assert(buf == InProgressBuf);
 
    buf_state = LockBufHdr(buf);
 
@@ -4205,11 +5137,46 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
        buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
 
    buf_state |= set_flag_bits;
+
+   if (!syncio)
+   {
+       buf_state -= BUF_REFCOUNT_ONE;
+       pgaio_io_ref_clear(&buf->io_in_progress);
+   }
+
    UnlockBufHdr(buf, buf_state);
 
-   InProgressBuf = NULL;
+   if (syncio)
+       InProgressBuf = NULL;
 
    ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
+
+   /* Support LockBufferForCleanup() */
+   if (!syncio && (buf_state & BM_PIN_COUNT_WAITER))
+   {
+       /*
+        * Acquire the buffer header lock, re-check that there's a waiter.
+        * Another backend could have unpinned this buffer, and already
+        * woken up the waiter.  There's no danger of the buffer being
+        * replaced after we unpinned it above, as it's pinned by the
+        * waiter.
+        */
+       buf_state = LockBufHdr(buf);
+
+       if ((buf_state & BM_PIN_COUNT_WAITER) &&
+           BUF_STATE_GET_REFCOUNT(buf_state) == 1)
+       {
+           /* we just released the last pin other than the waiter's */
+           int         wait_backend_pid = buf->wait_backend_pid;
+
+           buf_state &= ~BM_PIN_COUNT_WAITER;
+           UnlockBufHdr(buf, buf_state);
+           ProcSendSignal(wait_backend_pid);
+       }
+       else
+           UnlockBufHdr(buf, buf_state);
+   }
 }
 
 /*
@@ -4226,6 +5193,8 @@ AbortBufferIO(void)
 {
    BufferDesc *buf = InProgressBuf;
 
+   pgaio_at_abort();
+
    if (buf)
    {
        uint32      buf_state;
@@ -4259,7 +5228,7 @@ AbortBufferIO(void)
                pfree(path);
            }
        }
-       TerminateBufferIO(buf, false, BM_IO_ERROR);
+       TerminateSharedBufferIO(buf, /* syncio = */ true, /* clear_dirty = */ false, BM_IO_ERROR);
    }
 }
 
@@ -4490,6 +5459,9 @@ ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
 {
    PendingWriteback *pending;
 
+   if (io_data_direct)
+       return;
+
    /*
     * Add buffer to the pending writeback array, unless writeback control is
     * disabled.
@@ -4584,6 +5556,9 @@ IssuePendingWritebacks(WritebackContext *context)
    }
 
    context->nr_pending = 0;
+
+   if (i > 0)
+       pgaio_submit_pending(true);
 }
 
 
index eddd0782f298e6bbe486a6508196627b7b8d19e6..28e4b8155e79c9f4fb88ba5ed3aaf62be01c1f12 100644 (file)
@@ -117,6 +117,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
    bool        found;
    uint32      buf_state;
 
+   Assert(blockNum != P_NEW);
+
    INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
 
    /* Initialize local buffers if first request in this session */
index 931ed679307b223c61ff7da092a0de88b8560250..cbb10ded4ba3551fac7668cf416ee171dd2d74e6 100644 (file)
@@ -93,6 +93,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "portability/mem.h"
+#include "storage/aio.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "utils/guc.h"
@@ -468,6 +469,15 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
     * We compile all alternatives that are supported on the current platform,
     * to find portability problems more easily.
     */
+#if USE_LIBURING
+   {
+       PgAioInProgress *aio = pgaio_io_get();
+
+       pgaio_io_start_flush_range(aio, fd, offset, nbytes);
+       pgaio_io_release(aio);
+       return;
+   }
+#endif
 #if defined(HAVE_SYNC_FILE_RANGE)
    {
        int         rc;
@@ -1040,7 +1050,11 @@ tryAgain:
    fd = open(fileName, fileFlags, fileMode);
 
    if (fd >= 0)
+   {
+       //elog(DEBUG1, "opening file %s fd %d", fileName, fd);
+
        return fd;              /* success! */
+   }
 
    if (errno == EMFILE || errno == ENFILE)
    {
@@ -1184,6 +1198,8 @@ LruDelete(File file)
 
    vfdP = &VfdCache[file];
 
+   pgaio_submit_pending(false);
+
    /*
     * Close the file.  We aren't expecting this to fail; if it does, better
     * to leak the FD than to mess up our internal state.
@@ -1863,6 +1879,8 @@ FileClose(File file)
 
    if (!FileIsNotOpen(file))
    {
+       pgaio_submit_pending(false);
+
        /* close the file */
        if (close(vfdP->fd) != 0)
        {
@@ -1991,6 +2009,10 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
    if (nbytes <= 0)
        return;
 
+   /* no point in issuing writeback hints for O_DIRECT files */
+   if (VfdCache[file].fileFlags & O_DIRECT)
+       return;
+
    returnCode = FileAccess(file);
    if (returnCode < 0)
        return;
@@ -2056,6 +2078,30 @@ retry:
    return returnCode;
 }
 
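+/*
+ * Start an asynchronous read of 'amount' bytes at 'offset' into 'buffer',
+ * using the passed-in AIO handle. Returns false if the underlying file could
+ * not be (re-)opened.
+ */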
+bool
+FileStartRead(struct PgAioInProgress *io, File file, char *buffer, int amount, off_t offset, const AioBufferTag *tag, int bufid, int mode)
+{
+   int         returnCode;
+   Vfd        *vfdP;
+
+   Assert(FileIsValid(file));
+
+   DO_DB(elog(LOG, "FileStartRead: %d (%s) " INT64_FORMAT " %d %p",
+              file, VfdCache[file].fileName,
+              (int64) offset,
+              amount, buffer));
+
+   returnCode = FileAccess(file);
+   if (returnCode < 0)
+       return false;
+
+   vfdP = &VfdCache[file];
+
+   pgaio_io_start_read_buffer(io, tag, vfdP->fd, offset, amount, buffer, bufid, mode);
+
+   return true;
+}
+
 int
 FileWrite(File file, char *buffer, int amount, off_t offset,
          uint32 wait_event_info)
@@ -2154,6 +2200,30 @@ retry:
    return returnCode;
 }
 
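+/*
+ * Start an asynchronous write of 'amount' bytes at 'offset' from 'buffer',
+ * using the passed-in AIO handle. Returns false if the underlying file could
+ * not be (re-)opened.
+ */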
+bool
+FileStartWrite(struct PgAioInProgress *io, File file, char *buffer, int amount, off_t offset, const AioBufferTag *tag, int bufid, bool release_lock)
+{
+   int         returnCode;
+   Vfd        *vfdP;
+
+   Assert(FileIsValid(file));
+
+   DO_DB(elog(LOG, "FileStartWrite: %d (%s) " INT64_FORMAT " %d %p",
+              file, VfdCache[file].fileName,
+              (int64) offset,
+              amount, buffer));
+
+   returnCode = FileAccess(file);
+   if (returnCode < 0)
+       return false;
+
+   vfdP = &VfdCache[file];
+
+   pgaio_io_start_write_buffer(io, tag, vfdP->fd, offset, amount, buffer, bufid, release_lock);
+
+   return true;
+}
+
 int
 FileSync(File file, uint32 wait_event_info)
 {
@@ -2246,7 +2316,14 @@ FilePathName(File file)
 int
 FileGetRawDesc(File file)
 {
+   int         returnCode;
+
    Assert(FileIsValid(file));
+
+   returnCode = FileAccess(file);
+   if (returnCode < 0)
+       return returnCode;
+
    return VfdCache[file].fd;
 }
 
@@ -2527,6 +2604,7 @@ FreeDesc(AllocateDesc *desc)
            result = closedir(desc->desc.dir);
            break;
        case AllocateDescRawFD:
+           pgaio_submit_pending(false);
            result = close(desc->desc.fd);
            break;
        default:
@@ -2595,6 +2673,8 @@ CloseTransientFile(int fd)
    /* Only get here if someone passes us a file not in allocatedDescs */
    elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
 
+   pgaio_submit_pending(false);
+
    return close(fd);
 }
 
index df90c6b093fd7b826d34108946bf75e1640db2ae..ab0933028215085cdf77149763f52d27ebd18acd 100644 (file)
@@ -9,6 +9,8 @@ top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS = \
+   aio.o \
+   aio_util.o \
    barrier.o \
    dsm.o \
    dsm_impl.o \
@@ -27,4 +29,7 @@ OBJS = \
    sinvaladt.o \
    standby.o
 
+aio.o: override CPPFLAGS += $(LIBURING_CFLAGS)
+aio.bc: override CPPFLAGS += $(LIBURING_CFLAGS)
+
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/ipc/aio.c b/src/backend/storage/ipc/aio.c
new file mode 100644 (file)
index 0000000..b7374ac
--- /dev/null
@@ -0,0 +1,4069 @@
+/*
+ * Big picture changes:
+ * - backend local recyclable IOs
+ * - merging of IOs when submitting individual IOs, not when submitting all pending IOs
+ * - reorganization of the shared callback system, so there's an underlying
+ *   "write" operation that's used by WAL, generic, ... writes alike.
+ * - Consider not exposing PgAioInProgress* at all, instead expose a PgAioReference { uint32 io; uint64 generation; }
+ *   which would make it a lot less problematic to immediately reuse IOs.
+ * - Shrink size of PgAioInProgress
+ * - refcount bounce buffers / redesign
+ * - get rid of the current backpressure logic
+ */
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <liburing.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include "fmgr.h"
+#include "funcapi.h"
+#include "lib/ilist.h"
+#include "lib/stringinfo.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "nodes/execnodes.h"
+#include "nodes/memnodes.h"
+#include "pgstat.h"
+#include "access/xlog_internal.h"
+#include "storage/aio.h"
+#include "storage/buf.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+#include "storage/condition_variable.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "utils/builtins.h"
+#include "utils/fmgrprotos.h"
+#include "utils/memutils.h"
+#include "utils/resowner_private.h"
+
+
+#define PGAIO_VERBOSE
+
+
+/*
+ * FIXME: This is just so large because merging happens when submitting
+ * pending requests, rather than when staging them.
+ */
+#define PGAIO_SUBMIT_BATCH_SIZE 256
+#define PGAIO_MAX_LOCAL_REAPED 128
+#define PGAIO_MAX_COMBINE 16
+
+#define PGAIO_NUM_CONTEXTS 8
+
+/*
+ * The type of AIO.
+ *
+ * To keep PgAioInProgress smaller, try to tell the compiler to use only the
+ * minimal space. We could alternatively just use a uint8, but then we'd need
+ * casts in more places...
+ */
+typedef enum
+#ifdef pg_attribute_packed
+pg_attribute_packed()
+#endif
+   PgAioAction
+{
+   /* intentionally the zero value, to help catch zeroed memory etc */
+   PGAIO_INVALID = 0,
+
+   PGAIO_NOP,
+   /* FIXME: unify */
+   PGAIO_FSYNC,
+   PGAIO_FSYNC_WAL,
+   PGAIO_FLUSH_RANGE,
+
+   PGAIO_READ_BUFFER,
+   /* FIXME: unify */
+   PGAIO_WRITE_BUFFER,
+   PGAIO_WRITE_WAL,
+   PGAIO_WRITE_GENERIC,
+} PgAioAction;
+
+typedef enum PgAioInProgressFlags
+{
+   /* request in the ->unused list */
+   PGAIOIP_UNUSED = 1 << 0,
+
+   /*  */
+   PGAIOIP_IDLE = 1 << 1,
+
+   /*  */
+   PGAIOIP_IN_PROGRESS = 1 << 2,
+
+   /* request is in a backend's pending list, not yet submitted to the kernel */
+   PGAIOIP_PENDING = 1 << 3,
+
+   /* request in kernel */
+   PGAIOIP_INFLIGHT = 1 << 4,
+
+   /* request reaped */
+   PGAIOIP_REAPED = 1 << 5,
+
+   /* shared completion callback was called */
+   PGAIOIP_SHARED_CALLBACK_CALLED = 1 << 6,
+
+   /* completed */
+   PGAIOIP_DONE = 1 << 7,
+
+   PGAIOIP_FOREIGN_DONE = 1 << 8,
+
+   /* IO is merged with others */
+   PGAIOIP_MERGE = 1 << 9,
+
+   PGAIOIP_RETRY = 1 << 10,
+
+   /* request failed completely */
+   PGAIOIP_HARD_FAILURE = 1 << 11,
+
+   /* request failed partly, e.g. a short write */
+   PGAIOIP_SOFT_FAILURE = 1 << 12,
+
+   PGAIOIP_SHARED_FAILED = 1 << 13,
+
+   /* local completion callback was called */
+   PGAIOIP_LOCAL_CALLBACK_CALLED = 1 << 14,
+
+} PgAioInProgressFlags;
+
+/* IO completion callback */
+typedef bool (*PgAioCompletedCB)(PgAioInProgress *io);
+
+typedef uint16 PgAioIPFlags;
+
+struct PgAioInProgress
+{
+   /* PgAioAction, indexes completion_callbacks */
+   PgAioAction type;
+
+   PgAioIPFlags flags;
+
+   bool user_referenced;
+   bool system_referenced;
+
+   /* which AIO ring is this entry active for */
+   uint8 ring;
+
+   /* index into allProcs, or PG_UINT32_MAX for process local IO */
+   uint32 owner_id;
+
+   /* the IOs result, depends on operation. E.g. the length of a read */
+   int32 result;
+
+   /*
+    * Single callback that can be registered on an IO to be called upon
+    * completion. Note that this is reset whenever an IO is recycled.
+    */
+   PgAioOnCompletionLocalContext *on_completion_local;
+
+   /*
+    * Membership in one of
+    * PgAioCtl->unused,
+    * PgAioPerBackend->unused,
+    * PgAioPerBackend->outstanding,
+    * PgAioPerBackend->issued,
+    */
+   dlist_node owner_node;
+
+   /*
+    * Membership in
+    * PgAioPerBackend->pending,
+    * PgAioPerBackend->reaped,
+    * local_recycle_requests
+    * PgAioPerBackend->foreign_completed,
+    * PgAioPerBackend->local_completed
+    */
+   dlist_node io_node;
+
+   ConditionVariable cv;
+
+   /* index into context->iovec, or -1 */
+   int32 used_iovec;
+
+   PgAioBounceBuffer *bb;
+
+   PgAioInProgress *merge_with;
+
+   uint64 generation;
+
+   /*
+    * NB: The fds in here may *not* be relied upon for re-issuing requests
+    * (e.g. for partial reads/writes) - the fd might belong to another
+    * process, or have been closed since. For IOs merely waiting to be
+    * issued this is not a problem, but only because the pending queue is
+    * flushed when closing an fd.
+    */
+   union {
+       struct
+       {
+           int fd;
+           bool barrier;
+           bool datasync;
+       } fsync;
+
+       struct
+       {
+           int fd;
+           bool barrier;
+           bool datasync;
+           uint32 flush_no;
+       } fsync_wal;
+
+       struct
+       {
+           int fd;
+           uint64 nbytes;
+           uint32 offset;
+       } flush_range;
+
+       struct
+       {
+           uint32 offset;
+           uint32 nbytes;
+           uint32 already_done;
+           int fd;
+           char *bufdata;
+           Buffer buf;
+           AioBufferTag tag;
+           int mode;
+       } read_buffer;
+
+       struct
+       {
+           uint32 offset;
+           uint32 nbytes;
+           uint32 already_done;
+           int fd;
+           char *bufdata;
+           Buffer buf;
+           bool release_lock;
+           AioBufferTag tag;
+       } write_buffer;
+
+       struct
+       {
+           int fd;
+           uint32 offset;
+           uint32 nbytes;
+           uint32 already_done;
+           char *bufdata;
+           bool no_reorder;
+           uint32 write_no;
+       } write_wal;
+
+       struct
+       {
+           int fd;
+           uint64 offset;
+           uint32 nbytes;
+           uint32 already_done;
+           char *bufdata;
+           bool no_reorder;
+       } write_generic;
+   } d;
+};
+
+/* typedef in header */
+struct PgAioBounceBuffer
+{
+   pg_atomic_uint32 refcount;
+   dlist_node node;
+   char *buffer;
+};
+
+/*
+ * An iovec that can represent the biggest possible iovec (due to combining)
+ * we may need for a single IO submission.
+ */
+typedef struct PgAioIovec
+{
+   slist_node node;
+   struct iovec iovec[PGAIO_MAX_COMBINE];
+} PgAioIovec;
+
+
+/*
+ * XXX: Really want a proclist-like structure that works with integer
+ * offsets. Given the limited number of IOs ever existing, using full pointers
+ * is completely unnecessary.
+ */
+
+typedef struct PgAioPerBackend
+{
+   uint32 last_context;
+
+   /*
+    * Local unused IOs. There's only a limited number of these. Used to
+    * reduce overhead of the central unused list.
+    *
+    * FIXME: Actually use.
+    *
+    * Could be singly linked list.
+    *
+    * PgAioInProgress->owner_node
+    */
+   dlist_head unused;
+   uint32 unused_count;
+
+   /*
+    * IOs handed out to code within the backend.
+    *
+    * PgAioInProgress->owner_node
+    */
+   dlist_head outstanding;
+   uint32 outstanding_count;
+
+   /*
+    * Requests waiting to be issued to the kernel. They are submitted to the
+    * kernel in batches, for efficiency (local merging of IOs, and better
+    * kernel side queue processing).
+    *
+    * Could be singly linked list.
+    *
+    * PgAioInProgress->io_node
+    */
+   dlist_head pending;
+   uint32 pending_count;
+
+   /*
+    * Requests issued by this backend that have not completed yet (but may
+    * be foreign_completed) and are still referenced by backend code (see
+    * issued_abandoned for the ones that no longer are).
+    *
+    * PgAioInProgress->owner_node
+    */
+   dlist_head issued;
+   uint32 issued_count;
+
+   /*
+    * Requests issued by this backend that have not completed yet (but may
+    * be foreign_completed) and that are no longer referenced by backend
+    * code (see issued for the ones that still are).
+    *
+    * PgAioInProgress->owner_node
+    */
+   dlist_head issued_abandoned;
+   uint32 issued_abandoned_count;
+
+   /*
+    * PgAioInProgress that are issued to the ringbuffer, and have not yet
+    * been processed (but they may have completed without the completions
+    * having been processed).
+    */
+   pg_atomic_uint32 inflight_count;
+
+   /*
+    * Requests where we've received a kernel completion, but haven't yet
+    * processed them.  This is needed to handle failing callbacks.
+    *
+    * Could be singly linked list.
+    *
+    * PgAioInProgress->io_node
+    */
+   dlist_head reaped;
+
+   /*
+    * IOs that were completed, but not yet recycled.
+    *
+    * PgAioInProgress->io_node
+    */
+   dlist_head local_completed;
+   uint32 local_completed_count;
+
+   /*
+    * IOs where the completion was received in another backend.
+    *
+    * Could be singly linked list.
+    *
+    * PgAioInProgress->io_node
+    */
+   slock_t foreign_completed_lock;
+   uint32 foreign_completed_count;
+   dlist_head foreign_completed;
+
+   /*
+    * Stats.
+    */
+   uint64 executed_total_count; /* un-merged */
+   uint64 issued_total_count; /* merged */
+   uint64 submissions_total_count; /* number of submission syscalls */
+   uint64 foreign_completed_total_count;
+   uint64 retry_total_count;
+
+} PgAioPerBackend;
+
+typedef struct PgAioContext
+{
+   LWLock submission_lock;
+   LWLock completion_lock;
+
+   struct io_uring io_uring_ring;
+
+   /*
+    * For many io_uring versions the iovecs need to be in shared memory. The
+    * lists of available iovecs are split between the submission and
+    * completion locks - that avoids additional lock acquisitions in the
+    * common cases.
+    */
+   PgAioIovec *iovecs;
+
+   /* locked by submission lock */
+   slist_head unused_iovecs;
+   uint32 unused_iovecs_count;
+
+   /* locked by completion lock */
+   slist_head reaped_iovecs;
+   uint32 reaped_iovecs_count;
+
+   /* XXX: probably worth padding to a cacheline boundary here */
+} PgAioContext;
+
+typedef struct PgAioCtl
+{
+   /* PgAioInProgress that are not used */
+   dlist_head unused_ios;
+
+   /*
+    * Number of PgAioInProgress entries that are in use. This includes
+    * pending requests, as well as requests actually issued to the queue.
+    *
+    * Protected by SharedAIOCtlLock.
+    */
+   uint32 used_count;
+
+   /*
+    * Protected by SharedAIOCtlLock.
+    */
+   dlist_head reaped_uncompleted;
+
+   PgAioBounceBuffer *bounce_buffers;
+   dlist_head unused_bounce_buffers;
+   uint32 unused_bounce_buffers_count;
+
+   int backend_state_count;
+   PgAioPerBackend *backend_state;
+
+   uint32 num_contexts;
+   PgAioContext *contexts;
+
+   PgAioInProgress in_progress_io[FLEXIBLE_ARRAY_MEMBER];
+} PgAioCtl;
+
+/* general pgaio helper functions */
+static void pgaio_complete_ios(bool in_error);
+static void pgaio_apply_backend_limit(void);
+static void pgaio_prepare_io(PgAioInProgress *io, PgAioAction action);
+static void pgaio_finish_io(PgAioInProgress *io);
+static void pgaio_bounce_buffer_release_internal(PgAioBounceBuffer *bb, bool holding_lock, bool release_resowner);
+static void pgaio_io_ref_internal(PgAioInProgress *io, PgAioIoRef *ref);
+
+/* io_uring related functions */
+static int pgaio_uring_submit(int max_submit, bool drain);
+static int pgaio_uring_drain(PgAioContext *context);
+static void pgaio_uring_wait_one(PgAioContext *context, PgAioInProgress *io, uint64 generation, uint32 wait_event_info);
+
+static void pgaio_uring_sq_from_io(PgAioContext *context, PgAioInProgress *io, struct io_uring_sqe *sqe);
+static void pgaio_uring_io_from_cqe(PgAioContext *context, struct io_uring_cqe *cqe);
+static void pgaio_uring_iovec_transfer(PgAioContext *context);
+
+static int __sys_io_uring_enter(int fd, unsigned to_submit, unsigned min_complete,
+                               unsigned flags, sigset_t *sig);
+
+/* io completions */
+static bool pgaio_complete_nop(PgAioInProgress *io);
+static bool pgaio_complete_fsync(PgAioInProgress *io);
+static bool pgaio_complete_fsync_wal(PgAioInProgress *io);
+static bool pgaio_complete_flush_range(PgAioInProgress *io);
+static bool pgaio_complete_read_buffer(PgAioInProgress *io);
+static bool pgaio_complete_write_buffer(PgAioInProgress *io);
+static bool pgaio_complete_write_wal(PgAioInProgress *io);
+static bool pgaio_complete_write_generic(PgAioInProgress *io);
+
+
+static MemoryContext aio_retry_context;
+
+/*
+ * To support EXEC_BACKEND environments, where we cannot rely on callback
+ * addresses being equivalent across processes, completion actions are just
+ * indices into a process local array of callbacks, indexed by the type of
+ * action.  Also makes the shared memory entries a bit smaller, but that's not
+ * a huge win.
+ */
+static const PgAioCompletedCB completion_callbacks[] =
+{
+   [PGAIO_NOP] = pgaio_complete_nop,
+   [PGAIO_FSYNC] = pgaio_complete_fsync,
+   [PGAIO_FSYNC_WAL] = pgaio_complete_fsync_wal,
+   [PGAIO_FLUSH_RANGE] = pgaio_complete_flush_range,
+   [PGAIO_READ_BUFFER] = pgaio_complete_read_buffer,
+   [PGAIO_WRITE_BUFFER] = pgaio_complete_write_buffer,
+   [PGAIO_WRITE_WAL] = pgaio_complete_write_wal,
+   [PGAIO_WRITE_GENERIC] = pgaio_complete_write_generic,
+};
+
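+/*
+ * Illustrative sketch only: completion processing can dispatch on the
+ * shared-memory action type without relying on callback addresses being
+ * identical across processes, e.g.:
+ *
+ *     PgAioCompletedCB cb = completion_callbacks[io->type];
+ *     bool finished = cb(io);
+ */
+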
+
+/* (future) GUC controlling global MAX number of in-progress IO entries */
+/* FIXME: find a good naming pattern */
+extern int max_aio_in_progress;
+/* FIXME: this is per context right now */
+extern int max_aio_in_flight;
+extern int max_aio_bounce_buffers;
+
+/* max per backend concurrency */
+extern int io_max_concurrency;
+
+int max_aio_in_progress = 32768; /* XXX: Multiple of MaxBackends instead? */
+int max_aio_in_flight = 4096;
+int max_aio_bounce_buffers = 1024;
+int io_max_concurrency = 128;
+
+/* global list of in-progress IO */
+static PgAioCtl *aio_ctl;
+
+/* current backend's per-backend-state */
+static PgAioPerBackend *my_aio;
+static int my_aio_id;
+
+/* FIXME: move into PgAioPerBackend / subsume into ->reaped */
+static dlist_head local_recycle_requests;
+
+
+/* io_uring local state */
+struct io_uring local_ring;
+
+static Size
+AioCtlShmemSize(void)
+{
+   Size        sz;
+
+   /* aio_ctl itself */
+   sz = offsetof(PgAioCtl, in_progress_io);
+
+   /* ios */
+   sz = add_size(sz, mul_size(max_aio_in_progress, sizeof(PgAioInProgress)));
+
+   return sz;
+}
+
+static Size
+AioCtlBackendShmemSize(void)
+{
+   uint32      TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS;
+
+   return mul_size(TotalProcs, sizeof(PgAioPerBackend));
+}
+
+static Size
+AioBounceShmemSize(void)
+{
+   Size        sz;
+
+   /* PgAioBounceBuffer itself */
+   sz = mul_size(sizeof(PgAioBounceBuffer), max_aio_bounce_buffers);
+
+   /* and the associated buffers, plus one extra block for BLCKSZ alignment */
+   sz = add_size(sz,
+                 mul_size(BLCKSZ, add_size(max_aio_bounce_buffers, 1)));
+
+   return sz;
+}
+
+static Size
+AioContextShmemSize(void)
+{
+   return mul_size(PGAIO_NUM_CONTEXTS, sizeof(PgAioContext));
+}
+
+static Size
+AioContextIovecsShmemSize(void)
+{
+   return mul_size(PGAIO_NUM_CONTEXTS,
+                   mul_size(sizeof(PgAioIovec), max_aio_in_flight));
+}
+
+Size
+AioShmemSize(void)
+{
+   Size        sz = 0;
+
+   sz = add_size(sz, AioCtlShmemSize());
+   sz = add_size(sz, AioBounceShmemSize());
+   sz = add_size(sz, AioCtlBackendShmemSize());
+   sz = add_size(sz, AioContextShmemSize());
+   sz = add_size(sz, AioContextIovecsShmemSize());
+
+   return sz;
+}
+
+void
+AioShmemInit(void)
+{
+   bool        found;
+   uint32      TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS;
+
+   aio_ctl = (PgAioCtl *)
+       ShmemInitStruct("PgAio", AioCtlShmemSize(), &found);
+
+   if (!found)
+   {
+       memset(aio_ctl, 0, AioCtlShmemSize());
+
+       dlist_init(&aio_ctl->unused_ios);
+       dlist_init(&aio_ctl->reaped_uncompleted);
+
+       for (int i = 0; i < max_aio_in_progress; i++)
+       {
+           PgAioInProgress *io = &aio_ctl->in_progress_io[i];
+
+           ConditionVariableInit(&io->cv);
+           dlist_push_tail(&aio_ctl->unused_ios, &io->owner_node);
+           io->flags = PGAIOIP_UNUSED;
+           io->system_referenced = true;
+           io->generation = 1;
+       }
+
+       aio_ctl->backend_state_count = TotalProcs;
+       aio_ctl->backend_state = (PgAioPerBackend *)
+           ShmemInitStruct("PgAioBackend", AioCtlBackendShmemSize(), &found);
+       memset(aio_ctl->backend_state, 0, AioCtlBackendShmemSize());
+
+       for (int procno = 0; procno < TotalProcs; procno++)
+       {
+           PgAioPerBackend *bs = &aio_ctl->backend_state[procno];
+
+           dlist_init(&bs->unused);
+           dlist_init(&bs->outstanding);
+           dlist_init(&bs->pending);
+           dlist_init(&bs->issued);
+           dlist_init(&bs->issued_abandoned);
+           pg_atomic_init_u32(&bs->inflight_count, 0);
+           dlist_init(&bs->reaped);
+
+           dlist_init(&bs->foreign_completed);
+           SpinLockInit(&bs->foreign_completed_lock);
+       }
+
+       {
+           char *p;
+           char *blocks;
+
+           dlist_init(&aio_ctl->unused_bounce_buffers);
+           aio_ctl->bounce_buffers =
+               ShmemInitStruct("PgAioBounceBuffers",
+                               sizeof(PgAioBounceBuffer) * max_aio_bounce_buffers,
+                               &found);
+           Assert(!found);
+
+           p = ShmemInitStruct("PgAioBounceBufferBlocks",
+                               BLCKSZ * (max_aio_bounce_buffers + 1),
+                               &found);
+           Assert(!found);
+           blocks = (char *) TYPEALIGN(BLCKSZ, (uintptr_t) p);
+
+           for (int i = 0; i < max_aio_bounce_buffers; i++)
+           {
+               PgAioBounceBuffer *bb = &aio_ctl->bounce_buffers[i];
+
+               bb->buffer = blocks + i * BLCKSZ;
+               memset(bb->buffer, 0, BLCKSZ);
+               pg_atomic_init_u32(&bb->refcount, 0);
+               dlist_push_tail(&aio_ctl->unused_bounce_buffers, &bb->node);
+               aio_ctl->unused_bounce_buffers_count++;
+           }
+       }
+
+       {
+           PgAioIovec *iovecs;
+
+           aio_ctl->num_contexts = PGAIO_NUM_CONTEXTS;
+           aio_ctl->contexts = ShmemInitStruct("PgAioContexts", AioContextShmemSize(), &found);
+           Assert(!found);
+
+           iovecs = (PgAioIovec *)
+           ShmemInitStruct("PgAioContextsIovecs", AioContextIovecsShmemSize(), &found);
+           Assert(!found);
+           memset(iovecs, 0, AioContextIovecsShmemSize());
+
+           for (int contextno = 0; contextno < aio_ctl->num_contexts; contextno++)
+           {
+               PgAioContext *context = &aio_ctl->contexts[contextno];
+               int ret;
+
+               LWLockInitialize(&context->submission_lock, LWTRANCHE_AIO_CONTEXT_SUBMISSION);
+               LWLockInitialize(&context->completion_lock, LWTRANCHE_AIO_CONTEXT_COMPLETION);
+
+               slist_init(&context->unused_iovecs);
+               slist_init(&context->reaped_iovecs);
+
+               context->iovecs = iovecs;
+               iovecs += max_aio_in_flight;
+
+               for (uint32 i = 0; i < max_aio_in_flight; i++)
+               {
+                   slist_push_head(&context->unused_iovecs, &context->iovecs[i].node);
+                   context->unused_iovecs_count++;
+               }
+
+               /*
+                * XXX: Probably worth sharing the WQ between the different
+                * rings, when supported by the kernel. Could also cause
+                * additional contention, I guess?
+                */
+               if (!AcquireExternalFD())
+                   elog(ERROR, "could not reserve file descriptor for io_uring instance");
+               ret = io_uring_queue_init(max_aio_in_flight, &context->io_uring_ring, 0);
+               if (ret < 0)
+                   elog(ERROR, "io_uring_queue_init failed: %s", strerror(-ret));
+           }
+       }
+
+   }
+}
+
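+/*
+ * Postmaster-local AIO initialization: set up the local recycle list and a
+ * small memory context that may be used while retrying IOs inside a
+ * critical section.
+ */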
+void
+pgaio_postmaster_init(void)
+{
+   /* FIXME: should also be allowed to use AIO */
+   dlist_init(&local_recycle_requests);
+
+   /* XXX: could create a local queue here. */
+
+   /*
+    * We need to be able to re-open files during retries, which can happen
+    * within a critical section, e.g. when fsyncing WAL. Reopening files
+    * currently requires memory, so create a context with a small
+    * reservation that is allowed to be used within a critical section.
+    */
+   aio_retry_context = AllocSetContextCreate(TopMemoryContext,
+                                             "aio retry context",
+                                             1024,
+                                             1024,
+                                             1024);
+   MemoryContextAllowInCriticalSection(aio_retry_context, true);
+}
+
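+/*
+ * Process-local initialization that each child process has to repeat:
+ * create its own small io_uring instance.
+ */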
+void
+pgaio_postmaster_child_init_local(void)
+{
+   /*
+    * Set up a small process-local io_uring instance with a queue depth of
+    * 32 entries.
+    */
+   {
+       int ret;
+
+       ret = io_uring_queue_init(32, &local_ring, 0);
+       if (ret < 0)
+       {
+           elog(ERROR, "io_uring_queue_init failed: %s", strerror(-ret));
+       }
+   }
+}
+
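+/*
+ * before_shmem_exit() callback: release this backend's issued IOs and wait
+ * for abandoned ones that are still in flight, so process exit cannot leave
+ * IOs in an undefined state.
+ */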
+static void
+pgaio_postmaster_before_child_exit(int code, Datum arg)
+{
+   elog(DEBUG2, "aio before shmem exit: start");
+
+   /*
+    * Need to wait for in-progress IOs initiated by this backend to finish.
+    * Some operating systems, like Linux w/ io_uring, cancel IOs that are
+    * still in progress when exiting; others don't provide access to the
+    * results of such IOs.
+    */
+   while (!dlist_is_empty(&my_aio->issued))
+   {
+       PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->issued);
+
+       pgaio_io_release(io);
+   }
+
+   Assert(my_aio->issued_count == 0);
+   Assert(dlist_is_empty(&my_aio->issued));
+
+   while (!dlist_is_empty(&my_aio->issued_abandoned))
+   {
+       PgAioInProgress *io = NULL;
+       PgAioIoRef ref;
+
+       LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+       if (!dlist_is_empty(&my_aio->issued_abandoned))
+       {
+           io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->issued_abandoned);
+           pgaio_io_ref_internal(io, &ref);
+       }
+       LWLockRelease(SharedAIOCtlLock);
+
+       if (!io)
+       {
+           /* list was emptied concurrently, nothing left to wait for */
+           break;
+       }
+
+       elog(LOG, "exit wait for abandoned IO %zu", io - aio_ctl->in_progress_io);
+       pgaio_io_print(io, NULL);
+       pgaio_io_wait_ref(&ref, false);
+   }
+
+   elog(DEBUG2, "aio before shmem exit: end");
+}
+
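+/*
+ * on_shmem_exit() callback: by now all of this backend's IOs should have
+ * been dealt with, which the assertions below verify.
+ */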
+static void
+pgaio_postmaster_child_exit(int code, Datum arg)
+{
+   /* FIXME: handle unused */
+   Assert(my_aio->outstanding_count == 0);
+   Assert(dlist_is_empty(&my_aio->outstanding));
+
+   Assert(my_aio->pending_count == 0);
+   Assert(dlist_is_empty(&my_aio->pending));
+
+   Assert(my_aio->issued_count == 0);
+   Assert(dlist_is_empty(&my_aio->issued));
+
+   Assert(my_aio->issued_abandoned_count == 0);
+   Assert(dlist_is_empty(&my_aio->issued_abandoned));
+
+   Assert(pg_atomic_read_u32(&my_aio->inflight_count) == 0);
+
+   Assert(dlist_is_empty(&my_aio->reaped));
+
+   Assert(my_aio->local_completed_count == 0);
+   Assert(dlist_is_empty(&my_aio->local_completed));
+
+   Assert(my_aio->foreign_completed_count == 0);
+   Assert(dlist_is_empty(&my_aio->foreign_completed));
+}
+
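+/*
+ * Attach a child process to the shared AIO state: select this backend's
+ * per-backend slot, register the exit callbacks and reset the local
+ * statistics.
+ */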
+void
+pgaio_postmaster_child_init(void)
+{
+   /* no locking needed here, only affects this process */
+   for (int i = 0; i < aio_ctl->num_contexts; i++)
+       io_uring_ring_dontfork(&aio_ctl->contexts[i].io_uring_ring);
+
+   my_aio_id = MyProc->pgprocno;
+   my_aio = &aio_ctl->backend_state[my_aio_id];
+
+   dlist_init(&local_recycle_requests);
+
+   before_shmem_exit(pgaio_postmaster_before_child_exit, 0);
+   on_shmem_exit(pgaio_postmaster_child_exit, 0);
+
+   Assert(my_aio->unused_count == 0);
+   Assert(my_aio->outstanding_count == 0);
+   Assert(my_aio->issued_count == 0);
+   Assert(my_aio->issued_abandoned_count == 0);
+   Assert(my_aio->pending_count == 0);
+   Assert(my_aio->local_completed_count == 0);
+   Assert(my_aio->foreign_completed_count == 0);
+
+   /* try to spread out a bit from the start */
+   my_aio->last_context = MyProcPid % PGAIO_NUM_CONTEXTS;
+
+   my_aio->executed_total_count = 0;
+   my_aio->issued_total_count = 0;
+   my_aio->submissions_total_count = 0;
+   my_aio->foreign_completed_total_count = 0;
+   my_aio->retry_total_count = 0;
+}
+
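+/*
+ * Error/abort cleanup: finish processing reaped IOs, submit anything still
+ * pending, and release the IOs this backend still references.
+ */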
+void
+pgaio_at_abort(void)
+{
+   pgaio_complete_ios(/* in_error = */ true);
+
+   pgaio_submit_pending(false);
+
+   while (!dlist_is_empty(&my_aio->outstanding))
+   {
+       PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->outstanding);
+
+       pgaio_io_release(io);
+   }
+
+   while (!dlist_is_empty(&my_aio->issued))
+   {
+       PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->issued);
+
+       pgaio_io_release(io);
+   }
+}
+
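+/*
+ * Commit-time check: no pending, outstanding or issued IOs should remain;
+ * warn about and clean up anything that leaked.
+ */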
+void
+pgaio_at_commit(void)
+{
+   Assert(dlist_is_empty(&local_recycle_requests));
+
+   if (my_aio->pending_count != 0)
+   {
+       elog(WARNING, "unsubmitted IOs %d", my_aio->pending_count);
+       pgaio_submit_pending(false);
+   }
+
+   while (!dlist_is_empty(&my_aio->outstanding))
+   {
+       PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->outstanding);
+
+       elog(WARNING, "leaked outstanding io %zu", io - aio_ctl->in_progress_io);
+
+       pgaio_io_release(io);
+   }
+
+   while (!dlist_is_empty(&my_aio->issued))
+   {
+       PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->issued);
+
+       elog(WARNING, "leaked issued io %zu", io - aio_ctl->in_progress_io);
+
+       pgaio_io_release(io);
+   }
+}
+
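+/*
+ * Split a merged IO chain back into individual IOs, distributing the
+ * combined result among them and inserting the extracted IOs into the
+ * reaped list. Returns the number of IOs extracted after the chain head.
+ */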
+static int
+pgaio_uncombine_one(PgAioInProgress *io)
+{
+   int orig_result = io->result;
+   int running_result = orig_result;
+   PgAioInProgress *cur = io;
+   PgAioInProgress *last = NULL;
+   int extracted = 0;
+
+   while (cur)
+   {
+       PgAioInProgress *next = cur->merge_with;
+
+       Assert(!(cur->flags & PGAIOIP_SHARED_CALLBACK_CALLED));
+       Assert(cur->merge_with || cur != io);
+
+       switch (cur->type)
+       {
+           case PGAIO_READ_BUFFER:
+               Assert(cur->d.read_buffer.already_done == 0);
+
+               if (orig_result < 0)
+               {
+                   cur->result = io->result;
+               }
+               else if (running_result >= cur->d.read_buffer.nbytes)
+               {
+                   cur->result = cur->d.read_buffer.nbytes;
+                   running_result -= cur->result;
+               }
+               else if (running_result < cur->d.read_buffer.nbytes)
+               {
+                   cur->result = running_result;
+                   running_result = 0;
+               }
+
+               break;
+
+           case PGAIO_WRITE_BUFFER:
+               Assert(cur->d.write_buffer.already_done == 0);
+
+               if (orig_result < 0)
+               {
+                   cur->result = io->result;
+               }
+               else if (running_result >= cur->d.write_buffer.nbytes)
+               {
+                   cur->result = cur->d.write_buffer.nbytes;
+                   running_result -= cur->result;
+               }
+               else if (running_result < cur->d.write_buffer.nbytes)
+               {
+                   cur->result = running_result;
+                   running_result = 0;
+               }
+               break;
+
+           case PGAIO_WRITE_WAL:
+               Assert(cur->d.write_wal.already_done == 0);
+
+               if (orig_result < 0)
+               {
+                   cur->result = io->result;
+               }
+               else if (running_result >= cur->d.write_wal.nbytes)
+               {
+                   cur->result = cur->d.write_wal.nbytes;
+                   running_result -= cur->result;
+               }
+               else if (running_result < cur->d.write_wal.nbytes)
+               {
+                   cur->result = running_result;
+                   running_result = 0;
+               }
+               break;
+
+           case PGAIO_WRITE_GENERIC:
+               Assert(cur->d.write_generic.already_done == 0);
+
+               if (orig_result < 0)
+               {
+                   cur->result = io->result;
+               }
+               else if (running_result >= cur->d.write_generic.nbytes)
+               {
+                   cur->result = cur->d.write_generic.nbytes;
+                   running_result -= cur->result;
+               }
+               else if (running_result < cur->d.write_generic.nbytes)
+               {
+                   cur->result = running_result;
+                   running_result = 0;
+               }
+               break;
+
+           default:
+               elog(PANIC, "merge for %d not supported yet", cur->type);
+       }
+
+       cur->merge_with = NULL;
+
+       if (last)
+       {
+           cur->flags =
+               (cur->flags & ~(PGAIOIP_INFLIGHT |
+                               PGAIOIP_MERGE)) |
+               PGAIOIP_REAPED;
+
+           Assert(dlist_is_member(&my_aio->reaped, &last->io_node));
+           dlist_insert_after(&last->io_node, &cur->io_node);
+           extracted++;
+       }
+       else
+       {
+           cur->flags &= ~PGAIOIP_MERGE;
+       }
+
+       last = cur;
+       cur = next;
+   }
+
+   return extracted;
+}
+
+static void
+pgaio_uncombine(void)
+{
+   dlist_mutable_iter iter;
+
+   /* "unmerge" merged IOs, so they can be treated uniformly */
+   dlist_foreach_modify(iter, &my_aio->reaped)
+   {
+       PgAioInProgress *io = dlist_container(PgAioInProgress, io_node, iter.cur);
+
+       if (io->flags & PGAIOIP_MERGE)
+           pgaio_uncombine_one(io);
+
+       pg_atomic_fetch_sub_u32(&aio_ctl->backend_state[io->owner_id].inflight_count, 1);
+   }
+}
+
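+/*
+ * Process this backend's reaped IOs: call the shared completion callback
+ * for each, re-submit anything queued for retry, and then, under
+ * SharedAIOCtlLock, mark the IOs done for their owners or recycle them if
+ * no user reference remains.
+ */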
+static void  __attribute__((noinline))
+pgaio_complete_ios(bool in_error)
+{
+   int pending_count_before = my_aio->pending_count;
+
+   Assert(!LWLockHeldByMe(SharedAIOCtlLock));
+
+   /* call all callbacks, without holding lock */
+   while (!dlist_is_empty(&my_aio->reaped))
+   {
+       dlist_node *node = dlist_head_node(&my_aio->reaped);
+       PgAioInProgress *io = dlist_container(PgAioInProgress, io_node, node);
+
+       Assert(dlist_is_member(&my_aio->reaped, &io->io_node));
+
+       Assert(node != NULL);
+
+       if (!(io->flags & PGAIOIP_SHARED_CALLBACK_CALLED))
+       {
+           PgAioCompletedCB cb;
+           bool finished;
+
+           /*
+            * Set flag before calling callback, otherwise we could easily end
+            * up looping forever.
+            */
+           *(volatile PgAioIPFlags*) &io->flags |= PGAIOIP_SHARED_CALLBACK_CALLED;
+
+           cb = completion_callbacks[io->type];
+           finished = cb(io);
+
+           dlist_delete_from(&my_aio->reaped, node);
+
+           if (finished)
+           {
+               dlist_push_tail(&local_recycle_requests, &io->io_node);
+           }
+           else
+           {
+               Assert((*(volatile PgAioIPFlags*) &io->flags) & (PGAIOIP_SOFT_FAILURE | PGAIOIP_HARD_FAILURE));
+
+               LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+               *(volatile PgAioIPFlags*) &io->flags =
+                   (io->flags & ~(PGAIOIP_REAPED | PGAIOIP_IN_PROGRESS)) |
+                   PGAIOIP_DONE |
+                   PGAIOIP_SHARED_FAILED;
+               dlist_push_tail(&aio_ctl->reaped_uncompleted, &io->io_node);
+               LWLockRelease(SharedAIOCtlLock);
+
+               /* signal state change */
+               if (IsUnderPostmaster)
+                   ConditionVariableBroadcast(&io->cv);
+           }
+       }
+       else
+       {
+           Assert(in_error);
+
+           dlist_delete_from(&my_aio->reaped, node);
+
+           LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+           *(volatile PgAioIPFlags*) &io->flags =
+               (io->flags & ~(PGAIOIP_REAPED | PGAIOIP_IN_PROGRESS)) |
+               PGAIOIP_DONE |
+               PGAIOIP_HARD_FAILURE |
+               PGAIOIP_SHARED_FAILED;
+           dlist_push_tail(&aio_ctl->reaped_uncompleted, &io->io_node);
+           LWLockRelease(SharedAIOCtlLock);
+       }
+   }
+
+   /* if the callbacks queued IOs for retry, submit them right away */
+   if (pending_count_before != my_aio->pending_count)
+       pgaio_submit_pending(false);
+
+   /*
+    * Next, under lock, process all IOs queued for recycling. This entails
+    * releasing the "system" reference on the IO and either marking it done
+    * for its owner or returning it to the unused list.
+    */
+   START_CRIT_SECTION();
+
+   while (!dlist_is_empty(&local_recycle_requests))
+   {
+       dlist_mutable_iter iter;
+       PgAioInProgress* signal_ios[32];
+       int to_signal = 0;
+
+       LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+
+       dlist_foreach_modify(iter, &local_recycle_requests)
+       {
+           PgAioInProgress *cur = dlist_container(PgAioInProgress, io_node, iter.cur);
+
+           dlist_delete_from(&local_recycle_requests, iter.cur);
+           signal_ios[to_signal++] = cur;
+
+           Assert(cur->system_referenced);
+           Assert(cur->flags & PGAIOIP_REAPED);
+           Assert(!(cur->flags & PGAIOIP_DONE));
+           Assert(!(cur->flags & PGAIOIP_INFLIGHT));
+           Assert(!(cur->flags & PGAIOIP_MERGE));
+           Assert(!(cur->flags & (PGAIOIP_SHARED_FAILED)));
+           Assert(!(cur->flags & (PGAIOIP_SOFT_FAILURE)));
+           Assert(cur->merge_with == NULL);
+
+           if (cur->user_referenced)
+           {
+               cur->system_referenced = false;
+
+               if (cur->owner_id != my_aio_id)
+               {
+                   PgAioPerBackend *other = &aio_ctl->backend_state[cur->owner_id];
+
+                   SpinLockAcquire(&other->foreign_completed_lock);
+
+                   dlist_push_tail(&other->foreign_completed, &cur->io_node);
+                   other->foreign_completed_count++;
+                   other->foreign_completed_total_count++;
+
+                   pg_write_barrier();
+
+                   *(volatile PgAioIPFlags*) &cur->flags =
+                       (cur->flags & ~(PGAIOIP_REAPED | PGAIOIP_IN_PROGRESS)) |
+                       PGAIOIP_DONE |
+                       PGAIOIP_FOREIGN_DONE;
+
+                   SpinLockRelease(&other->foreign_completed_lock);
+               }
+               else
+               {
+                   *(volatile PgAioIPFlags*) &cur->flags =
+                       (cur->flags & ~(PGAIOIP_REAPED | PGAIOIP_IN_PROGRESS)) |
+                       PGAIOIP_DONE;
+
+                   dlist_push_tail(&my_aio->local_completed, &cur->io_node);
+                   my_aio->local_completed_count++;
+               }
+           }
+           else
+           {
+               PgAioPerBackend *other = &aio_ctl->backend_state[cur->owner_id];
+
+#ifdef PGAIO_VERBOSE
+               ereport(DEBUG4,
+                       errmsg("removing aio %zu from issued_abandoned complete_ios",
+                              cur - aio_ctl->in_progress_io),
+                       errhidecontext(1),
+                       errhidestmt(1));
+#endif
+
+               dlist_delete_from(&other->issued_abandoned, &cur->owner_node);
+               Assert(other->issued_abandoned_count > 0);
+               other->issued_abandoned_count--;
+
+               cur->generation++;
+               pg_write_barrier();
+
+               cur->flags = PGAIOIP_UNUSED;
+
+               if (cur->bb)
+               {
+                   pgaio_bounce_buffer_release_internal(cur->bb,
+                                                        /* holding_lock = */ true,
+                                                        /* release_resowner = */ false);
+                   cur->bb = NULL;
+               }
+
+               cur->type = 0;
+               cur->owner_id = INVALID_PGPROCNO;
+               cur->result = 0;
+               cur->system_referenced = true;
+               cur->on_completion_local = NULL;
+
+               dlist_push_head(&aio_ctl->unused_ios, &cur->owner_node);
+               aio_ctl->used_count--;
+           }
+
+           if (to_signal >= lengthof(signal_ios))
+               break;
+       }
+       LWLockRelease(SharedAIOCtlLock);
+
+       if (IsUnderPostmaster)
+       {
+           for (int i = 0; i < to_signal; i++)
+           {
+               ConditionVariableBroadcast(&signal_ios[i]->cv);
+           }
+       }
+   }
+
+   END_CRIT_SECTION();
+}
+
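+/*
+ * Move a locally completed IO from the issued list back to the outstanding
+ * list and invoke its local completion callback, if any.
+ */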
+static void
+pgaio_io_call_local_callback(PgAioInProgress *io, bool in_error)
+{
+   Assert(!(io->flags & PGAIOIP_LOCAL_CALLBACK_CALLED));
+   Assert(io->user_referenced);
+
+   dlist_delete_from(&my_aio->issued, &io->owner_node);
+   my_aio->issued_count--;
+   dlist_push_tail(&my_aio->outstanding, &io->owner_node);
+   my_aio->outstanding_count++;
+
+   io->flags |= PGAIOIP_LOCAL_CALLBACK_CALLED;
+
+   if (!io->on_completion_local)
+       return;
+
+   if (!in_error)
+       io->on_completion_local->callback(io->on_completion_local, io);
+
+}
+
+/*
+ * Call all pending local callbacks.
+ */
+static void
+pgaio_call_local_callbacks(bool in_error)
+{
+   if (my_aio->local_completed_count != 0 &&
+       CritSectionCount == 0)
+   {
+       /* FIXME: this isn't safe against errors */
+       static int local_callback_depth = 0;
+
+       if (local_callback_depth == 0)
+       {
+           local_callback_depth++;
+
+           while (!dlist_is_empty(&my_aio->local_completed))
+           {
+               dlist_node *node = dlist_pop_head_node(&my_aio->local_completed);
+               PgAioInProgress *io = dlist_container(PgAioInProgress, io_node, node);
+
+               Assert(my_aio->local_completed_count > 0);
+               my_aio->local_completed_count--;
+
+               pgaio_io_call_local_callback(io, in_error);
+           }
+
+           local_callback_depth--;
+       }
+   }
+}
+
+/*
+ * Drain completions from the ring, transfer foreign completions into the
+ * local queue and, unless prevented, run local completion callbacks.
+ */
+static int  __attribute__((noinline))
+pgaio_drain(PgAioContext *context, bool in_error, bool call_local)
+{
+   int ndrained = 0;
+
+   ndrained = pgaio_uring_drain(context);
+
+   /*
+    * Transfer all the foreign completions into the local queue.
+    */
+   if (my_aio->foreign_completed_count != 0)
+   {
+       SpinLockAcquire(&my_aio->foreign_completed_lock);
+
+       while (!dlist_is_empty(&my_aio->foreign_completed))
+       {
+           dlist_node *node = dlist_pop_head_node(&my_aio->foreign_completed);
+           PgAioInProgress *io = dlist_container(PgAioInProgress, io_node, node);
+
+           Assert(!(io->flags & PGAIOIP_LOCAL_CALLBACK_CALLED));
+
+           dlist_push_tail(&my_aio->local_completed, &io->io_node);
+           io->flags &= ~PGAIOIP_FOREIGN_DONE;
+           my_aio->foreign_completed_count--;
+           my_aio->local_completed_count++;
+       }
+       SpinLockRelease(&my_aio->foreign_completed_lock);
+   }
+
+   /*
+    * Call all pending local callbacks.
+    */
+   if (call_local && CritSectionCount == 0)
+       pgaio_call_local_callbacks(in_error);
+
+   return ndrained;
+}
+
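+/*
+ * Can request "cur" be merged into the immediately preceding request
+ * "last", i.e. submitted to the kernel as one larger IO? That requires the
+ * same kind of operation on the same file descriptor, physically adjacent
+ * regions, and that neither request is a retry.
+ */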
+static bool
+pgaio_can_be_combined(PgAioInProgress *last, PgAioInProgress *cur)
+{
+   if (last->type != cur->type)
+       return false;
+
+   if (last->flags & PGAIOIP_RETRY ||
+       cur->flags & PGAIOIP_RETRY)
+       return false;
+
+   switch (last->type)
+   {
+       case PGAIO_INVALID:
+           elog(ERROR, "unexpected IO type %d", last->type);
+           break;
+
+       case PGAIO_READ_BUFFER:
+           if (last->d.read_buffer.fd != cur->d.read_buffer.fd)
+               return false;
+           if ((last->d.read_buffer.offset + last->d.read_buffer.nbytes) != cur->d.read_buffer.offset)
+               return false;
+           if (last->d.read_buffer.mode != cur->d.read_buffer.mode)
+               return false;
+           if (last->d.read_buffer.already_done != 0 || cur->d.read_buffer.already_done != 0)
+               return false;
+
+           return true;
+
+       case PGAIO_NOP:
+       case PGAIO_FLUSH_RANGE:
+       case PGAIO_FSYNC:
+       case PGAIO_FSYNC_WAL:
+           return false;
+
+       case PGAIO_WRITE_BUFFER:
+           if (last->d.write_buffer.fd != cur->d.write_buffer.fd)
+               return false;
+           if ((last->d.write_buffer.offset + last->d.write_buffer.nbytes) != cur->d.write_buffer.offset)
+               return false;
+           if (last->d.write_buffer.already_done != 0 || cur->d.write_buffer.already_done != 0)
+               return false;
+           return true;
+
+       case PGAIO_WRITE_WAL:
+           if (last->d.write_wal.fd != cur->d.write_wal.fd)
+               return false;
+           if ((last->d.write_wal.offset + last->d.write_wal.nbytes) != cur->d.write_wal.offset)
+               return false;
+           if (last->d.write_wal.already_done != 0 || cur->d.write_wal.already_done != 0)
+               return false;
+           if (last->d.write_wal.no_reorder || cur->d.write_wal.no_reorder)
+               return false;
+           return true;
+
+       case PGAIO_WRITE_GENERIC:
+           if (last->d.write_generic.fd != cur->d.write_generic.fd)
+               return false;
+           if ((last->d.write_generic.offset + last->d.write_generic.nbytes) != cur->d.write_generic.offset)
+               return false;
+           if (last->d.write_generic.already_done != 0 || cur->d.write_generic.already_done != 0)
+               return false;
+           if (last->d.write_generic.no_reorder || cur->d.write_generic.no_reorder)
+               return false;
+           return true;
+   }
+
+   pg_unreachable();
+}
+
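+/*
+ * Chain "tomerge" onto "into" so both can be submitted as a single larger
+ * IO.
+ */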
+static void
+pgaio_io_merge(PgAioInProgress *into, PgAioInProgress *tomerge)
+{
+   ereport(DEBUG3,
+           errmsg("merging %zu to %zu",
+                  tomerge - aio_ctl->in_progress_io,
+                  into - aio_ctl->in_progress_io),
+           errhidestmt(true),
+           errhidecontext(true));
+
+   into->merge_with = tomerge;
+   into->flags |= PGAIOIP_MERGE;
+}
+
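+/*
+ * Walk the pending list and merge adjacent combinable requests, chaining at
+ * most PGAIO_MAX_COMBINE IOs together.
+ */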
+static void
+pgaio_combine_pending(void)
+{
+   dlist_iter iter;
+   PgAioInProgress *last = NULL;
+   int combined = 1;
+
+   Assert(my_aio->pending_count > 1);
+
+   dlist_foreach(iter, &my_aio->pending)
+   {
+       PgAioInProgress *cur = dlist_container(PgAioInProgress, io_node, iter.cur);
+
+       /* can happen when failing partway through io submission */
+       if (cur->merge_with)
+       {
+           elog(DEBUG1, "already merged request (%zu), giving up on merging",
+                cur - aio_ctl->in_progress_io);
+           return;
+       }
+
+       Assert(cur->merge_with == NULL);
+       Assert(!(cur->flags & PGAIOIP_MERGE));
+
+       if (last == NULL)
+       {
+           last = cur;
+           continue;
+       }
+
+       if (pgaio_can_be_combined(last, cur))
+       {
+           combined++;
+
+           pgaio_io_merge(last, cur);
+       }
+       else
+       {
+           combined = 1;
+       }
+
+       if (combined >= PGAIO_MAX_COMBINE)
+       {
+           ereport(DEBUG3,
+                   errmsg("max combine at %d", combined),
+                   errhidestmt(true),
+                   errhidecontext(true));
+           last = NULL;
+           combined = 1;
+       }
+       else
+           last = cur;
+   }
+}
+
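+/*
+ * Submit this backend's pending IOs, after trying to combine adjacent
+ * requests, while keeping the number of in-flight IOs below the per-backend
+ * limit. If "drain" is true, local completion callbacks are run once
+ * submission is finished.
+ */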
+void  __attribute__((noinline))
+pgaio_submit_pending(bool drain)
+{
+   int total_submitted = 0;
+   uint32 orig_total;
+
+   if (!aio_ctl || !my_aio)
+       return;
+
+   if (my_aio->pending_count == 0)
+   {
+       Assert(dlist_is_empty(&my_aio->pending));
+       return;
+   }
+
+   HOLD_INTERRUPTS();
+
+   orig_total = my_aio->pending_count;
+
+#define COMBINE_ENABLED
+
+#ifdef COMBINE_ENABLED
+#if 0
+   ereport(LOG, errmsg("before combine"),
+           errhidestmt(true),
+           errhidecontext(true));
+   pgaio_print_list(&my_aio->pending, NULL, offsetof(PgAioInProgress, io_node));
+#endif
+   if (my_aio->pending_count > 1 && PGAIO_MAX_COMBINE > 1)
+       pgaio_combine_pending();
+
+#if 0
+   ereport(LOG, errmsg("after combine"),
+           errhidestmt(true),
+           errhidecontext(true));
+   pgaio_print_list(&my_aio->pending, NULL, offsetof(PgAioInProgress, io_node));
+#endif
+#endif /* COMBINE_ENABLED */
+
+   /*
+    * Loop until all pending IOs are submitted. Throttle max in-flight before
+    * calling into the IO implementation specific routine, so this code can
+    * be shared.
+    */
+   while (!dlist_is_empty(&my_aio->pending))
+   {
+       int max_submit;
+       int did_submit;
+
+       Assert(my_aio->pending_count > 0);
+       pgaio_apply_backend_limit();
+
+       /* waiting above may have recursively submitted pending IOs */
+       if (my_aio->pending_count == 0)
+           break;
+
+       max_submit = Min(my_aio->pending_count, PGAIO_SUBMIT_BATCH_SIZE);
+       max_submit = Min(max_submit, io_max_concurrency - pg_atomic_read_u32(&my_aio->inflight_count));
+       Assert(max_submit > 0);
+
+       START_CRIT_SECTION();
+       did_submit = pgaio_uring_submit(max_submit, drain);
+       total_submitted += did_submit;
+       Assert(did_submit > 0 && did_submit <= max_submit);
+       END_CRIT_SECTION();
+   }
+
+   my_aio->executed_total_count += orig_total;
+   my_aio->issued_total_count += total_submitted;
+
+#ifdef PGAIO_VERBOSE
+   ereport(DEBUG3,
+           errmsg("submitted %d (orig %d)", total_submitted, orig_total),
+           errhidestmt(true),
+           errhidecontext(true));
+#endif
+
+   RESUME_INTERRUPTS();
+
+   if (drain)
+       pgaio_call_local_callbacks(/* in_error = */ false);
+}
+
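+/*
+ * Mark an IO (and all IOs merged into it) as in-flight on the given ring
+ * and move it from the pending list to the issued (or issued_abandoned)
+ * list, just before it is handed to the kernel.
+ */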
+static void
+pgaio_io_prepare_submit(PgAioInProgress *io, uint32 ring)
+{
+   PgAioInProgress *cur;
+
+   cur = io;
+
+   while (cur)
+   {
+       Assert(cur->flags & PGAIOIP_PENDING);
+
+       cur->ring = ring;
+
+       pg_write_barrier();
+
+       *(volatile PgAioIPFlags*) &cur->flags =
+           (cur->flags & ~PGAIOIP_PENDING) | PGAIOIP_INFLIGHT;
+
+       dlist_delete_from(&my_aio->pending, &cur->io_node);
+       my_aio->pending_count--;
+
+       if (cur->flags & PGAIOIP_RETRY)
+       {
+           /* XXX: more error checks */
+       }
+       else if (cur->user_referenced)
+       {
+           Assert(my_aio_id == cur->owner_id);
+           Assert(my_aio->outstanding_count > 0);
+           dlist_delete_from(&my_aio->outstanding, &cur->owner_node);
+           my_aio->outstanding_count--;
+
+           dlist_push_tail(&my_aio->issued, &cur->owner_node);
+           my_aio->issued_count++;
+       }
+       else
+       {
+#ifdef PGAIO_VERBOSE
+           ereport(DEBUG4,
+                   errmsg("putting aio %zu onto issued_abandoned during submit",
+                          cur - aio_ctl->in_progress_io),
+                   errhidecontext(1),
+                   errhidestmt(1));
+#endif
+
+           LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+           dlist_push_tail(&my_aio->issued_abandoned, &cur->owner_node);
+           my_aio->issued_abandoned_count++;
+           LWLockRelease(SharedAIOCtlLock);
+       }
+
+       ereport(DEBUG5,
+               errmsg("readied %zu/%llu for submit",
+                      cur - aio_ctl->in_progress_io,
+                      (unsigned long long ) cur->generation),
+               errhidecontext(1),
+               errhidestmt(1));
+
+       cur = cur->merge_with;
+   }
+}
+
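+/*
+ * Enforce the per-backend in-flight limit (io_max_concurrency) by waiting
+ * for some of this backend's issued or issued_abandoned IOs to complete.
+ */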
+static void
+pgaio_apply_backend_limit(void)
+{
+   uint32 current_inflight = pg_atomic_read_u32(&my_aio->inflight_count);
+
+   while (current_inflight >= io_max_concurrency)
+   {
+       PgAioInProgress *io;
+
+       /*
+        * XXX: Should we be a bit fairer and check the "oldest" in-flight IO
+        * between issued and issued_abandoned?
+        */
+
+       if (my_aio->issued_count > 0)
+       {
+           dlist_iter iter;
+
+           Assert(!dlist_is_empty(&my_aio->issued));
+
+           dlist_foreach(iter, &my_aio->issued)
+           {
+               io = dlist_container(PgAioInProgress, owner_node, iter.cur);
+
+               if (io->flags & PGAIOIP_INFLIGHT)
+               {
+                   PgAioIoRef ref;
+
+                   ereport(DEBUG2,
+                           errmsg("applying per-backend limit to issued IO %zu/%llu (current %d in %d, target %d)",
+                                  io - aio_ctl->in_progress_io,
+                                  (unsigned long long) io->generation,
+                                  my_aio->issued_count + my_aio->issued_abandoned_count,
+                                  current_inflight,
+                                  io_max_concurrency),
+                           errhidestmt(true),
+                           errhidecontext(true));
+
+                   pgaio_io_ref(io, &ref);
+                   pgaio_io_wait_ref(&ref, /* call_local = */ false);
+                   current_inflight = pg_atomic_read_u32(&my_aio->inflight_count);
+                   break;
+               }
+           }
+       }
+
+       if (current_inflight < io_max_concurrency)
+           break;
+
+       if (my_aio->issued_abandoned_count > 0)
+       {
+           dlist_iter iter;
+           PgAioIoRef ref;
+
+           io = NULL;
+
+           LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+           dlist_foreach(iter, &my_aio->issued_abandoned)
+           {
+               io = dlist_container(PgAioInProgress, owner_node, iter.cur);
+
+               if (io->flags & PGAIOIP_INFLIGHT)
+               {
+                   pgaio_io_ref_internal(io, &ref);
+                   break;
+               }
+               else
+                   io = NULL;
+           }
+           LWLockRelease(SharedAIOCtlLock);
+
+           if (io == NULL)
+               continue;
+
+           ereport(DEBUG2,
+                   errmsg("applying per-backend limit to issued_abandoned IO %zu/%llu (current %d in %d, target %d)",
+                          io - aio_ctl->in_progress_io,
+                          (unsigned long long) io->generation,
+                          my_aio->issued_count + my_aio->issued_abandoned_count,
+                          current_inflight,
+                          io_max_concurrency),
+                   errhidestmt(true),
+                   errhidecontext(true));
+
+           pgaio_io_wait_ref(&ref, false);
+       }
+
+       current_inflight = pg_atomic_read_u32(&my_aio->inflight_count);
+   }
+}
+
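+/*
+ * Wait until the IO referenced by "ref" has completed, or until the
+ * referenced generation has been recycled. Soft failures are retried; if
+ * "call_local" is true and the caller owns the IO, its local completion
+ * callback is run as well.
+ */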
+void
+pgaio_io_wait_ref(PgAioIoRef *ref, bool call_local)
+{
+   uint64 ref_generation;
+   PgAioInProgress *io;
+   uint32 done_flags = PGAIOIP_DONE;
+   PgAioIPFlags flags;
+   bool am_owner;
+
+   Assert(ref->aio_index < max_aio_in_progress);
+
+   io = &aio_ctl->in_progress_io[ref->aio_index];
+   ref_generation = ((uint64) ref->generation_upper) << 32 |
+       ref->generation_lower;
+
+   Assert(ref_generation != 0);
+
+   am_owner = io->owner_id == my_aio_id;
+   flags = io->flags;
+   pg_read_barrier();
+
+   if (io->generation != ref_generation)
+       return;
+
+   if (am_owner && (flags & PGAIOIP_PENDING))
+       pgaio_submit_pending(false);
+
+   Assert(!(flags & (PGAIOIP_UNUSED)));
+
+   while (true)
+   {
+       PgAioContext *context;
+
+       flags = io->flags;
+       context = &aio_ctl->contexts[io->ring];
+       pg_read_barrier();
+
+       if (io->generation != ref_generation)
+           return;
+
+       if (flags & done_flags)
+           goto wait_ref_out;
+
+       Assert(!(flags & (PGAIOIP_UNUSED)));
+
+       if (flags & PGAIOIP_INFLIGHT)
+       {
+           pgaio_drain(context, false, call_local);
+
+           flags = io->flags;
+           context = &aio_ctl->contexts[io->ring];
+           pg_read_barrier();
+
+           if (io->generation != ref_generation)
+               return;
+
+           if (flags & done_flags)
+               goto wait_ref_out;
+       }
+
+       if (my_aio->pending_count > 0 && call_local)
+       {
+           /* FIXME: we should call this in a larger number of cases */
+
+           /*
+            * If we would otherwise have to sleep, submit all pending
+            * requests, to avoid others having to wait for us to submit
+            * them. We don't want to do so when not needing to sleep, as
+            * submitting IOs in smaller increments can be less efficient.
+            */
+           pgaio_submit_pending(false);
+       }
+       else if (flags & PGAIOIP_INFLIGHT)
+       {
+           /* note that this is allowed to spuriously return */
+           pgaio_uring_wait_one(context, io, ref_generation, WAIT_EVENT_AIO_IO_COMPLETE_ANY);
+       }
+       else
+       {
+           /* shouldn't be reachable without concurrency */
+           Assert(IsUnderPostmaster);
+
+           /* ensure we're going to get woken up */
+           if (IsUnderPostmaster)
+               ConditionVariablePrepareToSleep(&io->cv);
+
+           flags = io->flags;
+           pg_read_barrier();
+           if (io->generation == ref_generation && !(flags & done_flags))
+               ConditionVariableSleep(&io->cv, WAIT_EVENT_AIO_IO_COMPLETE_ONE);
+
+           if (IsUnderPostmaster)
+               ConditionVariableCancelSleepEx(true);
+       }
+   }
+
+wait_ref_out:
+
+   flags = io->flags;
+   pg_read_barrier();
+   if (io->generation != ref_generation)
+       return;
+
+   Assert(flags & PGAIOIP_DONE);
+
+   if (unlikely(flags & (PGAIOIP_SOFT_FAILURE | PGAIOIP_HARD_FAILURE)))
+   {
+       /* can retry soft failures, but not hard ones */
+       /* FIXME: limit number of soft retries */
+       if (flags & PGAIOIP_SOFT_FAILURE)
+       {
+           pgaio_io_retry(io);
+           pgaio_io_wait_ref(ref, call_local);
+       }
+       else
+       {
+           pgaio_io_print(io, NULL);
+           elog(WARNING, "request %zd failed permanently",
+                io - aio_ctl->in_progress_io);
+       }
+
+       return;
+   }
+
+   if (am_owner && call_local && !(flags & PGAIOIP_LOCAL_CALLBACK_CALLED))
+   {
+       if (flags & PGAIOIP_FOREIGN_DONE)
+       {
+           SpinLockAcquire(&my_aio->foreign_completed_lock);
+           dlist_delete_from(&my_aio->foreign_completed, &io->io_node);
+           io->flags &= ~PGAIOIP_FOREIGN_DONE;
+           my_aio->foreign_completed_count--;
+           SpinLockRelease(&my_aio->foreign_completed_lock);
+       }
+       else
+       {
+           Assert(my_aio->local_completed_count > 0);
+           dlist_delete_from(&my_aio->local_completed, &io->io_node);
+           my_aio->local_completed_count--;