--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * tidbitmap.c
+ * PostgreSQL tuple-id (TID) bitmap package
+ *
+ * This module provides bitmap data structures that are spiritually
+ * similar to Bitmapsets, but are specially adapted to store sets of
+ * tuple identifiers (TIDs), or ItemPointers. In particular, the division
+ * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
+ * Also, since we wish to be able to store very large tuple sets in
+ * memory with this data structure, we support "lossy" storage, in which
+ * we no longer remember individual tuple offsets on a page but only the
+ * fact that a particular page needs to be visited.
+ *
+ * The "lossy" storage uses one bit per disk page, so at the standard 8K
+ * BLCKSZ, we can represent all pages in 64GB of disk space in about 1MB
+ * of memory. People pushing around tables of that size should have a
+ * couple of MB to spare, so we don't worry about providing a second level
+ * of lossiness. In theory we could fall back to page ranges at some
+ * point, but for now that seems useless complexity.
+ *
+ *
+ * Copyright (c) 2003-2005, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/htup.h"
+#include "nodes/tidbitmap.h"
+#include "utils/hsearch.h"
+
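+/*
+ * Sketch of the intended calling sequence (names like "tids"/"ntids"/"res"
+ * are placeholders; the real callers are the executor's bitmap scan nodes):
+ *
+ *		TIDBitmap  *tbm = tbm_create(maxbytes);
+ *		TBMIterateResult *res;
+ *
+ *		tbm_add_tuples(tbm, tids, ntids);	... repeated as TIDs arrive
+ *		tbm_begin_iterate(tbm);
+ *		while ((res = tbm_iterate(tbm)) != NULL)
+ *			... visit page res->blockno; res->ntuples < 0 means lossy ...
+ *		tbm_free(tbm);
+ */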
+
+/*
+ * The maximum number of tuples per page is not large (typically 256 with
+ * 8K pages, or 1024 with 32K pages). So there's not much point in making
+ * the per-page bitmaps variable size. We just legislate that the size
+ * is this:
+ */
+#define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage
+
+/*
+ * When we have to switch over to lossy storage, we use a data structure
+ * with one bit per page, where all pages having the same number DIV
+ * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
+ * and has the bit set for a given page, there must not be a per-page entry
+ * for that page in the page table.
+ *
+ * We actually store both exact pages and lossy chunks in the same hash
+ * table, using identical data structures. (This is because dynahash.c's
+ * memory management doesn't allow space to be transferred easily from one
+ * hashtable to another.) Therefore it's best if PAGES_PER_CHUNK is the
+ * same as MAX_TUPLES_PER_PAGE, or at least not too different. But we
+ * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer
+ * remainder operations. So, define it like this:
+ */
+#define PAGES_PER_CHUNK (BLCKSZ / 32)
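+
+/*
+ * For example, with the default 8K BLCKSZ, PAGES_PER_CHUNK is 256: page 1000
+ * falls in the chunk whose header blockno is 768, and is represented there
+ * by bit number 1000 % 256 = 232.
+ */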
+
+/* The bitmap unit size can be adjusted by changing these declarations: */
+#define BITS_PER_BITMAPWORD 32
+typedef uint32 bitmapword; /* must be an unsigned type */
+
+#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
+#define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
+
+/* number of active words for an exact page: */
+#define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
+/* number of active words for a lossy chunk: */
+#define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
+
+/*
+ * The hashtable entries are represented by this data structure. For
+ * an exact page, blockno is the page number and bit k of the bitmap
+ * represents tuple offset k+1. For a lossy chunk, blockno is the first
+ * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
+ * bit k represents page blockno+k. Note that it is not possible to
+ * have exact storage for the first page of a chunk if we are using
+ * lossy storage for any page in the chunk's range, since the same
+ * hashtable entry has to serve both purposes.
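+ *
+ * For example, an exact entry with blockno = 42 and bit 6 set represents
+ * TID (42,7), while (with 8K BLCKSZ) a lossy chunk entry with blockno = 512
+ * and bit 3 set says that page 515 must be visited in its entirety.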
+ */
+typedef struct PagetableEntry
+{
+ BlockNumber blockno; /* page number (hashtable key) */
+ bool ischunk; /* T = lossy storage, F = exact */
+ bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
+} PagetableEntry;
+
+/*
+ * dynahash.c is optimized for relatively large, long-lived hash tables.
+ * This is not ideal for TIDBitMap, particularly when we are using a bitmap
+ * scan on the inside of a nestloop join: a bitmap may well live only long
+ * enough to accumulate one entry in such cases. We therefore avoid creating
+ * an actual hashtable until we need two pagetable entries. When just one
+ * pagetable entry is needed, we store it in a fixed field of TIDBitMap.
+ * (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down
+ * to zero or one page again. So, status can be TBM_HASH even when nentries
+ * is zero or one.)
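+ *
+ * The resulting lifecycle is: a new bitmap starts as TBM_EMPTY; the first
+ * page added switches it to TBM_ONE_PAGE using entry1; and as soon as a
+ * second page or any lossy chunk is needed, tbm_create_pagetable is called
+ * and the status becomes TBM_HASH for good.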
+ */
+typedef enum
+{
+ TBM_EMPTY, /* no hashtable, nentries == 0 */
+ TBM_ONE_PAGE, /* entry1 contains the single entry */
+ TBM_HASH /* pagetable is valid, entry1 is not */
+} TBMStatus;
+
+/*
+ * Here is the representation for a whole TIDBitMap:
+ */
+struct TIDBitmap
+{
+ NodeTag type; /* to make it a valid Node */
+ MemoryContext mcxt; /* memory context containing me */
+ TBMStatus status; /* see codes above */
+ HTAB *pagetable; /* hash table of PagetableEntry's */
+ int nentries; /* number of entries in pagetable */
+ int maxentries; /* limit on same to meet maxbytes */
+ int npages; /* number of exact entries in pagetable */
+ int nchunks; /* number of lossy entries in pagetable */
+ bool iterating; /* tbm_begin_iterate called? */
+ PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
+ /* the remaining fields are used while producing sorted output: */
+ PagetableEntry **spages; /* sorted exact-page list, or NULL */
+ PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
+ int spageptr; /* next spages index */
+ int schunkptr; /* next schunks index */
+ int schunkbit; /* next bit to check in current schunk */
+ TBMIterateResult output; /* MUST BE LAST (because variable-size) */
+};
+
+
+/* Local function prototypes */
+static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
+static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
+ const TIDBitmap *b);
+static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
+ BlockNumber pageno);
+static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
+static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
+static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
+static void tbm_lossify(TIDBitmap *tbm);
+static int tbm_comparator(const void *left, const void *right);
+
+
+/*
+ * tbm_create - create an initially-empty bitmap
+ *
+ * The bitmap will live in the memory context that is CurrentMemoryContext
+ * at the time of this call. It will be limited to (approximately) maxbytes
+ * total memory consumption.
+ */
+TIDBitmap *
+tbm_create(long maxbytes)
+{
+ TIDBitmap *tbm;
+ long nbuckets;
+
+ /*
+ * Create the TIDBitmap struct, with enough trailing space to serve the
+ * needs of the TBMIterateResult sub-struct.
+ */
+ tbm = (TIDBitmap *) palloc(sizeof(TIDBitmap) +
+ MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
+ /* Zero all the fixed fields */
+ MemSetAligned(tbm, 0, sizeof(TIDBitmap));
+
+ tbm->type = T_TIDBitmap; /* Set NodeTag */
+ tbm->mcxt = CurrentMemoryContext;
+ tbm->status = TBM_EMPTY;
+
+ /*
+ * Estimate number of hashtable entries we can have within maxbytes. This
+ * estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) plus a
+ * pointer per hash entry, which is crude but good enough for our purpose.
+ * Also count an extra Pointer per entry for the arrays created during
+ * iteration readout.
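+ * For illustration only: if those terms added up to roughly 100 bytes per
+ * entry, a maxbytes of 1MB would permit on the order of 10000 entries.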
+ */
+ nbuckets = maxbytes /
+ (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry))
+ + sizeof(Pointer) + sizeof(Pointer));
+ nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
+ nbuckets = Max(nbuckets, 16); /* sanity limit */
+ tbm->maxentries = (int) nbuckets;
+
+ return tbm;
+}
+
+/*
+ * Actually create the hashtable. Since this is a moderately expensive
+ * proposition, we don't do it until we have to.
+ */
+static void
+tbm_create_pagetable(TIDBitmap *tbm)
+{
+ HASHCTL hash_ctl;
+
+ Assert(tbm->status != TBM_HASH);
+ Assert(tbm->pagetable == NULL);
+
+ /* Create the hashtable proper */
+ MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(BlockNumber);
+ hash_ctl.entrysize = sizeof(PagetableEntry);
+ hash_ctl.hash = tag_hash;
+ hash_ctl.hcxt = tbm->mcxt;
+ tbm->pagetable = hash_create("TIDBitmap",
+ 128, /* start small and extend */
+ &hash_ctl,
+ HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
+ /* If entry1 is valid, push it into the hashtable */
+ if (tbm->status == TBM_ONE_PAGE)
+ {
+ PagetableEntry *page;
+ bool found;
+
+ page = (PagetableEntry *) hash_search(tbm->pagetable,
+ (void *) &tbm->entry1.blockno,
+ HASH_ENTER, &found);
+ Assert(!found);
+ memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
+ }
+
+ tbm->status = TBM_HASH;
+}
+
+/*
+ * tbm_free - free a TIDBitmap
+ */
+void
+tbm_free(TIDBitmap *tbm)
+{
+ if (tbm->pagetable)
+ hash_destroy(tbm->pagetable);
+ if (tbm->spages)
+ pfree(tbm->spages);
+ if (tbm->schunks)
+ pfree(tbm->schunks);
+ pfree(tbm);
+}
+
+/*
+ * tbm_add_tuples - add some tuple IDs to a TIDBitmap
+ */
+void
+tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids)
+{
+ int i;
+
+ Assert(!tbm->iterating);
+ for (i = 0; i < ntids; i++)
+ {
+ BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
+ OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
+ PagetableEntry *page;
+ int wordnum,
+ bitnum;
+
+ /* safety check to ensure we don't overrun bit array bounds */
+ if (off < 1 || off > MAX_TUPLES_PER_PAGE)
+ elog(ERROR, "tuple offset out of range: %u", off);
+
+ if (tbm_page_is_lossy(tbm, blk))
+ continue; /* whole page is already marked */
+
+ page = tbm_get_pageentry(tbm, blk);
+
+ if (page->ischunk)
+ {
+ /* The page is a lossy chunk header, set bit for itself */
+ wordnum = bitnum = 0;
+ }
+ else
+ {
+ /* Page is exact, so set bit for individual tuple */
+ wordnum = WORDNUM(off - 1);
+ bitnum = BITNUM(off - 1);
+ }
+ page->words[wordnum] |= ((bitmapword) 1 << bitnum);
+
+ if (tbm->nentries > tbm->maxentries)
+ tbm_lossify(tbm);
+ }
+}
+
+/*
+ * tbm_union - set union
+ *
+ * a is modified in-place, b is not changed
+ */
+void
+tbm_union(TIDBitmap *a, const TIDBitmap *b)
+{
+ Assert(!a->iterating);
+ /* Nothing to do if b is empty */
+ if (b->nentries == 0)
+ return;
+ /* Scan through chunks and pages in b, merge into a */
+ if (b->status == TBM_ONE_PAGE)
+ tbm_union_page(a, &b->entry1);
+ else
+ {
+ HASH_SEQ_STATUS status;
+ PagetableEntry *bpage;
+
+ Assert(b->status == TBM_HASH);
+ hash_seq_init(&status, b->pagetable);
+ while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+ tbm_union_page(a, bpage);
+ }
+}
+
+/* Process one page of b during a union op */
+static void
+tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
+{
+ PagetableEntry *apage;
+ int wordnum;
+
+ if (bpage->ischunk)
+ {
+ /* Scan b's chunk, mark each indicated page lossy in a */
+ for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
+ {
+ bitmapword w = bpage->words[wordnum];
+
+ if (w != 0)
+ {
+ BlockNumber pg;
+
+ pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
+ while (w != 0)
+ {
+ if (w & 1)
+ tbm_mark_page_lossy(a, pg);
+ pg++;
+ w >>= 1;
+ }
+ }
+ }
+ }
+ else if (tbm_page_is_lossy(a, bpage->blockno))
+ {
+ /* page is already lossy in a, nothing to do */
+ return;
+ }
+ else
+ {
+ apage = tbm_get_pageentry(a, bpage->blockno);
+ if (apage->ischunk)
+ {
+ /* The page is a lossy chunk header, set bit for itself */
+ apage->words[0] |= ((bitmapword) 1 << 0);
+ }
+ else
+ {
+ /* Both pages are exact, merge at the bit level */
+ for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
+ apage->words[wordnum] |= bpage->words[wordnum];
+ }
+ }
+
+ if (a->nentries > a->maxentries)
+ tbm_lossify(a);
+}
+
+/*
+ * tbm_intersect - set intersection
+ *
+ * a is modified in-place, b is not changed
+ */
+void
+tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
+{
+ Assert(!a->iterating);
+ /* Nothing to do if a is empty */
+ if (a->nentries == 0)
+ return;
+ /* Scan through chunks and pages in a, try to match to b */
+ if (a->status == TBM_ONE_PAGE)
+ {
+ if (tbm_intersect_page(a, &a->entry1, b))
+ {
+ /* Page is now empty, remove it from a */
+ Assert(!a->entry1.ischunk);
+ a->npages--;
+ a->nentries--;
+ Assert(a->nentries == 0);
+ a->status = TBM_EMPTY;
+ }
+ }
+ else
+ {
+ HASH_SEQ_STATUS status;
+ PagetableEntry *apage;
+
+ Assert(a->status == TBM_HASH);
+ hash_seq_init(&status, a->pagetable);
+ while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+ {
+ if (tbm_intersect_page(a, apage, b))
+ {
+ /* Page or chunk is now empty, remove it from a */
+ if (apage->ischunk)
+ a->nchunks--;
+ else
+ a->npages--;
+ a->nentries--;
+ if (hash_search(a->pagetable,
+ (void *) &apage->blockno,
+ HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "hash table corrupted");
+ }
+ }
+ }
+}
+
+/*
+ * Process one page of a during an intersection op
+ *
+ * Returns TRUE if apage is now empty and should be deleted from a
+ */
+static bool
+tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
+{
+ const PagetableEntry *bpage;
+ int wordnum;
+
+ if (apage->ischunk)
+ {
+ /* Scan each bit in chunk, try to clear */
+ bool candelete = true;
+
+ for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
+ {
+ bitmapword w = apage->words[wordnum];
+
+ if (w != 0)
+ {
+ bitmapword neww = w;
+ BlockNumber pg;
+ int bitnum;
+
+ pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
+ bitnum = 0;
+ while (w != 0)
+ {
+ if (w & 1)
+ {
+ if (!tbm_page_is_lossy(b, pg) &&
+ tbm_find_pageentry(b, pg) == NULL)
+ {
+ /* Page is not in b at all, lose lossy bit */
+ neww &= ~((bitmapword) 1 << bitnum);
+ }
+ }
+ pg++;
+ bitnum++;
+ w >>= 1;
+ }
+ apage->words[wordnum] = neww;
+ if (neww != 0)
+ candelete = false;
+ }
+ }
+ return candelete;
+ }
+ else if (tbm_page_is_lossy(b, apage->blockno))
+ {
+ /*
+ * When the page is lossy in b, we have to mark it lossy in a too. We
+ * know that no bits need be set in bitmap a, but we do not know which
+ * ones should be cleared, and we have no API for "at most these
+ * tuples need be checked". (Perhaps it's worth adding that?)
+ */
+ tbm_mark_page_lossy(a, apage->blockno);
+
+ /*
+ * Note: tbm_mark_page_lossy will have removed apage from a, and may
+ * have inserted a new lossy chunk instead. We can continue the same
+ * seq_search scan at the caller level, because it does not matter
+ * whether we visit such a new chunk or not: it will have only the bit
+ * for apage->blockno set, which is correct.
+ *
+ * We must return false here since apage was already deleted.
+ */
+ return false;
+ }
+ else
+ {
+ bool candelete = true;
+
+ bpage = tbm_find_pageentry(b, apage->blockno);
+ if (bpage != NULL)
+ {
+ /* Both pages are exact, merge at the bit level */
+ Assert(!bpage->ischunk);
+ for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
+ {
+ apage->words[wordnum] &= bpage->words[wordnum];
+ if (apage->words[wordnum] != 0)
+ candelete = false;
+ }
+ }
+ return candelete;
+ }
+}
+
+/*
+ * tbm_is_empty - is a TIDBitmap completely empty?
+ */
+bool
+tbm_is_empty(const TIDBitmap *tbm)
+{
+ return (tbm->nentries == 0);
+}
+
+/*
+ * tbm_begin_iterate - prepare to iterate through a TIDBitmap
+ *
+ * NB: after this is called, it is no longer allowed to modify the contents
+ * of the bitmap. However, you can call this multiple times to scan the
+ * contents repeatedly.
+ */
+void
+tbm_begin_iterate(TIDBitmap *tbm)
+{
+ HASH_SEQ_STATUS status;
+ PagetableEntry *page;
+ int npages;
+ int nchunks;
+
+ tbm->iterating = true;
+
+ /*
+ * Reset iteration pointers.
+ */
+ tbm->spageptr = 0;
+ tbm->schunkptr = 0;
+ tbm->schunkbit = 0;
+
+ /*
+ * Nothing else to do if no entries, nor if we don't have a hashtable.
+ */
+ if (tbm->nentries == 0 || tbm->status != TBM_HASH)
+ return;
+
+ /*
+ * Create and fill the sorted page lists if we didn't already.
+ */
+ if (!tbm->spages && tbm->npages > 0)
+ tbm->spages = (PagetableEntry **)
+ MemoryContextAlloc(tbm->mcxt,
+ tbm->npages * sizeof(PagetableEntry *));
+ if (!tbm->schunks && tbm->nchunks > 0)
+ tbm->schunks = (PagetableEntry **)
+ MemoryContextAlloc(tbm->mcxt,
+ tbm->nchunks * sizeof(PagetableEntry *));
+
+ hash_seq_init(&status, tbm->pagetable);
+ npages = nchunks = 0;
+ while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+ {
+ if (page->ischunk)
+ tbm->schunks[nchunks++] = page;
+ else
+ tbm->spages[npages++] = page;
+ }
+ Assert(npages == tbm->npages);
+ Assert(nchunks == tbm->nchunks);
+ if (npages > 1)
+ qsort(tbm->spages, npages, sizeof(PagetableEntry *), tbm_comparator);
+ if (nchunks > 1)
+ qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), tbm_comparator);
+}
+
+/*
+ * tbm_iterate - scan through next page of a TIDBitmap
+ *
+ * Returns a TBMIterateResult representing one page, or NULL if there are
+ * no more pages to scan. Pages are guaranteed to be delivered in numerical
+ * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
+ * remember the exact tuples to look at on this page --- the caller must
+ * examine all tuples on the page and check if they meet the intended
+ * condition.
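+ *
+ * A typical consumer loop (sketch only) looks like:
+ *
+ *		while ((res = tbm_iterate(tbm)) != NULL)
+ *		{
+ *			if (res->ntuples >= 0)
+ *				... check only res->offsets[0 .. res->ntuples-1] ...
+ *			else
+ *				... recheck every tuple on page res->blockno ...
+ *		}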
+ */
+TBMIterateResult *
+tbm_iterate(TIDBitmap *tbm)
+{
+ TBMIterateResult *output = &(tbm->output);
+
+ Assert(tbm->iterating);
+
+ /*
+ * If lossy chunk pages remain, make sure we've advanced schunkptr/
+ * schunkbit to the next set bit.
+ */
+ while (tbm->schunkptr < tbm->nchunks)
+ {
+ PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
+ int schunkbit = tbm->schunkbit;
+
+ while (schunkbit < PAGES_PER_CHUNK)
+ {
+ int wordnum = WORDNUM(schunkbit);
+ int bitnum = BITNUM(schunkbit);
+
+ if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
+ break;
+ schunkbit++;
+ }
+ if (schunkbit < PAGES_PER_CHUNK)
+ {
+ tbm->schunkbit = schunkbit;
+ break;
+ }
+ /* advance to next chunk */
+ tbm->schunkptr++;
+ tbm->schunkbit = 0;
+ }
+
+ /*
+ * If both chunk and per-page data remain, must output the numerically
+ * earlier page.
+ */
+ if (tbm->schunkptr < tbm->nchunks)
+ {
+ PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
+ BlockNumber chunk_blockno;
+
+ chunk_blockno = chunk->blockno + tbm->schunkbit;
+ if (tbm->spageptr >= tbm->npages ||
+ chunk_blockno < tbm->spages[tbm->spageptr]->blockno)
+ {
+ /* Return a lossy page indicator from the chunk */
+ output->blockno = chunk_blockno;
+ output->ntuples = -1;
+ tbm->schunkbit++;
+ return output;
+ }
+ }
+
+ if (tbm->spageptr < tbm->npages)
+ {
+ PagetableEntry *page;
+ int ntuples;
+ int wordnum;
+
+ /* In ONE_PAGE state, we don't allocate an spages[] array */
+ if (tbm->status == TBM_ONE_PAGE)
+ page = &tbm->entry1;
+ else
+ page = tbm->spages[tbm->spageptr];
+
+ /* scan bitmap to extract individual offset numbers */
+ ntuples = 0;
+ for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
+ {
+ bitmapword w = page->words[wordnum];
+
+ if (w != 0)
+ {
+ int off = wordnum * BITS_PER_BITMAPWORD + 1;
+
+ while (w != 0)
+ {
+ if (w & 1)
+ output->offsets[ntuples++] = (OffsetNumber) off;
+ off++;
+ w >>= 1;
+ }
+ }
+ }
+ output->blockno = page->blockno;
+ output->ntuples = ntuples;
+ tbm->spageptr++;
+ return output;
+ }
+
+ /* Nothing more in the bitmap */
+ return NULL;
+}
+
+/*
+ * tbm_find_pageentry - find a PagetableEntry for the pageno
+ *
+ * Returns NULL if there is no non-lossy entry for the pageno.
+ */
+static const PagetableEntry *
+tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
+{
+ const PagetableEntry *page;
+
+ if (tbm->nentries == 0) /* in case pagetable doesn't exist */
+ return NULL;
+
+ if (tbm->status == TBM_ONE_PAGE)
+ {
+ page = &tbm->entry1;
+ if (page->blockno != pageno)
+ return NULL;
+ Assert(!page->ischunk);
+ return page;
+ }
+
+ page = (PagetableEntry *) hash_search(tbm->pagetable,
+ (void *) &pageno,
+ HASH_FIND, NULL);
+ if (page == NULL)
+ return NULL;
+ if (page->ischunk)
+ return NULL; /* don't want a lossy chunk header */
+ return page;
+}
+
+/*
+ * tbm_get_pageentry - find or create a PagetableEntry for the pageno
+ *
+ * If new, the entry is marked as an exact (non-chunk) entry.
+ *
+ * This may cause the table to exceed the desired memory size. It is
+ * up to the caller to call tbm_lossify() at the next safe point if so.
+ */
+static PagetableEntry *
+tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
+{
+ PagetableEntry *page;
+ bool found;
+
+ if (tbm->status == TBM_EMPTY)
+ {
+ /* Use the fixed slot */
+ page = &tbm->entry1;
+ found = false;
+ tbm->status = TBM_ONE_PAGE;
+ }
+ else
+ {
+ if (tbm->status == TBM_ONE_PAGE)
+ {
+ page = &tbm->entry1;
+ if (page->blockno == pageno)
+ return page;
+ /* Time to switch from one page to a hashtable */
+ tbm_create_pagetable(tbm);
+ }
+
+ /* Look up or create an entry */
+ page = (PagetableEntry *) hash_search(tbm->pagetable,
+ (void *) &pageno,
+ HASH_ENTER, &found);
+ }
+
+ /* Initialize it if not present before */
+ if (!found)
+ {
+ MemSet(page, 0, sizeof(PagetableEntry));
+ page->blockno = pageno;
+ /* must count it too */
+ tbm->nentries++;
+ tbm->npages++;
+ }
+
+ return page;
+}
+
+/*
+ * tbm_page_is_lossy - is the page marked as lossily stored?
+ */
+static bool
+tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
+{
+ PagetableEntry *page;
+ BlockNumber chunk_pageno;
+ int bitno;
+
+ /* we can skip the lookup if there are no lossy chunks */
+ if (tbm->nchunks == 0)
+ return false;
+ Assert(tbm->status == TBM_HASH);
+
+ bitno = pageno % PAGES_PER_CHUNK;
+ chunk_pageno = pageno - bitno;
+ page = (PagetableEntry *) hash_search(tbm->pagetable,
+ (void *) &chunk_pageno,
+ HASH_FIND, NULL);
+ if (page != NULL && page->ischunk)
+ {
+ int wordnum = WORDNUM(bitno);
+ int bitnum = BITNUM(bitno);
+
+ if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * tbm_mark_page_lossy - mark the page number as lossily stored
+ *
+ * This may cause the table to exceed the desired memory size. It is
+ * up to the caller to call tbm_lossify() at the next safe point if so.
+ */
+static void
+tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
+{
+ PagetableEntry *page;
+ bool found;
+ BlockNumber chunk_pageno;
+ int bitno;
+ int wordnum;
+ int bitnum;
+
+ /* We force the bitmap into hashtable mode whenever it's lossy */
+ if (tbm->status != TBM_HASH)
+ tbm_create_pagetable(tbm);
+
+ bitno = pageno % PAGES_PER_CHUNK;
+ chunk_pageno = pageno - bitno;
+
+ /*
+ * Remove any extant non-lossy entry for the page. If the page is its own
+ * chunk header, however, we skip this and handle the case below.
+ */
+ if (bitno != 0)
+ {
+ if (hash_search(tbm->pagetable,
+ (void *) &pageno,
+ HASH_REMOVE, NULL) != NULL)
+ {
+ /* It was present, so adjust counts */
+ tbm->nentries--;
+ tbm->npages--; /* assume it must have been non-lossy */
+ }
+ }
+
+ /* Look up or create entry for chunk-header page */
+ page = (PagetableEntry *) hash_search(tbm->pagetable,
+ (void *) &chunk_pageno,
+ HASH_ENTER, &found);
+
+ /* Initialize it if not present before */
+ if (!found)
+ {
+ MemSet(page, 0, sizeof(PagetableEntry));
+ page->blockno = chunk_pageno;
+ page->ischunk = true;
+ /* must count it too */
+ tbm->nentries++;
+ tbm->nchunks++;
+ }
+ else if (!page->ischunk)
+ {
+ /* chunk header page was formerly non-lossy, make it lossy */
+ MemSet(page, 0, sizeof(PagetableEntry));
+ page->blockno = chunk_pageno;
+ page->ischunk = true;
+ /* we assume it had some tuple bit(s) set, so mark it lossy */
+ page->words[0] = ((bitmapword) 1 << 0);
+ /* adjust counts */
+ tbm->nchunks++;
+ tbm->npages--;
+ }
+
+ /* Now set the original target page's bit */
+ wordnum = WORDNUM(bitno);
+ bitnum = BITNUM(bitno);
+ page->words[wordnum] |= ((bitmapword) 1 << bitnum);
+}
+
+/*
+ * tbm_lossify - lose some information to get back under the memory limit
+ */
+static void
+tbm_lossify(TIDBitmap *tbm)
+{
+ HASH_SEQ_STATUS status;
+ PagetableEntry *page;
+
+ /*
+ * XXX Really stupid implementation: this just lossifies pages in
+ * essentially random order. We should be paying some attention to the
+ * number of bits set in each page, instead. Also it might be a good idea
+ * to lossify more than the minimum number of pages during each call.
+ */
+ Assert(!tbm->iterating);
+ Assert(tbm->status == TBM_HASH);
+
+ hash_seq_init(&status, tbm->pagetable);
+ while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+ {
+ if (page->ischunk)
+ continue; /* already a chunk header */
+
+ /*
+ * If the page would become a chunk header, we won't save anything by
+ * converting it to lossy, so skip it.
+ */
+ if ((page->blockno % PAGES_PER_CHUNK) == 0)
+ continue;
+
+ /* This does the dirty work ... */
+ tbm_mark_page_lossy(tbm, page->blockno);
+
+ if (tbm->nentries <= tbm->maxentries)
+ {
+ /* we have done enough */
+ hash_seq_term(&status);
+ break;
+ }
+
+ /*
+ * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
+ * hashtable. We can continue the same seq_search scan since we do
+ * not care whether we visit lossy chunks or not.
+ */
+ }
+}
+
+/*
+ * qsort comparator to handle PagetableEntry pointers.
+ */
+static int
+tbm_comparator(const void *left, const void *right)
+{
+ BlockNumber l = (*((const PagetableEntry **) left))->blockno;
+ BlockNumber r = (*((const PagetableEntry **) right))->blockno;
+
+ if (l < r)
+ return -1;
+ else if (l > r)
+ return 1;
+ return 0;
+}
#include "postgres.h"
+#include "access/xact.h"
#include "storage/shmem.h"
#include "utils/dynahash.h"
#include "utils/hsearch.h"
static int choose_nelem_alloc(Size entrysize);
static bool init_htab(HTAB *hashp, long nelem);
static void hash_corrupted(HTAB *hashp);
+static void register_seq_scan(HTAB *hashp);
+static void deregister_seq_scan(HTAB *hashp);
+static bool has_seq_scans(HTAB *hashp);
/*
errmsg("out of memory")));
}
+ hashp->frozen = false;
+
hdefault(hashp);
hctl = hashp->hctl;
if (currBucket != NULL)
return (void *) ELEMENTKEY(currBucket);
+ /* disallow inserts if frozen */
+ if (hashp->frozen)
+ elog(ERROR, "cannot insert into a frozen hashtable");
+
/* get the next free element */
currBucket = hctl->freeList;
if (currBucket == NULL)
/* caller is expected to fill the data field on return */
- /* Check if it is time to split the segment */
- if (++hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor)
+ /*
+ * Check if it is time to split a bucket. Can't split if table
+ * is the subject of any active hash_seq_search scans.
+ */
+ if (++hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+ !has_seq_scans(hashp))
{
/*
* NOTE: failure to expand table is not a fatal error, it just
}
/*
- * hash_seq_init/_search
+ * hash_seq_init/_search/_term
* Sequentially search through hash table and return
* all the elements one by one, return NULL when no more.
*
+ * hash_seq_term should be called if and only if the scan is abandoned before
+ * completion; if hash_seq_search returns NULL then it has already done the
+ * end-of-scan cleanup.
+ *
* NOTE: caller may delete the returned element before continuing the scan.
* However, deleting any other element while the scan is in progress is
* UNDEFINED (it might be the one that curIndex is pointing at!). Also,
* if elements are added to the table while the scan is in progress, it is
* unspecified whether they will be visited by the scan or not.
+ *
+ * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
+ * worry about hash_seq_term cleanup, if the hashtable is first locked against
+ * further insertions by calling hash_freeze. This is used by nodeAgg.c,
+ * wherein it is inconvenient to track whether a scan is still open, and
+ * there's no possibility of further insertions after readout has begun.
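+ *
+ * A correctly-written scan therefore looks roughly like this (sketch;
+ * "done_early" stands for whatever condition abandons the scan):
+ *
+ *		hash_seq_init(&status, htab);
+ *		while ((entry = hash_seq_search(&status)) != NULL)
+ *		{
+ *			if (done_early)
+ *			{
+ *				hash_seq_term(&status);
+ *				break;
+ *			}
+ *		}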
*/
void
hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
status->hashp = hashp;
status->curBucket = 0;
status->curEntry = NULL;
+ if (!hashp->frozen)
+ register_seq_scan(hashp);
}
void *
max_bucket = hctl->max_bucket;
if (curBucket > max_bucket)
+ {
+ hash_seq_term(status);
return NULL; /* search is done */
+ }
/*
* first find the right segment in the table directory.
if (++curBucket > max_bucket)
{
status->curBucket = curBucket;
+ hash_seq_term(status);
return NULL; /* search is done */
}
if (++segment_ndx >= ssize)
return (void *) ELEMENTKEY(curElem);
}
+void
+hash_seq_term(HASH_SEQ_STATUS *status)
+{
+ if (!status->hashp->frozen)
+ deregister_seq_scan(status->hashp);
+}
+
+/*
+ * hash_freeze
+ * Freeze a hashtable against future insertions (deletions are
+ * still allowed)
+ *
+ * The reason for doing this is that by preventing any more bucket splits,
+ * we no longer need to worry about registering hash_seq_search scans,
+ * and thus caller need not be careful about ensuring hash_seq_term gets
+ * called at the right times.
+ *
+ * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
+ * with active scans (since hash_seq_term would then do the wrong thing).
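+ *
+ * Sketch of the intended pattern for a caller that cannot conveniently
+ * bracket its scans:
+ *
+ *		hash_freeze(htab);
+ *		hash_seq_init(&status, htab);
+ *		... call hash_seq_search() at leisure, never hash_seq_term() ...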
+ */
+void
+hash_freeze(HTAB *hashp)
+{
+ if (hashp->isshared)
+ elog(ERROR, "cannot freeze shared hashtable");
+ if (!hashp->frozen && has_seq_scans(hashp))
+ elog(ERROR, "cannot freeze hashtable with active scans");
+ hashp->frozen = true;
+}
+
/********************************* UTILITIES ************************/
;
return i;
}
+
+
+/************************* SEQ SCAN TRACKING ************************/
+
+/*
+ * We track active hash_seq_search scans here. The need for this mechanism
+ * comes from the fact that a scan will get confused if a bucket split occurs
+ * while it's in progress: it might visit entries twice, or even miss some
+ * entirely (if it's partway through the same bucket that splits). Hence
+ * we want to inhibit bucket splits if there are any active scans on the
+ * table being inserted into. This is a fairly rare case in current usage,
+ * so just postponing the split until the next insertion seems sufficient.
+ *
+ * Given present usages of the function, only a few scans are likely to be
+ * open concurrently; so a finite-size stack of open scans seems sufficient,
+ * and we don't worry that linear search is too slow. Note that we do
+ * allow multiple scans of the same hashtable to be open concurrently.
+ *
+ * This mechanism can support concurrent scan and insertion in a shared
+ * hashtable if it's the same backend doing both. It would fail otherwise,
+ * but locking reasons seem to preclude any such scenario anyway, so we don't
+ * worry.
+ *
+ * This arrangement is reasonably robust if a transient hashtable is deleted
+ * without notifying us. The absolute worst case is we might inhibit splits
+ * in another table created later at exactly the same address. We will give
+ * a warning at transaction end for reference leaks, so any bugs leading to
+ * lack of notification should be easy to catch.
+ */
+
+#define MAX_SEQ_SCANS 100
+
+static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */
+static int seq_scan_level[MAX_SEQ_SCANS]; /* subtransaction nest level */
+static int num_seq_scans = 0;
+
+
+/* Register a table as having an active hash_seq_search scan */
+static void
+register_seq_scan(HTAB *hashp)
+{
+ if (num_seq_scans >= MAX_SEQ_SCANS)
+ elog(ERROR, "too many active hash_seq_search scans");
+ seq_scan_tables[num_seq_scans] = hashp;
+ seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
+ num_seq_scans++;
+}
+
+/* Deregister an active scan */
+static void
+deregister_seq_scan(HTAB *hashp)
+{
+ int i;
+
+ /* Search backward since it's most likely at the stack top */
+ for (i = num_seq_scans - 1; i >= 0; i--)
+ {
+ if (seq_scan_tables[i] == hashp)
+ {
+ seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
+ seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
+ num_seq_scans--;
+ return;
+ }
+ }
+ elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
+ hashp->tabname);
+}
+
+/* Check if a table has any active scan */
+static bool
+has_seq_scans(HTAB *hashp)
+{
+ int i;
+
+ for (i = 0; i < num_seq_scans; i++)
+ {
+ if (seq_scan_tables[i] == hashp)
+ return true;
+ }
+ return false;
+}
+
+/* Clean up any open scans at end of transaction */
+void
+AtEOXact_HashTables(bool isCommit)
+{
+ /*
+ * During abort cleanup, open scans are expected; just silently clean 'em
+ * out. An open scan at commit means someone forgot a hash_seq_term()
+ * call, so complain.
+ *
+ * Note: it's tempting to try to print the tabname here, but refrain for
+ * fear of touching deallocated memory. This isn't a user-facing message
+ * anyway, so it needn't be pretty.
+ */
+ if (isCommit)
+ {
+ int i;
+
+ for (i = 0; i < num_seq_scans; i++)
+ {
+ elog(WARNING, "leaked hash_seq_search scan for hash table %p",
+ seq_scan_tables[i]);
+ }
+ }
+ num_seq_scans = 0;
+}
+
+/* Clean up any open scans at end of subtransaction */
+void
+AtEOSubXact_HashTables(bool isCommit, int nestDepth)
+{
+ int i;
+
+ /*
+ * Search backward to make cleanup easy. Note we must check all entries,
+ * not only those at the end of the array, because deletion technique
+ * doesn't keep them in order.
+ */
+ for (i = num_seq_scans - 1; i >= 0; i--)
+ {
+ if (seq_scan_level[i] >= nestDepth)
+ {
+ if (isCommit)
+ elog(WARNING, "leaked hash_seq_search scan for hash table %p",
+ seq_scan_tables[i]);
+ seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
+ seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
+ num_seq_scans--;
+ }
+ }
+}