Fix dynahash.c to suppress hash bucket splits while a hash_seq_search() scan

author Tom Lane <[email protected]>

Thu, 26 Apr 2007 23:24:57 +0000 (23:24 +0000)

committer Tom Lane <[email protected]>

Thu, 26 Apr 2007 23:24:57 +0000 (23:24 +0000)
author Tom Lane <[email protected]>
Thu, 26 Apr 2007 23:24:57 +0000 (23:24 +0000)
committer Tom Lane <[email protected]>
Thu, 26 Apr 2007 23:24:57 +0000 (23:24 +0000)
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c

index 61442b5566912f78c98cae62fc8272da63746b3f..1e4b409fc71021f94f2da1ffc3d487647e3044cb 100644 (file)
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -1627,6 +1627,7 @@ CommitTransaction(void)
         AtEOXact_Namespace(true);
         /* smgrcommit already done */
         AtEOXact_Files();
+       AtEOXact_HashTables(true);
         pgstat_count_xact_commit();
  
         CurrentResourceOwner = NULL;
@@ -1842,6 +1843,7 @@ PrepareTransaction(void)
         AtEOXact_Namespace(true);
         /* smgrcommit already done */
         AtEOXact_Files();
+       AtEOXact_HashTables(true);
  
         CurrentResourceOwner = NULL;
         ResourceOwnerDelete(TopTransactionResourceOwner);
@@ -1993,6 +1995,7 @@ AbortTransaction(void)
         AtEOXact_Namespace(false);
         smgrabort();
         AtEOXact_Files();
+       AtEOXact_HashTables(false);
         pgstat_count_xact_rollback();
  
         /*
@@ -3704,6 +3707,7 @@ CommitSubTransaction(void)
                                                   s->parent->subTransactionId);
         AtEOSubXact_Files(true, s->subTransactionId,
                                           s->parent->subTransactionId);
+       AtEOSubXact_HashTables(true, s->nestingLevel);
  
         /*
          * We need to restore the upper transaction's read-only state, in case the
@@ -3815,6 +3819,7 @@ AbortSubTransaction(void)
                                                           s->parent->subTransactionId);
                 AtEOSubXact_Files(false, s->subTransactionId,
                                                   s->parent->subTransactionId);
+               AtEOSubXact_HashTables(false, s->nestingLevel);
         }
  
         /*
diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c

index 0a3d41349b7da758933b7e72c441216eac7789c2..90a494c1c7b788913ce0af2050c4d082bb8cad3b 100644 (file)
--- a/src/backend/commands/prepare.c
+++ b/src/backend/commands/prepare.c
@@ -21,7 +21,7 @@
  #include "catalog/pg_type.h"
  #include "commands/explain.h"
  #include "commands/prepare.h"
-#include "funcapi.h"
+#include "miscadmin.h"
  #include "rewrite/rewriteHandler.h"
  #include "tcop/pquery.h"
  #include "tcop/tcopprot.h"
@@ -674,91 +674,98 @@ ExplainExecuteQuery(ExplainStmt *stmt, ParamListInfo params,
  Datum
  pg_prepared_statement(PG_FUNCTION_ARGS)
  {
-       FuncCallContext *funcctx;
-       HASH_SEQ_STATUS *hash_seq;
-       PreparedStatement *prep_stmt;
-
-       /* stuff done only on the first call of the function */
-       if (SRF_IS_FIRSTCALL())
-       {
-               TupleDesc       tupdesc;
-               MemoryContext oldcontext;
-
-               /* create a function context for cross-call persistence */
-               funcctx = SRF_FIRSTCALL_INIT();
+       ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+       TupleDesc       tupdesc;
+       Tuplestorestate *tupstore;
+       MemoryContext per_query_ctx;
+       MemoryContext oldcontext;
+
+       /* check to see if caller supports us returning a tuplestore */
+       if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("set-valued function called in context that cannot accept a set")));
+       if (!(rsinfo->allowedModes & SFRM_Materialize))
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("materialize mode required, but it is not " \
+                                               "allowed in this context")));
  
-               /*
-                * switch to memory context appropriate for multiple function calls
-                */
-               oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+       /* need to build tuplestore in query context */
+       per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+       oldcontext = MemoryContextSwitchTo(per_query_ctx);
  
-               /* allocate memory for user context */
-               if (prepared_queries)
-               {
-                       hash_seq = (HASH_SEQ_STATUS *) palloc(sizeof(HASH_SEQ_STATUS));
-                       hash_seq_init(hash_seq, prepared_queries);
-                       funcctx->user_fctx = (void *) hash_seq;
-               }
-               else
-                       funcctx->user_fctx = NULL;
+       /*
+        * build tupdesc for result tuples. This must match the definition of
+        * the pg_prepared_statements view in system_views.sql
+        */
+       tupdesc = CreateTemplateTupleDesc(5, false);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name",
+                                          TEXTOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 2, "statement",
+                                          TEXTOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepare_time",
+                                          TIMESTAMPTZOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 4, "parameter_types",
+                                          REGTYPEARRAYOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 5, "from_sql",
+                                          BOOLOID, -1, 0);
  
-               /*
-                * build tupdesc for result tuples. This must match the definition of
-                * the pg_prepared_statements view in system_views.sql
-                */
-               tupdesc = CreateTemplateTupleDesc(5, false);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name",
-                                                  TEXTOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 2, "statement",
-                                                  TEXTOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepare_time",
-                                                  TIMESTAMPTZOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 4, "parameter_types",
-                                                  REGTYPEARRAYOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 5, "from_sql",
-                                                  BOOLOID, -1, 0);
-
-               funcctx->tuple_desc = BlessTupleDesc(tupdesc);
-               MemoryContextSwitchTo(oldcontext);
-       }
+       /*
+        * We put all the tuples into a tuplestore in one scan of the hashtable.
+        * This avoids any issue of the hashtable possibly changing between calls.
+        */
+       tupstore = tuplestore_begin_heap(true, false, work_mem);
  
-       /* stuff done on every call of the function */
-       funcctx = SRF_PERCALL_SETUP();
-       hash_seq = (HASH_SEQ_STATUS *) funcctx->user_fctx;
+       /* hash table might be uninitialized */
+       if (prepared_queries)
+       {
+               HASH_SEQ_STATUS hash_seq;
+               PreparedStatement *prep_stmt;
  
-       /* if the hash table is uninitialized, we're done */
-       if (hash_seq == NULL)
-               SRF_RETURN_DONE(funcctx);
+               hash_seq_init(&hash_seq, prepared_queries);
+               while ((prep_stmt = hash_seq_search(&hash_seq)) != NULL)
+               {
+                       HeapTuple       tuple;
+                       Datum           values[5];
+                       bool            nulls[5];
  
-       prep_stmt = hash_seq_search(hash_seq);
-       if (prep_stmt)
-       {
-               Datum           result;
-               HeapTuple       tuple;
-               Datum           values[5];
-               bool            nulls[5];
+                       /* generate junk in short-term context */
+                       MemoryContextSwitchTo(oldcontext);
  
-               MemSet(nulls, 0, sizeof(nulls));
+                       MemSet(nulls, 0, sizeof(nulls));
  
-               values[0] = DirectFunctionCall1(textin,
+                       values[0] = DirectFunctionCall1(textin,
                                                                           CStringGetDatum(prep_stmt->stmt_name));
  
-               if (prep_stmt->query_string == NULL)
-                       nulls[1] = true;
-               else
-                       values[1] = DirectFunctionCall1(textin,
+                       if (prep_stmt->query_string == NULL)
+                               nulls[1] = true;
+                       else
+                               values[1] = DirectFunctionCall1(textin,
                                                                    CStringGetDatum(prep_stmt->query_string));
  
-               values[2] = TimestampTzGetDatum(prep_stmt->prepare_time);
-               values[3] = build_regtype_array(prep_stmt->argtype_list);
-               values[4] = BoolGetDatum(prep_stmt->from_sql);
+                       values[2] = TimestampTzGetDatum(prep_stmt->prepare_time);
+                       values[3] = build_regtype_array(prep_stmt->argtype_list);
+                       values[4] = BoolGetDatum(prep_stmt->from_sql);
  
-               tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
-               result = HeapTupleGetDatum(tuple);
-               SRF_RETURN_NEXT(funcctx, result);
+                       tuple = heap_form_tuple(tupdesc, values, nulls);
+
+                       /* switch to appropriate context while storing the tuple */
+                       MemoryContextSwitchTo(per_query_ctx);
+                       tuplestore_puttuple(tupstore, tuple);
+               }
         }
  
-       SRF_RETURN_DONE(funcctx);
+       /* clean up and return the tuplestore */
+       tuplestore_donestoring(tupstore);
+
+       MemoryContextSwitchTo(oldcontext);
+
+       rsinfo->returnMode = SFRM_Materialize;
+       rsinfo->setResult = tupstore;
+       rsinfo->setDesc = tupdesc;
+
+       return (Datum) 0;
  }
  
  /*
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c

index c29aef9e035f78c641d1c525d061633c5dba03cd..d81f9c3a4a5f89f3d17b39edaee7a5f3f8582c89 100644 (file)
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -569,7 +569,7 @@ findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot)
         TupleHashIterator hashiter;
         TupleHashEntry entry;
  
-       ResetTupleHashIterator(hashtable, &hashiter);
+       InitTupleHashIterator(hashtable, &hashiter);
         while ((entry = ScanTupleHashTable(&hashiter)) != NULL)
         {
                 ExecStoreMinimalTuple(entry->firstTuple, hashtable->tableslot, false);
@@ -577,8 +577,12 @@ findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot)
                                                            numCols, keyColIdx,
                                                            hashtable->eqfunctions,
                                                            hashtable->tempcxt))
+               {
+                       TermTupleHashIterator(&hashiter);
                         return true;
+               }
         }
+       /* No TermTupleHashIterator call needed here */
         return false;
  }
  
diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c

index a3a02e2f3158e2622f97a41ea8538acc8088f248..6c3af9ad5515b23f9391111e5c3721466a23a8d1 100644 (file)
--- a/src/backend/nodes/tidbitmap.c
+++ b/src/backend/nodes/tidbitmap.c
@@ -907,7 +907,11 @@ tbm_lossify(TIDBitmap *tbm)
                 tbm_mark_page_lossy(tbm, page->blockno);
  
                 if (tbm->nentries <= tbm->maxentries)
-                       return;                         /* we have done enough */
+               {
+                       /* we have done enough */
+                       hash_seq_term(&status);
+                       break;
+               }
  
                 /*
                  * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c

index c78c552d86aaa3a25ce3b57263478c331790670b..176d462c4be581429d42634e30be0c4903fe0c3b 100644 (file)
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -919,6 +919,7 @@ mdsync(void)
                                                                         entry->tag.rnode.spcNode,
                                                                         entry->tag.rnode.dbNode,
                                                                         entry->tag.rnode.relNode)));
+                                       hash_seq_term(&hstat);
                                         return false;
                                 }
                                 else
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c

index ac5d82469c89fae883b509ccd36860e45cb09bbe..66a9ff370725b8b7825d3aa2040e5e1b68fad3ba 100644 (file)
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -63,6 +63,7 @@
  
  #include "postgres.h"
  
+#include "access/xact.h"
  #include "storage/shmem.h"
  #include "storage/spin.h"
  #include "utils/dynahash.h"
@@ -160,6 +161,9 @@ struct HTAB
         char       *tabname;            /* table name (for error messages) */
         bool            isshared;               /* true if table is in shared memory */
  
+       /* freezing a shared table isn't allowed, so we can keep state here */
+       bool            frozen;                 /* true = no more inserts allowed */
+
         /* We keep local copies of these fixed values to reduce contention */
         Size            keysize;                /* hash key length in bytes */
         long            ssize;                  /* segment size --- must be power of 2 */
@@ -195,6 +199,9 @@ static void hdefault(HTAB *hashp);
  static int     choose_nelem_alloc(Size entrysize);
  static bool init_htab(HTAB *hashp, long nelem);
  static void hash_corrupted(HTAB *hashp);
+static void register_seq_scan(HTAB *hashp);
+static void deregister_seq_scan(HTAB *hashp);
+static bool has_seq_scans(HTAB *hashp);
  
  
  /*
@@ -356,6 +363,8 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
                                          errmsg("out of memory")));
         }
  
+       hashp->frozen = false;
+
         hdefault(hashp);
  
         hctl = hashp->hctl;
@@ -898,6 +907,10 @@ hash_search_with_hash_value(HTAB *hashp,
                         if (currBucket != NULL)
                                 return (void *) ELEMENTKEY(currBucket);
  
+                       /* disallow inserts if frozen */
+                       if (hashp->frozen)
+                               elog(ERROR, "cannot insert into a frozen hashtable");
+
                         currBucket = get_hash_entry(hashp);
                         if (currBucket == NULL)
                         {
@@ -925,10 +938,15 @@ hash_search_with_hash_value(HTAB *hashp,
  
                         /* caller is expected to fill the data field on return */
  
-                       /* Check if it is time to split a bucket */
-                       /* Can't split if running in partitioned mode */
+                       /*
+                        * Check if it is time to split a bucket.  Can't split if running
+                        * in partitioned mode, nor if table is the subject of any active
+                        * hash_seq_search scans.  Strange order of these tests is to try
+                        * to check cheaper conditions first.
+                        */
                         if (!IS_PARTITIONED(hctl) &&
-                        hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor)
+                               hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+                               !has_seq_scans(hashp))
                         {
                                 /*
                                  * NOTE: failure to expand table is not a fatal error, it just
@@ -1001,18 +1019,30 @@ hash_get_num_entries(HTAB *hashp)
  }
  
  /*
- * hash_seq_init/_search
+ * hash_seq_init/_search/_term
   *                     Sequentially search through hash table and return
   *                     all the elements one by one, return NULL when no more.
   *
+ * hash_seq_term should be called if and only if the scan is abandoned before
+ * completion; if hash_seq_search returns NULL then it has already done the
+ * end-of-scan cleanup.
+ *
   * NOTE: caller may delete the returned element before continuing the scan.
   * However, deleting any other element while the scan is in progress is
   * UNDEFINED (it might be the one that curIndex is pointing at!).  Also,
   * if elements are added to the table while the scan is in progress, it is
   * unspecified whether they will be visited by the scan or not.
   *
+ * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
+ * worry about hash_seq_term cleanup, if the hashtable is first locked against
+ * further insertions by calling hash_freeze.  This is used by nodeAgg.c,
+ * wherein it is inconvenient to track whether a scan is still open, and
+ * there's no possibility of further insertions after readout has begun.
+ *
   * NOTE: to use this with a partitioned hashtable, caller had better hold
   * at least shared lock on all partitions of the table throughout the scan!
+ * We can cope with insertions or deletions by our own backend, but *not*
+ * with concurrent insertions or deletions by another.
   */
  void
  hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
@@ -1020,6 +1050,8 @@ hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
         status->hashp = hashp;
         status->curBucket = 0;
         status->curEntry = NULL;
+       if (!hashp->frozen)
+               register_seq_scan(hashp);
  }
  
  void *
@@ -1054,7 +1086,10 @@ hash_seq_search(HASH_SEQ_STATUS *status)
         max_bucket = hctl->max_bucket;
  
         if (curBucket > max_bucket)
+       {
+               hash_seq_term(status);
                 return NULL;                    /* search is done */
+       }
  
         /*
          * first find the right segment in the table directory.
@@ -1076,6 +1111,7 @@ hash_seq_search(HASH_SEQ_STATUS *status)
                 if (++curBucket > max_bucket)
                 {
                         status->curBucket = curBucket;
+                       hash_seq_term(status);
                         return NULL;            /* search is done */
                 }
                 if (++segment_ndx >= ssize)
@@ -1094,6 +1130,36 @@ hash_seq_search(HASH_SEQ_STATUS *status)
         return (void *) ELEMENTKEY(curElem);
  }
  
+void
+hash_seq_term(HASH_SEQ_STATUS *status)
+{
+       if (!status->hashp->frozen)
+               deregister_seq_scan(status->hashp);
+}
+
+/*
+ * hash_freeze
+ *                     Freeze a hashtable against future insertions (deletions are
+ *                     still allowed)
+ *
+ * The reason for doing this is that by preventing any more bucket splits,
+ * we no longer need to worry about registering hash_seq_search scans,
+ * and thus caller need not be careful about ensuring hash_seq_term gets
+ * called at the right times.
+ *
+ * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
+ * with active scans (since hash_seq_term would then do the wrong thing).
+ */
+void
+hash_freeze(HTAB *hashp)
+{
+       if (hashp->isshared)
+               elog(ERROR, "cannot freeze shared hashtable");
+       if (!hashp->frozen && has_seq_scans(hashp))
+               elog(ERROR, "cannot freeze hashtable with active scans");
+       hashp->frozen = true;
+}
+
  
  /********************************* UTILITIES ************************/
  
@@ -1324,3 +1390,136 @@ my_log2(long num)
                 ;
         return i;
  }
+
+
+/************************* SEQ SCAN TRACKING ************************/
+
+/*
+ * We track active hash_seq_search scans here.  The need for this mechanism
+ * comes from the fact that a scan will get confused if a bucket split occurs
+ * while it's in progress: it might visit entries twice, or even miss some
+ * entirely (if it's partway through the same bucket that splits).  Hence
+ * we want to inhibit bucket splits if there are any active scans on the
+ * table being inserted into.  This is a fairly rare case in current usage,
+ * so just postponing the split until the next insertion seems sufficient.
+ *
+ * Given present usages of the function, only a few scans are likely to be
+ * open concurrently; so a finite-size stack of open scans seems sufficient,
+ * and we don't worry that linear search is too slow.  Note that we do
+ * allow multiple scans of the same hashtable to be open concurrently.
+ *
+ * This mechanism can support concurrent scan and insertion in a shared
+ * hashtable if it's the same backend doing both.  It would fail otherwise,
+ * but locking reasons seem to preclude any such scenario anyway, so we don't
+ * worry.
+ *
+ * This arrangement is reasonably robust if a transient hashtable is deleted
+ * without notifying us.  The absolute worst case is we might inhibit splits
+ * in another table created later at exactly the same address.  We will give
+ * a warning at transaction end for reference leaks, so any bugs leading to
+ * lack of notification should be easy to catch.
+ */
+
+#define MAX_SEQ_SCANS 100
+
+static HTAB *seq_scan_tables[MAX_SEQ_SCANS];   /* tables being scanned */
+static int     seq_scan_level[MAX_SEQ_SCANS];          /* subtransaction nest level */
+static int     num_seq_scans = 0;
+
+
+/* Register a table as having an active hash_seq_search scan */
+static void
+register_seq_scan(HTAB *hashp)
+{
+       if (num_seq_scans >= MAX_SEQ_SCANS)
+               elog(ERROR, "too many active hash_seq_search scans");
+       seq_scan_tables[num_seq_scans] = hashp;
+       seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
+       num_seq_scans++;
+}
+
+/* Deregister an active scan */
+static void
+deregister_seq_scan(HTAB *hashp)
+{
+       int             i;
+
+       /* Search backward since it's most likely at the stack top */
+       for (i = num_seq_scans - 1; i >= 0; i--)
+       {
+               if (seq_scan_tables[i] == hashp)
+               {
+                       seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
+                       seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
+                       num_seq_scans--;
+                       return;
+               }
+       }
+       elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
+                hashp->tabname);
+}
+
+/* Check if a table has any active scan */
+static bool
+has_seq_scans(HTAB *hashp)
+{
+       int             i;
+
+       for (i = 0; i < num_seq_scans; i++)
+       {
+               if (seq_scan_tables[i] == hashp)
+                       return true;
+       }
+       return false;
+}
+
+/* Clean up any open scans at end of transaction */
+void
+AtEOXact_HashTables(bool isCommit)
+{
+       /*
+        * During abort cleanup, open scans are expected; just silently clean 'em
+        * out.  An open scan at commit means someone forgot a hash_seq_term()
+        * call, so complain.
+        *
+        * Note: it's tempting to try to print the tabname here, but refrain for
+        * fear of touching deallocated memory.  This isn't a user-facing message
+        * anyway, so it needn't be pretty.
+        */
+       if (isCommit)
+       {
+               int             i;
+
+               for (i = 0; i < num_seq_scans; i++)
+               {
+                       elog(WARNING, "leaked hash_seq_search scan for hash table %p",
+                                seq_scan_tables[i]);
+               }
+       }
+       num_seq_scans = 0;
+}
+
+/* Clean up any open scans at end of subtransaction */
+void
+AtEOSubXact_HashTables(bool isCommit, int nestDepth)
+{
+       int             i;
+
+       /*
+        * Search backward to make cleanup easy.  Note we must check all entries,
+        * not only those at the end of the array, because deletion technique
+        * doesn't keep them in order.
+        */
+       for (i = num_seq_scans - 1; i >= 0; i--)
+       {
+               if (seq_scan_level[i] >= nestDepth)
+               {
+                       if (isCommit)
+                               elog(WARNING, "leaked hash_seq_search scan for hash table %p",
+                                        seq_scan_tables[i]);
+                       seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
+                       seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
+                       num_seq_scans--;
+               }
+       }
+}
diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c

index 8392185e6a8f5cba28f46dfe9a7e110e26c48fe6..2c03c62e0827169af99f6287fbad4b86e284562b 100644 (file)
--- a/src/backend/utils/mmgr/portalmem.c
+++ b/src/backend/utils/mmgr/portalmem.c
@@ -22,7 +22,6 @@
  #include "access/xact.h"
  #include "catalog/pg_type.h"
  #include "commands/portalcmds.h"
-#include "funcapi.h"
  #include "miscadmin.h"
  #include "utils/builtins.h"
  #include "utils/memutils.h"
@@ -577,7 +576,9 @@ AtCommit_Portals(void)
                 /* Zap all non-holdable portals */
                 PortalDrop(portal, true);
  
-               /* Restart the iteration */
+               /* Restart the iteration in case that led to other drops */
+               /* XXX is this really necessary? */
+               hash_seq_term(&status);
                 hash_seq_init(&status, PortalHashTable);
         }
  }
@@ -806,79 +807,68 @@ AtSubCleanup_Portals(SubTransactionId mySubid)
  Datum
  pg_cursor(PG_FUNCTION_ARGS)
  {
-       FuncCallContext *funcctx;
-       HASH_SEQ_STATUS *hash_seq;
+       ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+       TupleDesc       tupdesc;
+       Tuplestorestate *tupstore;
+       MemoryContext per_query_ctx;
+       MemoryContext oldcontext;
+       HASH_SEQ_STATUS hash_seq;
         PortalHashEnt *hentry;
  
-       /* stuff done only on the first call of the function */
-       if (SRF_IS_FIRSTCALL())
-       {
-               MemoryContext oldcontext;
-               TupleDesc       tupdesc;
-
-               /* create a function context for cross-call persistence */
-               funcctx = SRF_FIRSTCALL_INIT();
-
-               /*
-                * switch to memory context appropriate for multiple function calls
-                */
-               oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
-
-               if (PortalHashTable)
-               {
-                       hash_seq = (HASH_SEQ_STATUS *) palloc(sizeof(HASH_SEQ_STATUS));
-                       hash_seq_init(hash_seq, PortalHashTable);
-                       funcctx->user_fctx = (void *) hash_seq;
-               }
-               else
-                       funcctx->user_fctx = NULL;
-
-               /*
-                * build tupdesc for result tuples. This must match the definition of
-                * the pg_cursors view in system_views.sql
-                */
-               tupdesc = CreateTemplateTupleDesc(6, false);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name",
-                                                  TEXTOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 2, "statement",
-                                                  TEXTOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 3, "is_holdable",
-                                                  BOOLOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 4, "is_binary",
-                                                  BOOLOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 5, "is_scrollable",
-                                                  BOOLOID, -1, 0);
-               TupleDescInitEntry(tupdesc, (AttrNumber) 6, "creation_time",
-                                                  TIMESTAMPTZOID, -1, 0);
-
-               funcctx->tuple_desc = BlessTupleDesc(tupdesc);
-               MemoryContextSwitchTo(oldcontext);
-       }
-
-       /* stuff done on every call of the function */
-       funcctx = SRF_PERCALL_SETUP();
-       hash_seq = (HASH_SEQ_STATUS *) funcctx->user_fctx;
+       /* check to see if caller supports us returning a tuplestore */
+       if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("set-valued function called in context that cannot accept a set")));
+       if (!(rsinfo->allowedModes & SFRM_Materialize))
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("materialize mode required, but it is not " \
+                                               "allowed in this context")));
+
+       /* need to build tuplestore in query context */
+       per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+       oldcontext = MemoryContextSwitchTo(per_query_ctx);
  
-       /* if the hash table is uninitialized, we're done */
-       if (hash_seq == NULL)
-               SRF_RETURN_DONE(funcctx);
+       /*
+        * build tupdesc for result tuples. This must match the definition of
+        * the pg_cursors view in system_views.sql
+        */
+       tupdesc = CreateTemplateTupleDesc(6, false);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name",
+                                          TEXTOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 2, "statement",
+                                          TEXTOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 3, "is_holdable",
+                                          BOOLOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 4, "is_binary",
+                                          BOOLOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 5, "is_scrollable",
+                                          BOOLOID, -1, 0);
+       TupleDescInitEntry(tupdesc, (AttrNumber) 6, "creation_time",
+                                          TIMESTAMPTZOID, -1, 0);
  
-       /* loop until we find a visible portal or hit the end of the list */
-       while ((hentry = hash_seq_search(hash_seq)) != NULL)
-       {
-               if (hentry->portal->visible)
-                       break;
-       }
+       /*
+        * We put all the tuples into a tuplestore in one scan of the hashtable.
+        * This avoids any issue of the hashtable possibly changing between calls.
+        */
+       tupstore = tuplestore_begin_heap(true, false, work_mem);
  
-       if (hentry)
+       hash_seq_init(&hash_seq, PortalHashTable);
+       while ((hentry = hash_seq_search(&hash_seq)) != NULL)
         {
-               Portal          portal;
-               Datum           result;
+               Portal          portal = hentry->portal;
                 HeapTuple       tuple;
                 Datum           values[6];
                 bool            nulls[6];
  
-               portal = hentry->portal;
+               /* report only "visible" entries */
+               if (!portal->visible)
+                       continue;
+
+               /* generate junk in short-term context */
+               MemoryContextSwitchTo(oldcontext);
+
                 MemSet(nulls, 0, sizeof(nulls));
  
                 values[0] = DirectFunctionCall1(textin, CStringGetDatum(portal->name));
@@ -892,10 +882,21 @@ pg_cursor(PG_FUNCTION_ARGS)
                 values[4] = BoolGetDatum(portal->cursorOptions & CURSOR_OPT_SCROLL);
                 values[5] = TimestampTzGetDatum(portal->creation_time);
  
-               tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
-               result = HeapTupleGetDatum(tuple);
-               SRF_RETURN_NEXT(funcctx, result);
+               tuple = heap_form_tuple(tupdesc, values, nulls);
+
+               /* switch to appropriate context while storing the tuple */
+               MemoryContextSwitchTo(per_query_ctx);
+               tuplestore_puttuple(tupstore, tuple);
         }
  
-       SRF_RETURN_DONE(funcctx);
+       /* clean up and return the tuplestore */
+       tuplestore_donestoring(tupstore);
+
+       MemoryContextSwitchTo(oldcontext);
+
+       rsinfo->returnMode = SFRM_Materialize;
+       rsinfo->setResult = tupstore;
+       rsinfo->setDesc = tupdesc;
+
+       return (Datum) 0;
  }
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h

index a7123f9f711fbc149b47f6f9098ee72da83f60aa..68814f461ed97731ad9846deb863edef208165b5 100644 (file)
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -390,8 +390,20 @@ typedef struct TupleHashTableData
  
  typedef HASH_SEQ_STATUS TupleHashIterator;
  
-#define ResetTupleHashIterator(htable, iter) \
+/*
+ * Use InitTupleHashIterator/TermTupleHashIterator for a read/write scan.
+ * Use ResetTupleHashIterator if the table can be frozen (in this case no
+ * explicit scan termination is needed).
+ */
+#define InitTupleHashIterator(htable, iter) \
         hash_seq_init(iter, (htable)->hashtab)
+#define TermTupleHashIterator(iter) \
+       hash_seq_term(iter)
+#define ResetTupleHashIterator(htable, iter) \
+       do { \
+               hash_freeze((htable)->hashtab); \
+               hash_seq_init(iter, (htable)->hashtab); \
+       } while (0)
  #define ScanTupleHashTable(iter) \
         ((TupleHashEntry) hash_seq_search(iter))
  
diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h

index 8539776e52646b21820e4401875e23d2f170eba0..74ea23f5e6afc2fa1e3564ccbe6b9f2cb3a198a0 100644 (file)
--- a/src/include/utils/hsearch.h
+++ b/src/include/utils/hsearch.h
@@ -130,9 +130,13 @@ extern void *hash_search_with_hash_value(HTAB *hashp, const void *keyPtr,
  extern long hash_get_num_entries(HTAB *hashp);
  extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp);
  extern void *hash_seq_search(HASH_SEQ_STATUS *status);
+extern void hash_seq_term(HASH_SEQ_STATUS *status);
+extern void hash_freeze(HTAB *hashp);
  extern Size hash_estimate_size(long num_entries, Size entrysize);
  extern long hash_select_dirsize(long num_entries);
  extern Size hash_get_shared_size(HASHCTL *info, int flags);
+extern void AtEOXact_HashTables(bool isCommit);
+extern void AtEOSubXact_HashTables(bool isCommit, int nestDepth);
  
  /*
   * prototypes for functions in hashfn.c
author	Tom Lane <[email protected]>
	Thu, 26 Apr 2007 23:24:57 +0000 (23:24 +0000)
committer	Tom Lane <[email protected]>
	Thu, 26 Apr 2007 23:24:57 +0000 (23:24 +0000)
src/backend/access/transam/xact.c		patch \| blob \| blame \| history
src/backend/commands/prepare.c		patch \| blob \| blame \| history
src/backend/executor/nodeSubplan.c		patch \| blob \| blame \| history
src/backend/nodes/tidbitmap.c		patch \| blob \| blame \| history
src/backend/storage/smgr/md.c		patch \| blob \| blame \| history
src/backend/utils/hash/dynahash.c		patch \| blob \| blame \| history
src/backend/utils/mmgr/portalmem.c		patch \| blob \| blame \| history
src/include/nodes/execnodes.h		patch \| blob \| blame \| history
src/include/utils/hsearch.h		patch \| blob \| blame \| history