From: Robert Haas
Date: Fri, 13 Jul 2012 05:38:37 +0000 (-0400)
Subject: More work on chash.  Initialization theoretically works now.
X-Git-Url: http://git.postgresql.org/gitweb/static/developers.postgresql.org?a=commitdiff_plain;h=ffbb0b80a2ae4b2fbbd1f80b72d9d627801a7a9f;p=users%2Frhaas%2Fpostgres.git

More work on chash.  Initialization theoretically works now.
---

diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 250e31255e..5f94f57d88 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -423,6 +423,29 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
 	return structPtr;
 }
 
+/*
+ * ShmemAttachStruct -- Attach to an existing structure in shared memory.
+ */
+void *
+ShmemAttachStruct(const char *name)
+{
+	ShmemIndexEnt *result;
+	void	   *ptr;
+	bool		found;
+
+	LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+	result = (ShmemIndexEnt *)
+		hash_search(ShmemIndex, name, HASH_FIND, &found);
+	if (!found || result == NULL)
+		elog(ERROR, "shared memory structure %s not found", name);
+	ptr = result->location;
+	Assert(ptr != NULL);
+
+	LWLockRelease(ShmemIndexLock);
+
+	return ptr;
+}
 
 /*
  * Add two Size values, checking for overflow
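For illustration, here is how the new function might divide the work with the existing ShmemInitStruct(): a sketch, not part of this patch, in which "my_struct" and MyStructData are hypothetical names. ShmemInitStruct() both creates and attaches; the new ShmemAttachStruct() makes the attach-only case explicit and fails hard if the structure is missing.

    /* Editorial sketch; "my_struct" and MyStructData are hypothetical. */
    static MyStructData *MyStruct;

    void
    MyStructShmemInit(void)
    {
        if (!IsUnderPostmaster)
        {
            bool    found;

            /* Postmaster (or single-user mode): create and zero the struct. */
            MyStruct = (MyStructData *)
                ShmemInitStruct("my_struct", sizeof(MyStructData), &found);
            if (!found)
                memset(MyStruct, 0, sizeof(MyStructData));
        }
        else
        {
            /* EXEC_BACKEND child: it must already exist, so just attach. */
            MyStruct = (MyStructData *) ShmemAttachStruct("my_struct");
        }
    }
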
diff --git a/src/backend/utils/hash/chash.c b/src/backend/utils/hash/chash.c
index c5d04e1b12..6f1effd839 100644
--- a/src/backend/utils/hash/chash.c
+++ b/src/backend/utils/hash/chash.c
@@ -3,12 +3,71 @@
  * chash.c
  *	  concurrent hash tables
  *
- * The goal of this module is to implement a hash table that can be
- * searched without any locking at all and updated with minimal locking.
- * While a fully lock-free (or, better still, wait-free) hash table seems
- * very desirable, currently known techniques require memory management
- * techniques that are either very complex or difficult to implement in
- * the context of a fixed-size shared memory segment.
+ * A concurrent hash table stores a collection of fixed-size objects.
+ * From the point of view of this module, such objects are merely an
+ * opaque array of bytes, but the caller will typically implement them as
+ * a C "struct".  Some fixed-size, leading portion of each object is
+ * designated as the key, which must be distinct for all objects in the
+ * collection.  Since PostgreSQL's shared memory model does not permit
+ * dynamic shared-memory allocation, we preallocate shared-memory space
+ * for the maximum number of entities which can be stored (plus a few
+ * extra, for reasons that will be further explained below).  This space
+ * is allocated as a single large array called the arena, and we often
+ * refer to entities by their position in the arena rather than via an
+ * ordinary pointer.  This saves a considerable amount of memory, since
+ * most modern architectures are 64-bit and therefore use 8-byte pointers,
+ * while arena offsets can be stored in a 32-bit word.  In fact, we
+ * reserve one bit in each such word as a mark bit, so the maximum size
+ * of the arena is 2^31 elements, a restriction that does not currently
+ * appear to be problematic.
+ *
+ * When an element is inserted, we copy the data from the backend-private
+ * object supplied by the caller into one of these shared-memory entities.
+ * When the hash table is searched, the caller passes a backend-private
+ * entity with just the key filled in; if a matching element is found,
+ * data is copied from the shared-memory entity into the non-key portion
+ * of the user-supplied entity.  In this way, clients of this module
+ * never use pointers into shared memory directly.
+ *
+ * As usual, we structure the hash table as an array of buckets, whose
+ * size is always a power of two, so that the low-order bits of the
+ * hash code can be used to select a bucket.  If multiple entities hash
+ * to the same bucket, we use separate chaining: each entity in the
+ * arena has an 8-byte header that stores the 4-byte arena offset of the
+ * next item in the bucket and the hash value of the entity's key.
+ * Bucket chains are maintained in order by ascending hash value and
+ * then by ascending entity key (as per memcmp), so that there is
+ * precisely one legal location at which a given new item can be inserted
+ * into a bucket.
+ *
+ * For good concurrency, it seems essential to avoid locking buckets
+ * while they are being scanned.  Taking even a shared LWLock or similar
+ * still means acquiring and releasing a spinlock, which is both
+ * inefficient in terms of raw cycles and a potential contention point.
+ * Thus, we decree that readers must be able to scan bucket chains without
+ * executing any atomic operations either before, during, or after the
+ * scan.  Writers necessarily require some locking; for now, each bucket
+ * has a separate spinlock which must be taken to modify that bucket chain,
+ * but not when reading it.  In the future, we might further adapt this
+ * code to instead use compare-and-swap where available.
+ *
+ * Even after an entity has been deleted from a bucket chain, it is still
+ * possible that some other backend holds a pointer to it from a bucket
+ * chain traversal which began before the deletion was carried out.
+ * Thus, we cannot recycle the block of memory used by an entity for a
+ * new and unrelated entity until we can guarantee that no private
+ * references to it remain.  Instead, we add the entity to one of several
+ * "garbage lists" of items removed from bucket chains that are not yet
+ * known to be recyclable.  Periodically, we move items from garbage lists
+ * to free lists from which they can be reallocated.  This is accomplished
+ * by having each backend which wishes to scan a bucket store the hash
+ * table id and bucket identifier in a per-backend slot in shared memory
+ * before it begins scanning the bucket, and clear the value only after it
+ * finishes scanning the bucket, so that it is possible for another
+ * backend to wait (by spinning) for all backends in the process of
+ * scanning a bucket to finish doing so.  To make sure we don't need to
+ * garbage-collect too often, we allocate a slightly larger arena than
+ * the caller's stated maximum size.
  *
  * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
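The scan-publication protocol above is described only in prose, and this patch does not implement it yet. A sketch of the idea, with entirely hypothetical names (CHashScanSlot, ScanSlots, MyScanSlot), written as if inside chash.c where CHashTableData is visible; pg_memory_barrier() is used loosely to suggest the required ordering:

    /* Editorial sketch; none of these names exist in the patch. */
    typedef struct
    {
        volatile uint32 table_id;   /* 0 means "not scanning" */
        volatile uint32 bucket;
    } CHashScanSlot;

    static CHashScanSlot *ScanSlots;    /* one per backend, in shared memory */
    static CHashScanSlot *MyScanSlot;   /* this backend's slot */

    /* Reader: advertise the bucket, scan without locks, clear the slot. */
    static void
    CHashScanBucket(CHashTable table, uint32 bucket_number)
    {
        MyScanSlot->table_id = table->desc.id;
        MyScanSlot->bucket = bucket_number;
        pg_memory_barrier();        /* publish before following the chain */
        /* ... walk the bucket chain here ... */
        MyScanSlot->table_id = 0;   /* garbage collector may now proceed */
    }

    /* Garbage collector: wait out any in-progress scan of a given bucket. */
    static void
    CHashWaitForScans(CHashTable table, uint32 bucket_number, int nbackends)
    {
        int     i;

        for (i = 0; i < nbackends; ++i)
            while (ScanSlots[i].table_id == table->desc.id &&
                   ScanSlots[i].bucket == bucket_number)
                ;                   /* spin */
    }
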
@@ -21,21 +80,15 @@
 
 #include "postgres.h"
 
+#include "miscadmin.h"
 #include "storage/shmem.h"
 #include "storage/spin.h"
 #include "utils/chash.h"
+#include "utils/memutils.h"
 
 /*
- * The memory needed to store the entries in a hash table is preallocated in
- * a single chunk called the arena.  We refer to entries using a CHashPtr
- * rather than an ordinary pointer.  One bit of each CHashPtr is reserved for
- * use as a "mark" bit, which is used to implement concurrent deletion.
- * The remaining bits form an offset into the arena.  By storing offset
- * rather than pointers, we can reduce the memory footprint of the hash table
- * considerably, at the cost of limiting the maximum number of elements in a
- * single concurrent hash table to 2^31.  That limitation appears acceptable
- * for now, and we can always switch to pointers or 64-bit integers here in
- * the future, if needed.
+ * CHashPtr represents an offset into the arena, plus a mark bit that is
+ * used to implement concurrent deletion.
  */
 typedef uint32 CHashPtr;
 #define InvalidCHashPtr ((uint32) -1)
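To make the encoding concrete, a small editorial sketch (not from the patch). It relies on the macros and fields shown in the next two hunks, and assumes CHashPtrGetOffset(x) is ((x) >> 1), the inverse of MakeCHashPtr:

    /* Editorial sketch, as if inside chash.c; function name is hypothetical. */
    static void *
    CHashOffsetExample(CHashTable table)
    {
        CHashPtr    p = MakeCHashPtr(42);   /* offset 42, mark bit clear */
        CHashPtr    m = CHashPtrMark(p);    /* same offset, mark bit set */
        CHashNode  *n;

        Assert(CHashPtrUnmark(m) == p);
        Assert(CHashPtrGetOffset(m) == 42); /* assumes GetOffset is (x) >> 1 */

        /*
         * An offset is converted to a node address using the table's stride,
         * not sizeof(CHashNode), because each node is followed by its item.
         */
        n = (CHashNode *)
            ((char *) table->arena + table->arena_stride * CHashPtrGetOffset(p));
        return CHashNodeGetItem(n);
    }
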
@@ -45,47 +98,31 @@ typedef uint32 CHashPtr;
 #define CHashPtrMark(x) ((x) | 1)
 #define CHashPtrUnmark(x) ((x) & ~1)
 #define MakeCHashPtr(x) ((x) << 1)
-
-static uint32 CHashMaxCapacity = CHashPtrGetOffset(InvalidCHashPtr);
+#define CHashMaxCapacity CHashPtrGetOffset(InvalidCHashPtr)
 
 /*
- * Each hash bucket is implemented as a pointer to the first item in the
- * bucket, or InvalidCHashPtr if the bucket is empty.  Each item contains a
- * pointer to the next item in the bucket, or InvalidCHashPtr if there are no
- * more items.
- *
- * Each bucket also has a spinlock which is used to serialize modifications
- * to the bucket, but need not be taken when searching it.
+ * CHashBucket represents a single hash bucket, garbage list, or free list.
  */
 typedef struct
 {
-	CHashPtr	head;			/* arena offset of first element in bucket */
-	slock_t		mutex;			/* mutual exclusion for modifications */
+	CHashPtr	head;			/* arena offset of bucket head */
+	slock_t		mutex;			/* mutual exclusion for changes */
 } CHashBucket;
 
 /*
- * Each free list is implemented as a pointer to the first item on the
- * free list, or InvalidCHashPtr if the free list is empty.  Each free list
- * is protected by a spinlock.
- */
-typedef struct
-{
-	CHashPtr	head;			/* arena offset of first element in bucket */
-	slock_t		mutex;			/* mutual exclusion for modifications */
-} CHashFreeList;
-
-/*
- * Each item stored in the hash table is represented by a CHashNode, which
+ * Each object stored in the hash table is represented by a CHashNode, which
  * stores a pointer to the next item in the same bucket, and the exact hash
  * value of the current item.  Each CHashNode is followed by space for the
  * item itself.
  */
 typedef struct
 {
-	CHashPtr	next;			/* arena offset of next element in bucket */
-	uint32		hash_value;		/* hash(key) */
+	CHashPtr	next;			/* arena offset of next element */
+	uint32		hash_value;		/* hash(key) */
 } CHashNode;
-#define CHashNodeGetItem(x) ((void *) (((char *) x) + sizeof(CHashNode)))
+
+#define SizeOfCHashNode MAXALIGN(sizeof(CHashNode))
+#define CHashNodeGetItem(x) ((void *) (((char *) x) + SizeOfCHashNode))
 
 /*
  * CHashTableData stores all the information that we need in order to access
@@ -94,29 +131,41 @@ typedef struct
  * table.  None of this information changes after the initial setup of the
 * hash table.
 */
-typedef struct
+typedef struct CHashTableData
 {
 	CHashDescriptor desc;		/* descriptor for this hash table */
 	uint32		nbuckets;		/* # of buckets; must be a power of two */
 	uint32		bucket_mask;	/* # of buckets, minus one */
-	uint32		nfreelists;		/* # of freelists, also a power of two */
-	void	   *arena;			/* arena */
+	uint32		garbage_shift;	/* log2(nbuckets/ngarbage) */
+	uint32		ngarbage;		/* # of garbage lists, a power of two */
+	uint32		nfreelists;		/* # of freelists */
+	uint32		arena_limit;	/* # of arena elements */
+	uint32		arena_stride;	/* bytes allocated per arena element */
 	CHashBucket *bucket;		/* array of size nbuckets */
-	CHashFreeList *freelist;	/* array of size nfreelists */
+	CHashBucket *garbage;		/* array of size ngarbage */
+	CHashBucket *freelist;		/* array of size nfreelists */
+	void	   *arena;			/* arena */
 } CHashTableData;
 
 /*
- * Compute the number of buckets and the number of freelists for a hash table
- * with a given capacity.
+ * First stage of CHashTable initialization.  We fill in all the constants
+ * here, but not the pointers.
 */
-static void
-CHashSizingParameters(uint32 capacity, uint32 *nbuckets, uint32 *nfreelists)
+CHashTable
+CHashBootstrap(CHashDescriptor *desc)
 {
-	uint32		bucket_shift;
-	uint32		freelist_shift;
+	CHashTable	table;
+	uint32		bucket_shift;
 
-	if (capacity < 1 || capacity > CHashMaxCapacity)
+	/* Allocate table and copy descriptor. */
+	table = MemoryContextAlloc(TopMemoryContext, sizeof(CHashTableData));
+	memcpy(&table->desc, desc, sizeof(CHashDescriptor));
+
+	/* Sanity checks. */
+	if (desc->capacity < 1 || desc->capacity > CHashMaxCapacity)
 		elog(ERROR, "invalid capacity for concurrent hash");
+	if (desc->key_size < 1 || desc->key_size > desc->element_size)
+		elog(ERROR, "invalid key size for concurrent hash");
 
 	/*
 	 * The number of buckets must be a power of two.  To avoid (as much as
@@ -124,28 +173,116 @@ CHashSizingParameters(uint32 capacity, uint32 *nbuckets, uint32 *nfreelists)
 	 * factor <= 1.0, so this is a pretty simple calculation: we just find the
 	 * smallest power of two greater than or equal to the target capacity.
 	 */
-	bucket_shift = fls(capacity) - 1;
-	*nbuckets = 1 << bucket_shift;
+	bucket_shift = fls(desc->capacity - 1);
+	table->nbuckets = 1 << bucket_shift;
+	table->bucket_mask = table->nbuckets - 1;
 
 	/*
-	 * The number of freelists must also be a power of two, and must be no
-	 * larger than the number of buckets.
+	 * It's not exactly clear how to determine the optimal number of garbage
+	 * lists.  If there are too few, then garbage collection will have to wait
+	 * behind concurrent scans excessively frequently.  But if there are too
+	 * many, then garbage collection won't recover very many items.
 	 */
-	freelist_shift = bucket_shift / 2;
-	*nfreelists = 1 << freelist_shift;
+	table->garbage_shift = Min(bucket_shift, 6);
+	table->ngarbage = table->nbuckets >> table->garbage_shift;
+
+	/*
+	 * The number of freelists must be large enough to avoid contention;
+	 * having extras is fairly harmless.  But there seems to be no point in
+	 * having more free lists than garbage lists; if the garbage lists aren't
+	 * causing contention, an equal number of free lists shouldn't either.
+	 */
+	table->nfreelists = Min(table->ngarbage, 16);
+
+	/*
+	 * To make garbage collection efficient, we overallocate.  Normally, we
+	 * overallocate by one-eighth, but if that would be less than 15 elements,
+	 * then we allocate 15 elements instead.  This extra capacity can actually
+	 * be used, but for best performance, it shouldn't be.  It's the caller's
+	 * responsibility to avoid this where relevant.
+	 */
+	table->arena_limit = desc->capacity;
+	if (desc->capacity < 120)
+		table->arena_limit += 15;
+	else
+		table->arena_limit += table->arena_limit / 8;
+
+	/* Each arena element must be MAXALIGN'd and include per-node space. */
+	table->arena_stride = SizeOfCHashNode + MAXALIGN(desc->element_size);
+
+	return table;
 }
 
+/*
+ * Estimate shared memory requirements.
+ */
 Size
-CHashEstimateSize(CHashDescriptor *desc)
+CHashEstimateSize(CHashTable table)
 {
-	uint32		nbuckets,
-				nfreelists;
-	Size		size;
+	Size		size;
+	Size		total_buckets;
 
-	CHashSizingParameters(desc->capacity, &nbuckets, &nfreelists);
+	total_buckets = add_size(table->nbuckets, table->ngarbage);
+	total_buckets = add_size(total_buckets, table->nfreelists);
 
 	size = MAXALIGN(sizeof(CHashTableData));
-	size = add_size(size, mul_size(MAXALIGN(sizeof(CHashBucket)), nbuckets));
+	size = add_size(size, mul_size(sizeof(CHashBucket), total_buckets));
+	size = add_size(size, mul_size(table->arena_stride, table->arena_limit));
 
 	return size;
 }
+
+/*
+ * Create a concurrent hash table in shared memory, or attach to an existing
+ * table.
+ */
+CHashTable
+CHashInitialize(CHashTable table, CHashDescriptor *desc)
+{
+	Size		size;
+	bool		found;
+	void	   *shmem;
+
+	/*
+	 * If we're under the postmaster, this must be the EXEC_BACKEND case where
+	 * we need to attach to an existing shared-memory segment.
+	 */
+	if (IsUnderPostmaster)
+	{
+		Assert(table == NULL);
+		table = MemoryContextAlloc(TopMemoryContext, sizeof(CHashTableData));
+		shmem = ShmemAttachStruct(desc->shmem_name);
+		memcpy(table, shmem, sizeof(CHashTableData));
+		return table;
+	}
+
+	/*
+	 * Otherwise, the hash table should not already exist, and we must
+	 * create it.  But the table should already be bootstrapped, since we
+	 * must previously have computed its size when figuring out our shared
+	 * memory allocation.
+	 */
+	Assert(table != NULL);
+	size = CHashEstimateSize(table);
+	shmem = ShmemInitStruct(table->desc.shmem_name, size, &found);
+	Assert(!found);
+
+	/* Bucket, garbage, and freelist arrays follow the table info. */
+	table->bucket = (CHashBucket *)
+		(((char *) shmem) + MAXALIGN(sizeof(CHashTableData)));
+	table->garbage = &table->bucket[table->nbuckets];
+	table->freelist = &table->garbage[table->ngarbage];
+
+	/* Arena follows the various lists. */
+	table->arena = (void *) (&table->freelist[table->nfreelists]);
+
+	/*
+	 * Copy table (with pointers now filled in) to shared memory.  This is
+	 * arguably unnecessary when not using EXEC_BACKEND, but we do it anyway.
+	 */
+	memcpy(shmem, table, sizeof(CHashTableData));
+
+	return table;
+}
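For concreteness, here is the arithmetic the bootstrap and estimate steps perform for one plausible descriptor. This is an editorial example: it assumes a 64-bit build with MAXALIGN of 8, sizeof(CHashNode) of 8, and sizeof(CHashBucket) padded to 8.

    capacity = 1000, element_size = 24, key_size = 8:

    bucket_shift  = fls(1000 - 1) = 10  ->  nbuckets = 1024, bucket_mask = 1023
    garbage_shift = Min(10, 6) = 6      ->  ngarbage = 1024 >> 6 = 16
    nfreelists    = Min(16, 16) = 16
    arena_limit   = 1000 + 1000/8 = 1125
    arena_stride  = MAXALIGN(8) + MAXALIGN(24) = 32 bytes

    CHashEstimateSize = MAXALIGN(sizeof(CHashTableData))
                        + (1024 + 16 + 16) * 8      /* bucket/garbage/free */
                        + 1125 * 32                 /* arena */
                        which comes to roughly 44 kB.
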
diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h
index c94d620097..855b65ef43 100644
--- a/src/include/storage/shmem.h
+++ b/src/include/storage/shmem.h
@@ -40,6 +40,7 @@ extern void InitShmemIndex(void);
 extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size,
 			  HASHCTL *infoP, int hash_flags);
 extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr);
+extern void *ShmemAttachStruct(const char *name);
 extern Size add_size(Size s1, Size s2);
 extern Size mul_size(Size s1, Size s2);
 
diff --git a/src/include/utils/chash.h b/src/include/utils/chash.h
index c0adfd9a5f..e17a45aca4 100644
--- a/src/include/utils/chash.h
+++ b/src/include/utils/chash.h
@@ -13,14 +13,10 @@
 #ifndef CHASH_H
 #define CHASH_H
 
-/*
- * A concurrent hash table stores a bounded number of fixed-size elements,
- * each of which begins with a fixed-size key.  This structure provides just
- * enough information about a proposed concurrent hash table to estimate its
- * size, or create it.
- */
+/* Everything the caller must supply to set up a concurrent hash table. */
 typedef struct
 {
+	const char *shmem_name;		/* shared memory name for this hash table */
 	uint32		id;				/* unique identifier for this hash table */
 	uint32		capacity;		/* maximum size of hash table */
 	uint16		element_size;	/* size of each element */
@@ -32,9 +28,9 @@ struct CHashTableData;
 typedef struct CHashTableData *CHashTable;
 
 /* Initialization functions. */
-extern Size CHashEstimateSize(CHashDescriptor *desc);
-extern CHashTable CHashInitialize(CHashDescriptor *desc);
-extern CHashTable CHashAttach(CHashDescriptor *desc);
+extern CHashTable CHashBootstrap(CHashDescriptor *desc);
+extern Size CHashEstimateSize(CHashTable table);
+extern CHashTable CHashInitialize(CHashTable table, CHashDescriptor *desc);
 
 /* Accessor functions. */
 extern bool CHashInsert(CHashTable table, void *entry);
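
Taken together, the revised API splits initialization into three steps: CHashBootstrap() at shared-memory sizing time, CHashEstimateSize() on the bootstrapped table, and CHashInitialize() at creation or attach time. A sketch of the intended call sequence (editorial; the function names, descriptor values, and "my chash" are all hypothetical):

    /* Editorial sketch; values are hypothetical. */
    static CHashTable MyTable = NULL;
    static CHashDescriptor MyDesc = {
        "my chash",     /* shmem_name */
        1,              /* id */
        1000,           /* capacity */
        24,             /* element_size */
        8               /* key_size */
    };

    /* At shared-memory sizing time (runs in the postmaster): */
    Size
    MyTableShmemSize(void)
    {
        MyTable = CHashBootstrap(&MyDesc);
        return CHashEstimateSize(MyTable);
    }

    /* At shared-memory init time.  In the postmaster this creates the
     * table; in an EXEC_BACKEND child, MyTable starts over as NULL and
     * CHashInitialize() attaches to the existing structure instead. */
    void
    MyTableShmemInit(void)
    {
        MyTable = CHashInitialize(MyTable, &MyDesc);
    }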