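Fix performance problems in multi-batch hash joins by ensuring that we select a batch number that is reasonably independent of the bucket number, even when the underlying hash function distributes values poorly (batchno is now computed as hash_uint32(hashvalue) MOD nbatch).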

author Tom Lane <[email protected]>

Fri, 1 Jun 2007 15:58:09 +0000 (15:58 +0000)

committer Tom Lane <[email protected]>

Fri, 1 Jun 2007 15:58:09 +0000 (15:58 +0000)
author Tom Lane <[email protected]>
Fri, 1 Jun 2007 15:58:09 +0000 (15:58 +0000)
committer Tom Lane <[email protected]>
Fri, 1 Jun 2007 15:58:09 +0000 (15:58 +0000)
diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c

index d8e5732695e449445442fc483600c2a6a27c6f4d..89507fb8162c16c370253b67794a84b84ad38f5c 100644 (file)
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -267,6 +267,31 @@ hash_any(register const unsigned char *k, register int keylen)
                         /* case 0: nothing left to add */
         }
         mix(a, b, c);
+
+       /* report the result */
+       return UInt32GetDatum(c);
+}
+
+/*
+ * hash_uint32() -- hash a 32-bit value
+ *
+ * This has the same result (at least on little-endian machines) as
+ *             hash_any(&k, sizeof(uint32))
+ * but is faster and doesn't force the caller to store k into memory.
+ */
+Datum
+hash_uint32(uint32 k)
+{
+       register uint32 a,
+                               b,
+                               c;
+
+       a = 0x9e3779b9 + k;
+       b = 0x9e3779b9;
+       c = 3923095 + (uint32) sizeof(uint32);
+
+       mix(a, b, c);
+
         /* report the result */
         return UInt32GetDatum(c);
  }
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c

index d0fe9704d1a9d455e5d8650e640bc343e68e0b1e..40193ad085c26327af2888f46e5b1060b99c85e5 100644 (file)
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -20,6 +20,7 @@
   */
  #include "postgres.h"
  
+#include "access/hash.h"
  #include "executor/execdebug.h"
  #include "executor/hashjoin.h"
  #include "executor/instrument.h"
@@ -713,9 +714,11 @@ ExecHashGetHashValue(HashJoinTable hashtable,
   * chains), and must only cause the batch number to remain the same or
   * increase.  Our algorithm is
   *             bucketno = hashvalue MOD nbuckets
- *             batchno = (hashvalue DIV nbuckets) MOD nbatch
- * where nbuckets should preferably be prime so that all bits of the
- * hash value can affect both bucketno and batchno.
+ *             batchno = hash_uint32(hashvalue) MOD nbatch
+ * which gives reasonably independent bucket and batch numbers in the face
+ * of some rather poorly-implemented hash functions in hashfunc.c.  (This
+ * will change in PG 8.3.)
+ *
   * nbuckets doesn't change over the course of the join.
   *
   * nbatch is always a power of 2; we increase it only by doubling it.  This
@@ -734,7 +737,7 @@ ExecHashGetBucketAndBatch(HashJoinTable hashtable,
         {
                 *bucketno = hashvalue % nbuckets;
                 /* since nbatch is a power of 2, can do MOD by masking */
-               *batchno = (hashvalue / nbuckets) & (nbatch - 1);
+               *batchno = hash_uint32(hashvalue) & (nbatch - 1);
         }
         else
         {
diff --git a/src/include/access/hash.h b/src/include/access/hash.h

index 6f7863bd676a84311a3d9da7e0af86a8b2ecc2f7..1b9781336355a8edab197e3258454100c51f75df 100644 (file)
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -263,6 +263,7 @@ extern Datum hashname(PG_FUNCTION_ARGS);
  extern Datum hashtext(PG_FUNCTION_ARGS);
  extern Datum hashvarlena(PG_FUNCTION_ARGS);
  extern Datum hash_any(register const unsigned char *k, register int keylen);
+extern Datum hash_uint32(uint32 k);
  
  /* private routines */
author	Tom Lane <[email protected]>
	Fri, 1 Jun 2007 15:58:09 +0000 (15:58 +0000)
committer	Tom Lane <[email protected]>
	Fri, 1 Jun 2007 15:58:09 +0000 (15:58 +0000)
src/backend/access/hash/hashfunc.c		patch \| blob \| blame \| history
src/backend/executor/nodeHash.c		patch \| blob \| blame \| history
src/include/access/hash.h		patch \| blob \| blame \| history