From: Tomas Vondra
Date: Mon, 16 Jan 2017 18:18:05 +0000 (+0100)
Subject: temporarily merge all tuplesort changes from REL9_6_STABLE
X-Git-Tag: XL_10_R1BETA1~454
X-Git-Url: http://git.postgresql.org/gitweb/static/gitweb.js?a=commitdiff_plain;h=5591450d5a3de7c6c79b209d5d3a2512b85930c4;p=postgres-xl.git

temporarily merge all tuplesort changes from REL9_6_STABLE
---

diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 062f542b54..3c16282b30 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -13,26 +13,26 @@
  * See Knuth, volume 3, for more than you want to know about the external
  * sorting algorithm.  Historically, we divided the input into sorted runs
  * using replacement selection, in the form of a priority tree implemented
- * as a heap (essentially his Algorithm 5.2.3H -- although that strategy is
- * often avoided altogether), but that can now only happen first the first
- * run.  We merge the runs using polyphase merge, Knuth's Algorithm
+ * as a heap (essentially his Algorithm 5.2.3H), but now we only do that
+ * for the first run, and only if the run would otherwise end up being very
+ * short.  We merge the runs using polyphase merge, Knuth's Algorithm
  * 5.4.2D.  The logical "tapes" used by Algorithm D are implemented by
  * logtape.c, which avoids space wastage by recycling disk space as soon
  * as each block is read from its "tape".
  *
- * We never form the initial runs using Knuth's recommended replacement
- * selection data structure (Algorithm 5.4.1R), because it uses a fixed
- * number of records in memory at all times.  Since we are dealing with
- * tuples that may vary considerably in size, we want to be able to vary
- * the number of records kept in memory to ensure full utilization of the
- * allowed sort memory space.  So, we keep the tuples in a variable-size
- * heap, with the next record to go out at the top of the heap.  Like
- * Algorithm 5.4.1R, each record is stored with the run number that it
- * must go into, and we use (run number, key) as the ordering key for the
- * heap.  When the run number at the top of the heap changes, we know that
- * no more records of the prior run are left in the heap.  Note that there
- * are in practice only ever two distinct run numbers, due to the greatly
- * reduced use of replacement selection in PostgreSQL 9.6.
+ * We do not use Knuth's recommended data structure (Algorithm 5.4.1R) for
+ * the replacement selection, because it uses a fixed number of records
+ * in memory at all times.  Since we are dealing with tuples that may vary
+ * considerably in size, we want to be able to vary the number of records
+ * kept in memory to ensure full utilization of the allowed sort memory
+ * space.  So, we keep the tuples in a variable-size heap, with the next
+ * record to go out at the top of the heap.  Like Algorithm 5.4.1R, each
+ * record is stored with the run number that it must go into, and we use
+ * (run number, key) as the ordering key for the heap.  When the run number
+ * at the top of the heap changes, we know that no more records of the prior
+ * run are left in the heap.  Note that there are in practice only ever two
+ * distinct run numbers, because since PostgreSQL 9.6, we only use
+ * replacement selection to form the first run.
  *
  * In PostgreSQL 9.6, a heap (based on Knuth's Algorithm H, with some small
  * customizations) is only used with the aim of producing just one run,
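
The (run number, key) ordering that the new comment text describes is easy to
demonstrate in isolation.  What follows is a minimal standalone sketch of that
heap discipline, not the tuplesort.c implementation; the names RSItem, rs_cmp
and rs_sift_up are hypothetical.  Because the run number compares first, a
record tagged for the next run sinks below every current-run record no matter
how small its key, which is exactly why the heap drains run N completely
before run N+1 can surface.

#include <stdio.h>

typedef struct RSItem
{
	int			run;			/* run this record must go into */
	int			key;			/* sort key */
} RSItem;

/* order by run number first, then key: the (run number, key) trick */
static int
rs_cmp(const RSItem *a, const RSItem *b)
{
	if (a->run != b->run)
		return (a->run < b->run) ? -1 : 1;
	return (a->key < b->key) ? -1 : (a->key > b->key);
}

static void
rs_sift_up(RSItem *heap, int i)
{
	while (i > 0)
	{
		int			parent = (i - 1) / 2;
		RSItem		tmp;

		if (rs_cmp(&heap[i], &heap[parent]) >= 0)
			break;
		tmp = heap[i];
		heap[i] = heap[parent];
		heap[parent] = tmp;
		i = parent;
	}
}

int
main(void)
{
	/* three run-0 records plus one record already assigned to run 1 */
	RSItem		input[] = {{0, 42}, {0, 7}, {1, 3}, {0, 19}};
	RSItem		heap[4];
	int			n = 0;

	for (int i = 0; i < 4; i++)
	{
		heap[n] = input[i];
		rs_sift_up(heap, n++);
	}
	/* {1,3} stays below every run-0 entry despite having the smallest key */
	printf("top: run=%d key=%d\n", heap[0].run, heap[0].key);	/* run=0 key=7 */
	return 0;
}
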
@@ -148,9 +148,9 @@
 
 /* sort-type codes for sort__start probes */
-#define HEAP_SORT	0
-#define INDEX_SORT	1
-#define DATUM_SORT	2
+#define HEAP_SORT		0
+#define INDEX_SORT		1
+#define DATUM_SORT		2
 #define CLUSTER_SORT	3
 #ifdef PGXC
 #define MERGE_SORT 4
@@ -1591,7 +1591,7 @@ tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
 		mtup->datum1 = index_getattr(tuple,
 									 1,
 									 RelationGetDescr(state->indexRel),
-									 &stup.isnull1);
+									 &mtup->isnull1);
 	}
 }
 
@@ -2080,6 +2080,7 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 							  sizeof(unsigned int)))
 				return false;
 			tuplen = getlen(state, state->result_tape, false);
+
 			/*
 			 * Back up to get ending length word of tuple before it.
 			 */
@@ -2196,6 +2197,10 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
  * determination of "non-equal tuple" based on simple binary inequality.  A
  * NULL value in leading attribute will set abbreviated value to zeroed
  * representation, which caller may rely on in abbreviated inequality check.
+ *
+ * The slot receives a copied tuple (sometimes allocated in caller memory
+ * context) that will stay valid regardless of future manipulations of the
+ * tuplesort's state.
  */
 bool
 tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
@@ -2216,6 +2221,11 @@ tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
 		if (state->sortKeys->abbrev_converter && abbrev)
 			*abbrev = stup.datum1;
 
+		if (!should_free)
+		{
+			stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple);
+			should_free = true;
+		}
 		ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, should_free);
 		return true;
 	}
@@ -3017,6 +3027,9 @@ batchmemtuples(Tuplesortstate *state)
 	int64		availMemLessRefund;
 	int			memtupsize = state->memtupsize;
 
+	/* Caller error if we have no tapes */
+	Assert(state->activeTapes > 0);
+
 	/* For simplicity, assume no memtuples are actually currently counted */
 	Assert(state->memtupcount == 0);
 
@@ -3030,6 +3043,20 @@ batchmemtuples(Tuplesortstate *state)
 	refund = memtupsize * STANDARDCHUNKHEADERSIZE;
 	availMemLessRefund = state->availMem - refund;
 
+	/*
+	 * We need to be sure that we do not cause LACKMEM to become true, else
+	 * the batch allocation size could be calculated as negative, causing
+	 * havoc.  Hence, if availMemLessRefund is negative at this point, we must
+	 * do nothing.  Moreover, if it's positive but rather small, there's
+	 * little point in proceeding because we could only increase memtuples by
+	 * a small amount, not worth the cost of the repalloc's.  We somewhat
+	 * arbitrarily set the threshold at ALLOCSET_DEFAULT_INITSIZE per tape.
+	 * (Note that this does not represent any assumption about tuple sizes.)
+	 */
+	if (availMemLessRefund <=
+		(int64) state->activeTapes * ALLOCSET_DEFAULT_INITSIZE)
+		return;
+
 	/*
 	 * To establish balanced memory use after refunding palloc overhead,
 	 * temporarily have our accounting indicate that we've allocated all
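
The early-return guard that closes the hunk above is plain int64 arithmetic,
so it can be exercised on its own.  Here is a standalone sketch under stated
assumptions: batch_refund_guard and CHUNK_HEADER_SIZE are hypothetical
stand-ins for the tuplesort accounting (the real code uses
STANDARDCHUNKHEADERSIZE and fields of Tuplesortstate), while
ALLOCSET_DEFAULT_INITSIZE is given its usual 8kB value from memutils.h.  With
a tight budget the refund drives the remainder negative and the guard bails
out before LACKMEM could become true; with an ample budget the remainder is
returned and batch growth can proceed.

#include <stdint.h>
#include <stdio.h>

#define ALLOCSET_DEFAULT_INITSIZE	(8 * 1024)	/* 8kB, as in memutils.h */
#define CHUNK_HEADER_SIZE			16	/* stand-in for STANDARDCHUNKHEADERSIZE */

/* Returns the usable budget, or -1 when batch growth should be skipped. */
static int64_t
batch_refund_guard(int64_t avail_mem, int memtupsize, int active_tapes)
{
	int64_t		refund = (int64_t) memtupsize * CHUNK_HEADER_SIZE;
	int64_t		avail_less_refund = avail_mem - refund;

	/*
	 * Proceed only when the refund leaves real headroom: at least
	 * ALLOCSET_DEFAULT_INITSIZE per tape, and in particular never a
	 * negative budget that could poison later size computations.
	 */
	if (avail_less_refund <=
		(int64_t) active_tapes * ALLOCSET_DEFAULT_INITSIZE)
		return -1;
	return avail_less_refund;
}

int
main(void)
{
	/* tight budget: refund exceeds available memory, guard fires (-1) */
	printf("%lld\n", (long long) batch_refund_guard(100 * 1024, 8192, 7));
	/* ample budget: guard passes, remaining budget is usable */
	printf("%lld\n", (long long) batch_refund_guard(64 * 1024 * 1024, 8192, 7));
	return 0;
}
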
@@ -3039,9 +3066,11 @@ batchmemtuples(Tuplesortstate *state)
 	state->growmemtuples = true;
 	USEMEM(state, availMemLessRefund);
 	(void) grow_memtuples(state);
-	/* Should not matter, but be tidy */
-	FREEMEM(state, availMemLessRefund);
 	state->growmemtuples = false;
+	/* availMem must stay accurate for spacePerTape calculation */
+	FREEMEM(state, availMemLessRefund);
+	if (LACKMEM(state))
+		elog(ERROR, "unexpected out-of-memory situation in tuplesort");
 
 #ifdef TRACE_SORT
 	if (trace_sort)
@@ -4036,6 +4065,7 @@ readtup_alloc(Tuplesortstate *state, int tapenum, Size tuplen)
 	}
 }
 
+
 /*
  * Routines specialized for HeapTuple (actually MinimalTuple) case
  */
@@ -4469,7 +4499,7 @@ copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
 			mtup->datum1 = heap_getattr(tuple,
 										state->indexInfo->ii_KeyAttrNumbers[0],
 										state->tupDesc,
-										&stup->isnull1);
+										&mtup->isnull1);
 		}
 	}
 }
@@ -4790,7 +4820,7 @@ copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
 		mtup->datum1 = index_getattr(tuple,
 									 1,
 									 RelationGetDescr(state->indexRel),
-									 &stup->isnull1);
+									 &mtup->isnull1);
 		}
 	}
 }
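
The three changes from &stup.isnull1 or &stup->isnull1 to &mtup->isnull1 are
one fix applied in three places: heap_getattr and index_getattr return the
datum itself and report nullness through a pointer out-parameter, and since
the datum is stored into mtup->datum1, the null flag must be stored into that
same structure, or the datum and its flag fall out of sync.  Below is a
standalone sketch of this bug class, with hypothetical names (SortSlot,
fake_getattr) rather than the PostgreSQL structures.

#include <stdbool.h>
#include <stdio.h>

typedef struct SortSlot
{
	int			datum1;
	bool		isnull1;
} SortSlot;

/* mimics heap_getattr/index_getattr: value returned, nullness via out-param */
static int
fake_getattr(int value, bool value_is_null, bool *isnull)
{
	*isnull = value_is_null;
	return value_is_null ? 0 : value;
}

int
main(void)
{
	SortSlot	kept = {0, false};		/* the copy whose datum1 we set */
	SortSlot	scratch = {0, false};	/* a different, transient struct */

	/* buggy: datum goes to 'kept', but the null flag lands in 'scratch' */
	kept.datum1 = fake_getattr(0, true, &scratch.isnull1);
	printf("buggy: kept.isnull1 = %d\n", kept.isnull1);		/* prints 0 */

	/* fixed: the flag is stored alongside the datum it describes */
	kept.datum1 = fake_getattr(0, true, &kept.isnull1);
	printf("fixed: kept.isnull1 = %d\n", kept.isnull1);		/* prints 1 */

	return 0;
}
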