From 051185fcbb8acfdfd44af0cafbb7953bed363363 Mon Sep 17 00:00:00 2001
From: Melanie Plageman
Date: Mon, 31 Aug 2020 11:53:32 -0700
Subject: [PATCH v11] Implement Adaptive Hashjoin

If the inner side tuples of a hashjoin will not fit in memory, the hashjoin
can be executed in multiple batches. If the statistics on the inner side
relation are accurate, the planner chooses a multi-batch strategy and sets
the number of batches. The query executor measures the real size of the
hashtable and increases the number of batches if the hashtable grows too
large. The number of batches is always a power of two, so each increase
doubles it.

Serial hashjoin measures batch size lazily, waiting until it is loading a
batch to determine whether it will fit in memory. Parallel hashjoin, on the
other hand, completes all changes to the number of batches during the build
phase. If it doubles the number of batches, it dumps all the tuples out,
reassigns them to batches, measures each batch, and checks that each will
fit in the space allowed.

In both cases, the executor currently makes only a best effort. If a
particular batch won't fit in memory and, upon changing the number of
batches, none of the tuples move to a new batch, the executor disables
growth in the number of batches globally. After growth is disabled, all
batches that would previously have triggered an increase in the number of
batches instead exceed the space allowed. There is no mechanism to perform a
hashjoin within the memory constraints if a run of tuples hashes to the same
batch. Also, hashjoin will continue to double the number of batches as long
as *some* tuples move each time -- even if the batch will never fit in
memory -- resulting in an explosion in the number of batches (which hurts
performance for multiple reasons).

Adaptive hashjoin is a mechanism to process a run of inner side tuples whose
join keys hash to the same batch in a manner that is efficient and respects
the space allowed.

When an offending batch causes the number of batches to be doubled and some
percentage of its tuples would not move to a new batch, that batch can be
marked to "fall back". This mechanism replaces serial hashjoin's
"growEnabled" flag and part of the functionality of parallel hashjoin's
"growth = PHJ_GROWTH_DISABLED" state. However, instead of disabling growth
in the number of batches for all batches, it only prevents this batch from
causing another increase in the number of batches.

When the inner side of such a batch is loaded into memory, stripes of
arbitrary tuples totaling work_mem in size are loaded into the hashtable.
After probing a stripe, the outer side batch is rewound and the next stripe
is loaded. Each stripe of the inner side is probed until all tuples have
been processed.

Tuples that match are emitted (depending on the join semantics of the
particular join type) while probing a stripe. In order to make left outer
join work, unmatched outer tuples cannot be emitted NULL-extended until all
stripes have been probed. To address this, a bitmap is created with one bit
for each tuple of the outer side. If a tuple on the outer side matches a
tuple from the inner, the corresponding bit is set. After probing all
stripes, the executor scans the bitmap and emits the unmatched outer tuples.
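For illustration, here is a minimal standalone sketch of the fallback
decision applied when a batch triggers a doubling of the number of batches.
The patch itself implements this with the MAX_RELOCATION threshold and
per-batch fallback state (hashloopBatchFile / hashloop_fallback); the helper
name and signature below are invented for this sketch only:

    #include <stdbool.h>

    /* Sketch only, not patch code: if at least MAX_RELOCATION of the
     * examined tuples either stay in the parent batch or all migrate to the
     * same child batch, further splitting will not help, so the batch that
     * holds the skewed tuples should be marked to "fall back" and be
     * processed stripe by stripe. */
    #define MAX_RELOCATION 0.8

    static inline bool
    batch_should_fall_back(long ninmemory,       /* tuples examined */
                           long kept_in_parent,  /* stayed in parent batch */
                           long moved_to_child)  /* moved to the new child */
    {
        if (ninmemory == 0)
            return false;
        return (moved_to_child / (double) ninmemory) >= MAX_RELOCATION ||
               (kept_in_parent / (double) ninmemory) >= MAX_RELOCATION;
    }

A batch marked this way is then probed in work_mem-sized stripes, with the
outer-side bitmap deciding which outer tuples still need NULL-extension at
the end.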
Co-authored-by: Jesse Zhang Co-authored-by: David Kimura Co-authored-by: Soumyadeep Chakraborty --- src/backend/commands/explain.c | 43 +- src/backend/executor/nodeHash.c | 749 ++++++-- src/backend/executor/nodeHashjoin.c | 794 ++++++-- src/backend/postmaster/pgstat.c | 31 +- src/backend/utils/sort/Makefile | 1 + src/backend/utils/sort/sharedbits.c | 288 +++ src/backend/utils/sort/sharedtuplestore.c | 96 +- src/include/commands/explain.h | 1 + src/include/executor/hashjoin.h | 132 +- src/include/executor/instrument.h | 7 + src/include/executor/nodeHash.h | 9 +- src/include/executor/tuptable.h | 2 + src/include/nodes/execnodes.h | 5 + src/include/pgstat.h | 11 +- src/include/utils/sharedbits.h | 39 + src/include/utils/sharedtuplestore.h | 21 + src/test/regress/expected/join_hash.out | 2024 ++++++++++++++++++++- src/test/regress/sql/join_hash.sql | 214 ++- 18 files changed, 4173 insertions(+), 294 deletions(-) create mode 100644 src/backend/utils/sort/sharedbits.c create mode 100644 src/include/utils/sharedbits.h diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index c98c9b5547..1ce37dc4e2 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -185,6 +185,8 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt, es->wal = defGetBoolean(opt); else if (strcmp(opt->defname, "settings") == 0) es->settings = defGetBoolean(opt); + else if (strcmp(opt->defname, "usage") == 0) + es->usage = defGetBoolean(opt); else if (strcmp(opt->defname, "timing") == 0) { timing_set = true; @@ -308,6 +310,7 @@ NewExplainState(void) /* Set default options (most fields can be left as zeroes). */ es->costs = true; + es->usage = true; /* Prepare output buffer. */ es->str = makeStringInfo(); @@ -3011,22 +3014,50 @@ show_hash_info(HashState *hashstate, ExplainState *es) else if (hinstrument.nbatch_original != hinstrument.nbatch || hinstrument.nbuckets_original != hinstrument.nbuckets) { + ListCell *lc; + ExplainIndentText(es); appendStringInfo(es->str, - "Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n", + "Buckets: %d (originally %d) Batches: %d (originally %d)", hinstrument.nbuckets, hinstrument.nbuckets_original, hinstrument.nbatch, - hinstrument.nbatch_original, - spacePeakKb); + hinstrument.nbatch_original); + if (es->usage) + appendStringInfo(es->str, " Memory Usage: %ldkB\n", spacePeakKb); + else + appendStringInfo(es->str, "\n"); + + foreach(lc, hinstrument.fallback_batches_stats) + { + FallbackBatchStats *fbs = lfirst(lc); + + ExplainIndentText(es); + appendStringInfo(es->str, "Batch: %d Stripes: %d\n", fbs->batchno, fbs->numstripes); + } } else { + ListCell *lc; + ExplainIndentText(es); appendStringInfo(es->str, - "Buckets: %d Batches: %d Memory Usage: %ldkB\n", - hinstrument.nbuckets, hinstrument.nbatch, - spacePeakKb); + "Buckets: %d Batches: %d", + hinstrument.nbuckets, hinstrument.nbatch); + if (es->usage) + appendStringInfo(es->str, " Memory Usage: %ldkB\n", spacePeakKb); + else + appendStringInfo(es->str, "\n"); + foreach(lc, hinstrument.fallback_batches_stats) + { + FallbackBatchStats *fbs = lfirst(lc); + + ExplainIndentText(es); + appendStringInfo(es->str, + "Batch: %d Stripes: %d\n", + fbs->batchno, + fbs->numstripes); + } } } } diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index ea69eeb2a1..8a62c0d2dd 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -60,6 +60,7 @@ static void *dense_alloc(HashJoinTable hashtable, Size size); static HashJoinTuple 
ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size, dsa_pointer *shared); +static void ExecParallelHashTableEvictBatch0(HashJoinTable hashtable); static void MultiExecPrivateHash(HashState *node); static void MultiExecParallelHash(HashState *node); static inline HashJoinTuple ExecParallelHashFirstTuple(HashJoinTable table, @@ -72,6 +73,9 @@ static inline void ExecParallelHashPushTuple(dsa_pointer_atomic *head, static void ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch); static void ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable); static void ExecParallelHashRepartitionFirst(HashJoinTable hashtable); +static void ExecParallelHashRepartitionBatch0Tuple(HashJoinTable hashtable, + MinimalTuple tuple, + uint32 hashvalue); static void ExecParallelHashRepartitionRest(HashJoinTable hashtable); static HashMemoryChunk ExecParallelHashPopChunkQueue(HashJoinTable table, dsa_pointer *shared); @@ -184,13 +188,53 @@ MultiExecPrivateHash(HashState *node) } else { - /* Not subject to skew optimization, so insert normally */ - ExecHashTableInsert(hashtable, slot, hashvalue); + /* + * Not subject to skew optimization, so either insert normally + * or save to batch file if batch 0 falls back and we have + * already filled the hashtable up to space_allowed. + */ + int bucketno; + int batchno; + bool shouldFree; + MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree); + + ExecHashGetBucketAndBatch(hashtable, hashvalue, + &bucketno, &batchno); + + /* + * If we set batch 0 to fallback on the previous tuple Save + * the tuples in this batch which will not fit in the + * hashtable should I be checking that hashtable->curstripe != + * 0? + */ + if (hashtable->hashloopBatchFile && hashtable->hashloopBatchFile[0]) + ExecHashJoinSaveTuple(tuple, + hashvalue, + &hashtable->innerBatchFile[batchno]); + else + ExecHashTableInsert(hashtable, slot, hashvalue); + + if (shouldFree) + heap_free_minimal_tuple(tuple); } hashtable->totalTuples += 1; } } + /* + * If batch 0 fell back, rewind the inner side file where we saved the + * tuples which did not fit in memory to prepare it for loading upon + * finishing probing stripe 0 of batch 0 + */ + if (hashtable->innerBatchFile && hashtable->innerBatchFile[0]) + { + if (BufFileSeek(hashtable->innerBatchFile[0], 0, 0L, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind hash-join temporary file: %m"))); + } + + /* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */ if (hashtable->nbuckets != hashtable->nbuckets_optimal) ExecHashIncreaseNumBuckets(hashtable); @@ -319,9 +363,9 @@ MultiExecParallelHash(HashState *node) * are now fixed. While building them we made sure they'd fit * in our memory budget when we load them back in later (or we * tried to do that and gave up because we detected extreme - * skew). + * skew and thus marked them to fall back). 
*/ - pstate->growth = PHJ_GROWTH_DISABLED; + pstate->growth = PHJ_GROWTH_LOADING; } } @@ -496,12 +540,14 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations, hashtable->curbatch = 0; hashtable->nbatch_original = nbatch; hashtable->nbatch_outstart = nbatch; - hashtable->growEnabled = true; hashtable->totalTuples = 0; hashtable->partialTuples = 0; hashtable->skewTuples = 0; hashtable->innerBatchFile = NULL; hashtable->outerBatchFile = NULL; + hashtable->hashloopBatchFile = NULL; + hashtable->fallback_batches_stats = NULL; + hashtable->curstripe = STRIPE_DETACHED; hashtable->spaceUsed = 0; hashtable->spacePeak = 0; hashtable->spaceAllowed = space_allowed; @@ -573,6 +619,8 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations, palloc0(nbatch * sizeof(BufFile *)); hashtable->outerBatchFile = (BufFile **) palloc0(nbatch * sizeof(BufFile *)); + hashtable->hashloopBatchFile = (BufFile **) + palloc0(nbatch * sizeof(BufFile *)); /* The files will not be opened until needed... */ /* ... but make sure we have temp tablespaces established for them */ PrepareTempTablespaces(); @@ -856,18 +904,19 @@ ExecHashTableDestroy(HashJoinTable hashtable) int i; /* - * Make sure all the temp files are closed. We skip batch 0, since it - * can't have any temp files (and the arrays might not even exist if - * nbatch is only 1). Parallel hash joins don't use these files. + * Make sure all the temp files are closed. Parallel hash joins don't use + * these files. */ if (hashtable->innerBatchFile != NULL) { - for (i = 1; i < hashtable->nbatch; i++) + for (i = 0; i < hashtable->nbatch; i++) { if (hashtable->innerBatchFile[i]) BufFileClose(hashtable->innerBatchFile[i]); if (hashtable->outerBatchFile[i]) BufFileClose(hashtable->outerBatchFile[i]); + if (hashtable->hashloopBatchFile[i]) + BufFileClose(hashtable->hashloopBatchFile[i]); } } @@ -878,6 +927,18 @@ ExecHashTableDestroy(HashJoinTable hashtable) pfree(hashtable); } +/* + * Threshhold for tuple relocation during batch split for parallel and serial + * hashjoin. + * While growing the number of batches, for the batch which triggered the growth, + * if more than MAX_RELOCATION % of its tuples move to its child batch, then + * it likely has skewed data and so the child batch (the new home to the skewed + * tuples) will be marked as a "fallback" batch and processed using the hashloop + * join algorithm. The reverse is true as well: if more than MAX_RELOCATION + * remain in the parent, it too should be marked to "fallback". 
+ */ +#define MAX_RELOCATION 0.8 + /* * ExecHashIncreaseNumBatches * increase the original number of batches in order to reduce @@ -888,14 +949,19 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) { int oldnbatch = hashtable->nbatch; int curbatch = hashtable->curbatch; + int childbatch; int nbatch; MemoryContext oldcxt; long ninmemory; long nfreed; HashMemoryChunk oldchunks; + int curbatch_outgoing_tuples; + int childbatch_outgoing_tuples; + int target_batch; + FallbackBatchStats *fallback_batch_stats; + size_t batchSize = 0; - /* do nothing if we've decided to shut off growth */ - if (!hashtable->growEnabled) + if (hashtable->hashloopBatchFile && hashtable->hashloopBatchFile[curbatch]) return; /* safety check to avoid overflow */ @@ -919,6 +985,8 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) palloc0(nbatch * sizeof(BufFile *)); hashtable->outerBatchFile = (BufFile **) palloc0(nbatch * sizeof(BufFile *)); + hashtable->hashloopBatchFile = (BufFile **) + palloc0(nbatch * sizeof(BufFile *)); /* time to establish the temp tablespaces, too */ PrepareTempTablespaces(); } @@ -929,10 +997,14 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) repalloc(hashtable->innerBatchFile, nbatch * sizeof(BufFile *)); hashtable->outerBatchFile = (BufFile **) repalloc(hashtable->outerBatchFile, nbatch * sizeof(BufFile *)); + hashtable->hashloopBatchFile = (BufFile **) + repalloc(hashtable->hashloopBatchFile, nbatch * sizeof(BufFile *)); MemSet(hashtable->innerBatchFile + oldnbatch, 0, (nbatch - oldnbatch) * sizeof(BufFile *)); MemSet(hashtable->outerBatchFile + oldnbatch, 0, (nbatch - oldnbatch) * sizeof(BufFile *)); + MemSet(hashtable->hashloopBatchFile + oldnbatch, 0, + (nbatch - oldnbatch) * sizeof(BufFile *)); } MemoryContextSwitchTo(oldcxt); @@ -944,6 +1016,8 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) * no longer of the current batch. */ ninmemory = nfreed = 0; + curbatch_outgoing_tuples = childbatch_outgoing_tuples = 0; + childbatch = (1U << (my_log2(hashtable->nbatch) - 1)) | hashtable->curbatch; /* If know we need to resize nbuckets, we can do it while rebatching. */ if (hashtable->nbuckets_optimal != hashtable->nbuckets) @@ -990,7 +1064,7 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue, &bucketno, &batchno); - if (batchno == curbatch) + if (batchno == curbatch && (curbatch != 0 || batchSize + hashTupleSize < hashtable->spaceAllowed)) { /* keep tuple in memory - copy it into the new chunk */ HashJoinTuple copyTuple; @@ -1001,17 +1075,29 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) /* and add it back to the appropriate bucket */ copyTuple->next.unshared = hashtable->buckets.unshared[bucketno]; hashtable->buckets.unshared[bucketno] = copyTuple; + curbatch_outgoing_tuples++; + batchSize += hashTupleSize; } else { /* dump it out */ - Assert(batchno > curbatch); + Assert(batchno > curbatch || batchSize + hashTupleSize >= hashtable->spaceAllowed); ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(hashTuple), hashTuple->hashvalue, &hashtable->innerBatchFile[batchno]); hashtable->spaceUsed -= hashTupleSize; nfreed++; + + /* + * TODO: what to do about tuples that don't go to the child + * batch or stay in the current batch? 
(this is why we are + * counting tuples to child and curbatch with two diff + * variables in case the tuples go to a batch that isn't the + * child) + */ + if (batchno == childbatch) + childbatch_outgoing_tuples++; } /* next tuple in this chunk */ @@ -1032,21 +1118,33 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) #endif /* - * If we dumped out either all or none of the tuples in the table, disable - * further expansion of nbatch. This situation implies that we have - * enough tuples of identical hashvalues to overflow spaceAllowed. - * Increasing nbatch will not fix it since there's no way to subdivide the - * group any more finely. We have to just gut it out and hope the server - * has enough RAM. + * The same batch should not be marked to fall back more than once */ - if (nfreed == 0 || nfreed == ninmemory) - { - hashtable->growEnabled = false; #ifdef HJDEBUG - printf("Hashjoin %p: disabling further increase of nbatch\n", - hashtable); + if ((childbatch_outgoing_tuples / (float) ninmemory) >= 0.8) + printf("childbatch %i targeted to fallback.", childbatch); + if ((curbatch_outgoing_tuples / (float) ninmemory) >= 0.8) + printf("curbatch %i targeted to fallback.", curbatch); #endif - } + + /* + * If too many tuples remain in the parent or too many tuples migrate to + * the child, there is likely skew and continuing to increase the number + * of batches will not help. Mark the batch which contains the skewed + * tuples to be processed with block nested hashloop join. + */ + if ((childbatch_outgoing_tuples / (float) ninmemory) >= MAX_RELOCATION) + target_batch = childbatch; + else if ((curbatch_outgoing_tuples / (float) ninmemory) >= MAX_RELOCATION) + target_batch = curbatch; + else + return; + hashtable->hashloopBatchFile[target_batch] = BufFileCreateTemp(false); + + fallback_batch_stats = palloc0(sizeof(FallbackBatchStats)); + fallback_batch_stats->batchno = target_batch; + fallback_batch_stats->numstripes = 0; + hashtable->fallback_batches_stats = lappend(hashtable->fallback_batches_stats, fallback_batch_stats); } /* @@ -1199,6 +1297,11 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable) ExecParallelHashTableSetCurrentBatch(hashtable, 0); /* Then partition, flush counters. */ ExecParallelHashRepartitionFirst(hashtable); + + /* + * TODO: add a debugging check that confirms that all the tuples + * from the old generation are present in the new generation + */ ExecParallelHashRepartitionRest(hashtable); ExecParallelHashMergeCounters(hashtable); /* Wait for the above to be finished. */ @@ -1217,7 +1320,6 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable) WAIT_EVENT_HASH_GROW_BATCHES_DECIDE)) { bool space_exhausted = false; - bool extreme_skew_detected = false; /* Make sure that we have the current dimensions and buckets. */ ExecParallelHashEnsureBatchAccessors(hashtable); @@ -1228,27 +1330,83 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable) { ParallelHashJoinBatch *batch = hashtable->batches[i].shared; + /* + * All batches were just created anew during + * repartitioning + */ + Assert(!hashtable->batches[i].shared->hashloop_fallback); + + /* + * At the time of repartitioning, each batch updates its + * estimated_size to reflect the size of the batch file on + * disk. It is also updated when increasing preallocated + * space in ExecParallelHashTuplePrealloc(). 
+ * + * Batch 0 is inserted into memory during the build stage, + * it can spill to a file, so the size member, which + * reflects the part of batch 0 in memory should never + * exceed the space_allowed. + */ + Assert(batch->size <= pstate->space_allowed); + if (batch->space_exhausted || batch->estimated_size > pstate->space_allowed) { int parent; + float frac_moved; space_exhausted = true; + parent = i % pstate->old_nbatch; + frac_moved = batch->ntuples / (float) hashtable->batches[parent].shared->old_ntuples; + /* - * Did this batch receive ALL of the tuples from its - * parent batch? That would indicate that further - * repartitioning isn't going to help (the hash values - * are probably all the same). + * If too many tuples remain in the parent or too many + * tuples migrate to the child, there is likely skew + * and continuing to increase the number of batches + * will not help. Mark the batch which contains the + * skewed tuples to be processed with block nested + * hashloop join. */ - parent = i % pstate->old_nbatch; - if (batch->ntuples == hashtable->batches[parent].shared->old_ntuples) - extreme_skew_detected = true; + if (frac_moved >= MAX_RELOCATION) + { + batch->hashloop_fallback = true; + space_exhausted = false; + } } + + /* + * If all of the tuples in the hashtable were put back in + * the hashtable during repartitioning, mark this batch as + * a fallback batch so that we will evict the tuples to a + * spill file were we to run out of space again This has + * the problem of wasting a lot of time during the probe + * phase if it turns out that we never try and allocate + * any more memory in the hashtable. + * + * TODO: It might be worth doing something to indicate + * that if all of the tuples went back into a batch but it + * only exactly used the space_allowed, that the batch is + * not a fallback batch yet but that the current stripe is + * full, so if you need to allocate more, it would mark it + * as a fallback batch. Otherwise, a batch 0 with no + * tuples in spill files will still be treated as a + * fallback batch during probing + */ + if (i == 0 && hashtable->batches[0].shared->size == pstate->space_allowed) + { + if (hashtable->batches[0].shared->ntuples == hashtable->batches[0].shared->old_ntuples) + { + hashtable->batches[0].shared->hashloop_fallback = true; + space_exhausted = false; + } + } + if (space_exhausted) + break; } - /* Don't keep growing if it's not helping or we'd overflow. */ - if (extreme_skew_detected || hashtable->nbatch >= INT_MAX / 2) + /* Don't keep growing if we'd overflow. 
*/ + if (hashtable->nbatch >= INT_MAX / 2) pstate->growth = PHJ_GROWTH_DISABLED; else if (space_exhausted) pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES; @@ -1276,65 +1434,153 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable) static void ExecParallelHashRepartitionFirst(HashJoinTable hashtable) { + ParallelHashJoinState *pstate; + + ParallelHashJoinBatch *old_shared; + SharedTuplestoreAccessor *old_inner_batch0_sts; + dsa_pointer chunk_shared; HashMemoryChunk chunk; - Assert(hashtable->nbatch == hashtable->parallel_state->nbatch); + ParallelHashJoinBatch *old_batches = (ParallelHashJoinBatch *) dsa_get_address(hashtable->area, hashtable->parallel_state->old_batches); + + Assert(old_batches); + old_shared = NthParallelHashJoinBatch(old_batches, 0); + old_inner_batch0_sts = sts_attach(ParallelHashJoinBatchInner(old_shared), ParallelWorkerNumber + 1, &hashtable->parallel_state->fileset); + + pstate = hashtable->parallel_state; - while ((chunk = ExecParallelHashPopChunkQueue(hashtable, &chunk_shared))) + Assert(hashtable->nbatch == hashtable->parallel_state->nbatch); + BarrierAttach(&pstate->repartition_barrier); + switch (PHJ_REPARTITION_BATCH0_PHASE(BarrierPhase(&pstate->repartition_barrier))) { - size_t idx = 0; + case PHJ_REPARTITION_BATCH0_DRAIN_QUEUE: + while ((chunk = ExecParallelHashPopChunkQueue(hashtable, &chunk_shared))) + { + MinimalTuple tuple; + size_t idx = 0; - /* Repartition all tuples in this chunk. */ - while (idx < chunk->used) - { - HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx); - MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple); - HashJoinTuple copyTuple; - dsa_pointer shared; - int bucketno; - int batchno; + /* + * Repartition all tuples in this chunk. These tuples may be + * relocated to a batch file or may be inserted back into + * memory. + */ + while (idx < chunk->used) + { + HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx); - ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue, - &bucketno, &batchno); + tuple = HJTUPLE_MINTUPLE(hashTuple); - Assert(batchno < hashtable->nbatch); - if (batchno == 0) - { - /* It still belongs in batch 0. Copy to a new chunk. */ - copyTuple = - ExecParallelHashTupleAlloc(hashtable, - HJTUPLE_OVERHEAD + tuple->t_len, - &shared); - copyTuple->hashvalue = hashTuple->hashvalue; - memcpy(HJTUPLE_MINTUPLE(copyTuple), tuple, tuple->t_len); - ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno], - copyTuple, shared); + ExecParallelHashRepartitionBatch0Tuple(hashtable, + tuple, + hashTuple->hashvalue); + + idx += MAXALIGN(HJTUPLE_OVERHEAD + HJTUPLE_MINTUPLE(hashTuple)->t_len); + } + + dsa_free(hashtable->area, chunk_shared); + CHECK_FOR_INTERRUPTS(); } - else + BarrierArriveAndWait(&pstate->repartition_barrier, WAIT_EVENT_HASH_REPARTITION_BATCH0_DRAIN_QUEUE); + /* FALLTHROUGH */ + case PHJ_REPARTITION_BATCH0_DRAIN_SPILL_FILE: { - size_t tuple_size = - MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len); + MinimalTuple tuple; + tupleMetadata metadata; - /* It belongs in a later batch. */ - hashtable->batches[batchno].estimated_size += tuple_size; - sts_puttuple(hashtable->batches[batchno].inner_tuples, - &hashTuple->hashvalue, tuple); + /* + * Repartition all of the tuples in this spill file. These + * tuples may go back into the hashtable if space was freed up + * or they may go into another batch or they may go into the + * batch 0 spill file. 
+ */ + sts_begin_parallel_scan(old_inner_batch0_sts); + while ((tuple = sts_parallel_scan_next(old_inner_batch0_sts, + &metadata.hashvalue))) + { + + ExecParallelHashRepartitionBatch0Tuple(hashtable, + tuple, + metadata.hashvalue); + } + sts_end_parallel_scan(old_inner_batch0_sts); } + } + BarrierArriveAndDetach(&pstate->repartition_barrier); +} - /* Count this tuple. */ - ++hashtable->batches[0].old_ntuples; - ++hashtable->batches[batchno].ntuples; +static void +ExecParallelHashRepartitionBatch0Tuple(HashJoinTable hashtable, + MinimalTuple tuple, + uint32 hashvalue) +{ + int batchno; + int bucketno; + dsa_pointer shared; + HashJoinTuple copyTuple; + ParallelHashJoinState *pstate = hashtable->parallel_state; + bool spill = true; + bool hashtable_full = hashtable->batches[0].shared->size >= pstate->space_allowed; + size_t tuple_size = + MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len); - idx += MAXALIGN(HJTUPLE_OVERHEAD + - HJTUPLE_MINTUPLE(hashTuple)->t_len); + ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno); + + /* + * We don't take a lock to read pstate->space_allowed because it should + * not change during execution of the hash join + */ + + Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASHING_INNER); + if (batchno == 0 && !hashtable_full) + { + copyTuple = ExecParallelHashTupleAlloc(hashtable, + HJTUPLE_OVERHEAD + tuple->t_len, + &shared); + + /* + * TODO: do we need to check if growth was set to + * PHJ_GROWTH_SPILL_BATCH0? + */ + if (copyTuple) + { + /* Store the hash value in the HashJoinTuple header. */ + copyTuple->hashvalue = hashvalue; + memcpy(HJTUPLE_MINTUPLE(copyTuple), tuple, tuple->t_len); + + /* Push it onto the front of the bucket's list */ + ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno], + copyTuple, shared); + pg_atomic_add_fetch_u64(&hashtable->batches[0].shared->ntuples_in_memory, 1); + + spill = false; } + } - /* Free this chunk. */ - dsa_free(hashtable->area, chunk_shared); + if (spill) + { - CHECK_FOR_INTERRUPTS(); + tupleMetadata metadata; + + ParallelHashJoinBatchAccessor *batch_accessor = &(hashtable->batches[batchno]); + + /* + * It is okay to use backend local here because force spill tuple is + * only done during repartitioning when we can't grow batches so won't + * make decision based on it and will merge counters during deciding + * and during evictbatch0 which can ony be done on a batch that is + * already fallback so we won't make decision on it and will merge + * counters after the build phase + */ + batch_accessor->estimated_size += tuple_size; + metadata.hashvalue = hashvalue; + + sts_puttuple(batch_accessor->inner_tuples, + &metadata, + tuple); } + ++hashtable->batches[batchno].ntuples; + ++hashtable->batches[0].old_ntuples; } /* @@ -1371,24 +1617,41 @@ ExecParallelHashRepartitionRest(HashJoinTable hashtable) /* Scan one partition from the previous generation. */ sts_begin_parallel_scan(old_inner_tuples[i]); - while ((tuple = sts_parallel_scan_next(old_inner_tuples[i], &hashvalue))) + while ((tuple = sts_parallel_scan_next(old_inner_tuples[i], + &hashvalue))) { - size_t tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len); int bucketno; int batchno; + size_t tuple_size; + tupleMetadata metadata; + ParallelHashJoinBatchAccessor *batch_accessor; + /* Decide which partition it goes to in the new generation. 
*/ ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno); - hashtable->batches[batchno].estimated_size += tuple_size; - ++hashtable->batches[batchno].ntuples; - ++hashtable->batches[i].old_ntuples; + tuple_size = + MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len); - /* Store the tuple its new batch. */ - sts_puttuple(hashtable->batches[batchno].inner_tuples, - &hashvalue, tuple); + batch_accessor = &(hashtable->batches[batchno]); + /* + * It is okay to use backend local here because force spill tuple + * is only done during repartitioning when we can't grow batches + * so won't make decision based on it and will merge counters + * during deciding and during evictbatch0 which can ony be done on + * a batch that is already fallback so we won't make decision on + * it and will merge counters after the build phase + */ + batch_accessor->estimated_size += tuple_size; + metadata.hashvalue = hashvalue; + + sts_puttuple(batch_accessor->inner_tuples, + &metadata, + tuple); + ++hashtable->batches[batchno].ntuples; + ++hashtable->batches[i].old_ntuples; CHECK_FOR_INTERRUPTS(); } sts_end_parallel_scan(old_inner_tuples[i]); @@ -1705,7 +1968,7 @@ retry: hashTuple = ExecParallelHashTupleAlloc(hashtable, HJTUPLE_OVERHEAD + tuple->t_len, &shared); - if (hashTuple == NULL) + if (!hashTuple) goto retry; /* Store the hash value in the HashJoinTuple header. */ @@ -1715,10 +1978,13 @@ retry: /* Push it onto the front of the bucket's list */ ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno], hashTuple, shared); + pg_atomic_add_fetch_u64(&hashtable->batches[0].shared->ntuples_in_memory, 1); + } else { size_t tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len); + tupleMetadata metadata; Assert(batchno > 0); @@ -1731,7 +1997,11 @@ retry: Assert(hashtable->batches[batchno].preallocated >= tuple_size); hashtable->batches[batchno].preallocated -= tuple_size; - sts_puttuple(hashtable->batches[batchno].inner_tuples, &hashvalue, + + metadata.hashvalue = hashvalue; + + sts_puttuple(hashtable->batches[batchno].inner_tuples, + &metadata, tuple); } ++hashtable->batches[batchno].ntuples; @@ -1746,10 +2016,11 @@ retry: * to other batches or to run out of memory, and should only be called with * tuples that belong in the current batch once growth has been disabled. 
*/ -void +MinimalTuple ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable, TupleTableSlot *slot, - uint32 hashvalue) + uint32 hashvalue, + int read_participant) { bool shouldFree; MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree); @@ -1758,19 +2029,26 @@ ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable, int batchno; int bucketno; + ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno); Assert(batchno == hashtable->curbatch); + hashTuple = ExecParallelHashTupleAlloc(hashtable, HJTUPLE_OVERHEAD + tuple->t_len, &shared); + if (!hashTuple) + return NULL; + hashTuple->hashvalue = hashvalue; memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len); HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple)); ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno], hashTuple, shared); + pg_atomic_add_fetch_u64(&hashtable->batches[hashtable->curbatch].shared->ntuples_in_memory, 1); if (shouldFree) heap_free_minimal_tuple(tuple); + return tuple; } /* @@ -2602,6 +2880,12 @@ ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt) pcxt->nworkers * sizeof(HashInstrumentation); node->shared_info = (SharedHashInfo *) shm_toc_allocate(pcxt->toc, size); + /* + * TODO: the linked list which is being used for fallback stats needs + * space allocated for it in shared memory as well. For now, it seems to + * be coincidentally working + */ + /* Each per-worker area must start out as zeroes. */ memset(node->shared_info, 0, size); @@ -2701,6 +2985,11 @@ ExecHashAccumInstrumentation(HashInstrumentation *instrument, hashtable->nbatch_original); instrument->space_peak = Max(instrument->space_peak, hashtable->spacePeak); + + /* + * TODO: this doesn't work right now in case of rescan (doesn't get max) + */ + instrument->fallback_batches_stats = hashtable->fallback_batches_stats; } /* @@ -2775,6 +3064,146 @@ dense_alloc(HashJoinTable hashtable, Size size) return ptr; } +/* + * Assume caller has a lock or is behind a barrier and has the right + * to change these values + */ +inline void +ExecParallelHashTableRecycle(HashJoinTable hashtable) +{ + ParallelHashJoinBatchAccessor *batch_accessor = &(hashtable->batches[hashtable->curbatch]); + ParallelHashJoinBatch *batch = batch_accessor->shared; + + dsa_pointer_atomic *buckets = (dsa_pointer_atomic *) + dsa_get_address(hashtable->area, batch->buckets); + + for (size_t i = 0; i < hashtable->nbuckets; ++i) + dsa_pointer_atomic_write(&buckets[i], InvalidDsaPointer); + batch->size = 0; + batch->space_exhausted = false; + + /* + * TODO: I'm not sure that we want to reset this when this function is + * called to recycle the hashtable during the build stage as part of + * evicting batch 0. It seems like it would be okay since a worker does + * not have the right to over-allocate now. So, for a fallback batch, + * at_least_one_chunk doesn't matter It seems like it may not matter at + * all anymore... + */ + batch_accessor->at_least_one_chunk = false; + pg_atomic_exchange_u64(&batch->ntuples_in_memory, 0); +} + +/* + * The eviction phase machine is responsible for evicting tuples from the + * hashtable during the Build stage of executing a parallel-aware parallel + * hash join. After increasing the number of batches in + * ExecParallelHashIncreaseNumBatches(), in the PHJ_GROW_BATCHES_DECIDING + * phase, if the batch 0 hashtable meets the criteria for falling back + * and is marked a fallback batch, the next time an inserted tuple would + * exceed the space_allowed, instead, trigger an eviction. 
Evict all + * batch 0 tuples to spill files in batch 0 inner side SharedTuplestore. + */ +static void +ExecParallelHashTableEvictBatch0(HashJoinTable hashtable) +{ + + ParallelHashJoinState *pstate = hashtable->parallel_state; + ParallelHashJoinBatchAccessor *batch0_accessor = &(hashtable->batches[0]); + + /* + * No other workers must be inserting tuples into the hashtable once + * growth has been set to PHJ_EVICT. Otherwise, the below will not work + * correctly. This should be okay since the same assumptions are made in + * the increase batch machine. + */ + BarrierAttach(&pstate->eviction_barrier); + switch (PHJ_EVICT_PHASE(BarrierPhase(&pstate->eviction_barrier))) + { + case PHJ_EVICT_ELECTING: + if (BarrierArriveAndWait(&pstate->eviction_barrier, WAIT_EVENT_HASH_EVICT_ELECT)) + { + pstate->chunk_work_queue = batch0_accessor->shared->chunks; + batch0_accessor->shared->chunks = InvalidDsaPointer; + ExecParallelHashTableRecycle(hashtable); + } + /* FALLTHROUGH */ + case PHJ_EVICT_RESETTING: + BarrierArriveAndWait(&pstate->eviction_barrier, WAIT_EVENT_HASH_EVICT_RESET); + /* FALLTHROUGH */ + case PHJ_EVICT_SPILLING: + { + dsa_pointer chunk_shared; + HashMemoryChunk chunk; + + /* + * TODO: Do I need to do this here? am I guaranteed to have + * the correct shared memory reference to the batches array + * already? + */ + ParallelHashJoinBatch *batches; + ParallelHashJoinBatch *batch0; + + batches = (ParallelHashJoinBatch *) + dsa_get_address(hashtable->area, pstate->batches); + batch0 = NthParallelHashJoinBatch(batches, 0); + Assert(batch0 == hashtable->batches[0].shared); + + ExecParallelHashTableSetCurrentBatch(hashtable, 0); + + while ((chunk = ExecParallelHashPopChunkQueue(hashtable, &chunk_shared))) + { + size_t idx = 0; + + while (idx < chunk->used) + { + tupleMetadata metadata; + + size_t tuple_size; + MinimalTuple minTuple; + HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx); + + minTuple = HJTUPLE_MINTUPLE(hashTuple); + + tuple_size = + MAXALIGN(HJTUPLE_OVERHEAD + minTuple->t_len); + + /* + * It is okay to use backend local here because can + * ony be done on a batch that is already fallback so + * we won't make decision on it and will merge + * counters after the build phase + */ + batch0_accessor->estimated_size += tuple_size; + metadata.hashvalue = hashTuple->hashvalue; + + sts_puttuple(batch0_accessor->inner_tuples, + &metadata, + minTuple); + + idx += MAXALIGN(HJTUPLE_OVERHEAD + + HJTUPLE_MINTUPLE(hashTuple)->t_len); + } + dsa_free(hashtable->area, chunk_shared); + + CHECK_FOR_INTERRUPTS(); + } + BarrierArriveAndWait(&pstate->eviction_barrier, WAIT_EVENT_HASH_EVICT_SPILL); + } + /* FALLTHROUGH */ + case PHJ_EVICT_FINISHING: + + /* + * TODO: Is this phase needed? + */ + if (BarrierArriveAndWait(&pstate->eviction_barrier, WAIT_EVENT_HASH_EVICT_FINISH)) + pstate->growth = PHJ_GROWTH_OK; + /* FALLTHROUGH */ + case PHJ_EVICT_DONE: + BarrierArriveAndDetach(&pstate->eviction_barrier); + } +} + /* * Allocate space for a tuple in shared dense storage. This is equivalent to * dense_alloc but for Parallel Hash using shared memory. @@ -2787,7 +3216,8 @@ dense_alloc(HashJoinTable hashtable, Size size) * possibility that the tuple no longer belongs in the same batch). 
*/ static HashJoinTuple -ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size, +ExecParallelHashTupleAlloc(HashJoinTable hashtable, + size_t size, dsa_pointer *shared) { ParallelHashJoinState *pstate = hashtable->parallel_state; @@ -2828,7 +3258,8 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size, * Check if we need to help increase the number of buckets or batches. */ if (pstate->growth == PHJ_GROWTH_NEED_MORE_BATCHES || - pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS) + pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS || + pstate->growth == PHJ_GROWTH_SPILL_BATCH0) { ParallelHashGrowth growth = pstate->growth; @@ -2840,6 +3271,8 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size, ExecParallelHashIncreaseNumBatches(hashtable); else if (growth == PHJ_GROWTH_NEED_MORE_BUCKETS) ExecParallelHashIncreaseNumBuckets(hashtable); + else if (growth == PHJ_GROWTH_SPILL_BATCH0) + ExecParallelHashTableEvictBatch0(hashtable); /* The caller must retry. */ return NULL; @@ -2852,7 +3285,7 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size, chunk_size = HASH_CHUNK_SIZE; /* Check if it's time to grow batches or buckets. */ - if (pstate->growth != PHJ_GROWTH_DISABLED) + if (pstate->growth != PHJ_GROWTH_DISABLED && pstate->growth != PHJ_GROWTH_LOADING) { Assert(curbatch == 0); Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASHING_INNER); @@ -2861,16 +3294,26 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size, * Check if our space limit would be exceeded. To avoid choking on * very large tuples or very low hash_mem setting, we'll always allow * each backend to allocate at least one chunk. + * + * If the batch has already been marked to fall back, then we don't + * need to worry about having allocated one chunk -- we should start + * evicting tuples. */ - if (hashtable->batches[0].at_least_one_chunk && - hashtable->batches[0].shared->size + + LWLockAcquire(&hashtable->batches[0].shared->lock, LW_EXCLUSIVE); + if (hashtable->batches[0].shared->size + chunk_size > pstate->space_allowed) { - pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES; - hashtable->batches[0].shared->space_exhausted = true; - LWLockRelease(&pstate->lock); - - return NULL; + if (hashtable->batches[0].shared->hashloop_fallback || hashtable->batches[0].at_least_one_chunk) + { + if (hashtable->batches[0].shared->hashloop_fallback) + pstate->growth = PHJ_GROWTH_SPILL_BATCH0; + else if (hashtable->batches[0].at_least_one_chunk) + pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES; + hashtable->batches[0].shared->space_exhausted = true; + LWLockRelease(&pstate->lock); + LWLockRelease(&hashtable->batches[0].shared->lock); + return NULL; + } } /* Check if our load factor limit would be exceeded. */ @@ -2887,14 +3330,60 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size, { pstate->growth = PHJ_GROWTH_NEED_MORE_BUCKETS; LWLockRelease(&pstate->lock); + LWLockRelease(&hashtable->batches[0].shared->lock); return NULL; } } + LWLockRelease(&hashtable->batches[0].shared->lock); } + /* + * TODO: should I care about hashtable->batches[b].at_least_one_chunk + * here? 
+ */ + if (pstate->growth == PHJ_GROWTH_LOADING) + { + int b = hashtable->curbatch; + + LWLockAcquire(&hashtable->batches[b].shared->lock, LW_EXCLUSIVE); + if (hashtable->batches[b].shared->hashloop_fallback && + (hashtable->batches[b].shared->space_exhausted || + hashtable->batches[b].shared->size + chunk_size > pstate->space_allowed)) + { + bool space_exhausted = hashtable->batches[b].shared->space_exhausted; + + if (!space_exhausted) + hashtable->batches[b].shared->space_exhausted = true; + LWLockRelease(&pstate->lock); + LWLockRelease(&hashtable->batches[b].shared->lock); + return NULL; + } + LWLockRelease(&hashtable->batches[b].shared->lock); + } + + /* + * If not even one chunk would fit in the space_allowed, there isn't + * anything we can do to avoid exceeding space_allowed. Also, if we keep + * the rule that a backend should be allowed to allocate at least one + * chunk, then we will end up tripping this assert some of the time unless + * we make that exception (should we make that exception?) TODO: should + * memory settings < chunk_size even be allowed. Should it error out? + * should we be able to make this assertion? + * Assert(hashtable->batches[hashtable->curbatch].shared->size + + * chunk_size <= pstate->space_allowed); + */ + /* We are cleared to allocate a new chunk. */ chunk_shared = dsa_allocate(hashtable->area, chunk_size); + + /* + * The chunk is accounted for in the hashtable size only. Even though + * batch 0 can spill, we don't need to track this allocated chunk in the + * estimated_stripe_size member because we check the size member when + * determining if the hashtable is too big, and, we will only ever number + * stripes (starting with 1 instead of 0 for batch 0) in the spill file. + */ hashtable->batches[curbatch].shared->size += chunk_size; hashtable->batches[curbatch].at_least_one_chunk = true; @@ -2964,21 +3453,40 @@ ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch) { ParallelHashJoinBatchAccessor *accessor = &hashtable->batches[i]; ParallelHashJoinBatch *shared = NthParallelHashJoinBatch(batches, i); + SharedBits *sbits = ParallelHashJoinBatchOuterBits(shared, pstate->nparticipants); char name[MAXPGPATH]; + char sbname[MAXPGPATH]; + + shared->hashloop_fallback = false; + pg_atomic_init_flag(&shared->overflow_required); + pg_atomic_init_u64(&shared->ntuples_in_memory, 0); + /* TODO: is it okay to use the same tranche for this lock? */ + LWLockInitialize(&shared->lock, LWTRANCHE_PARALLEL_HASH_JOIN); + shared->nstripes = 0; /* * All members of shared were zero-initialized. We just need to set * up the Barrier. */ BarrierInit(&shared->batch_barrier, 0); + BarrierInit(&shared->stripe_barrier, 0); + + /* Batch 0 doesn't need to be loaded. */ if (i == 0) { - /* Batch 0 doesn't need to be loaded. */ + shared->nstripes = 1; BarrierAttach(&shared->batch_barrier); - while (BarrierPhase(&shared->batch_barrier) < PHJ_BATCH_PROBING) + while (BarrierPhase(&shared->batch_barrier) < PHJ_BATCH_STRIPING) BarrierArriveAndWait(&shared->batch_barrier, 0); BarrierDetach(&shared->batch_barrier); + + BarrierAttach(&shared->stripe_barrier); + while (BarrierPhase(&shared->stripe_barrier) < PHJ_STRIPE_PROBING) + BarrierArriveAndWait(&shared->stripe_barrier, 0); + BarrierDetach(&shared->stripe_barrier); } + /* why isn't done initialized here ? */ + accessor->done = PHJ_BATCH_ACCESSOR_NOT_DONE; /* Initialize accessor state. All members were zero-initialized. 
*/ accessor->shared = shared; @@ -2989,7 +3497,7 @@ ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch) sts_initialize(ParallelHashJoinBatchInner(shared), pstate->nparticipants, ParallelWorkerNumber + 1, - sizeof(uint32), + sizeof(tupleMetadata), SHARED_TUPLESTORE_SINGLE_PASS, &pstate->fileset, name); @@ -2999,10 +3507,14 @@ ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch) pstate->nparticipants), pstate->nparticipants, ParallelWorkerNumber + 1, - sizeof(uint32), + sizeof(tupleMetadata), SHARED_TUPLESTORE_SINGLE_PASS, &pstate->fileset, name); + snprintf(sbname, MAXPGPATH, "%s.bitmaps", name); + /* Use the same SharedFileset for the SharedTupleStore and SharedBits */ + accessor->sba = sb_initialize(sbits, pstate->nparticipants, + ParallelWorkerNumber + 1, &pstate->fileset, sbname); } MemoryContextSwitchTo(oldcxt); @@ -3051,8 +3563,8 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable) * It's possible for a backend to start up very late so that the whole * join is finished and the shm state for tracking batches has already * been freed by ExecHashTableDetach(). In that case we'll just leave - * hashtable->batches as NULL so that ExecParallelHashJoinNewBatch() gives - * up early. + * hashtable->batches as NULL so that ExecParallelHashJoinAdvanceBatch() + * gives up early. */ if (!DsaPointerIsValid(pstate->batches)) return; @@ -3074,10 +3586,11 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable) { ParallelHashJoinBatchAccessor *accessor = &hashtable->batches[i]; ParallelHashJoinBatch *shared = NthParallelHashJoinBatch(batches, i); + SharedBits *sbits = ParallelHashJoinBatchOuterBits(shared, pstate->nparticipants); accessor->shared = shared; accessor->preallocated = 0; - accessor->done = false; + accessor->done = PHJ_BATCH_ACCESSOR_NOT_DONE; accessor->inner_tuples = sts_attach(ParallelHashJoinBatchInner(shared), ParallelWorkerNumber + 1, @@ -3087,6 +3600,7 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable) pstate->nparticipants), ParallelWorkerNumber + 1, &pstate->fileset); + accessor->sba = sb_attach(sbits, ParallelWorkerNumber + 1, &pstate->fileset); } MemoryContextSwitchTo(oldcxt); @@ -3169,6 +3683,18 @@ ExecHashTableDetachBatch(HashJoinTable hashtable) } } +bool +ExecHashTableDetachStripe(HashJoinTable hashtable) +{ + int curbatch = hashtable->curbatch; + ParallelHashJoinBatch *batch = hashtable->batches[curbatch].shared; + Barrier *stripe_barrier = &batch->stripe_barrier; + + BarrierDetach(stripe_barrier); + hashtable->curstripe = STRIPE_DETACHED; + return false; +} + /* * Detach from all shared resources. If we are last to detach, clean up. */ @@ -3326,7 +3852,6 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size) ParallelHashJoinBatchAccessor *batch = &hashtable->batches[batchno]; size_t want = Max(size, HASH_CHUNK_SIZE - HASH_CHUNK_HEADER_SIZE); - Assert(batchno > 0); Assert(batchno < hashtable->nbatch); Assert(size == MAXALIGN(size)); @@ -3334,7 +3859,8 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size) /* Has another participant commanded us to help grow? 
*/ if (pstate->growth == PHJ_GROWTH_NEED_MORE_BATCHES || - pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS) + pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS || + pstate->growth == PHJ_GROWTH_SPILL_BATCH0) { ParallelHashGrowth growth = pstate->growth; @@ -3343,18 +3869,21 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size) ExecParallelHashIncreaseNumBatches(hashtable); else if (growth == PHJ_GROWTH_NEED_MORE_BUCKETS) ExecParallelHashIncreaseNumBuckets(hashtable); + else if (growth == PHJ_GROWTH_SPILL_BATCH0) + ExecParallelHashTableEvictBatch0(hashtable); return false; } if (pstate->growth != PHJ_GROWTH_DISABLED && batch->at_least_one_chunk && - (batch->shared->estimated_size + want + HASH_CHUNK_HEADER_SIZE - > pstate->space_allowed)) + (batch->shared->estimated_size + want + HASH_CHUNK_HEADER_SIZE > pstate->space_allowed) && + !batch->shared->hashloop_fallback) { /* * We have determined that this batch would exceed the space budget if - * loaded into memory. Command all participants to help repartition. + * loaded into memory. It is also not yet marked as a fallback batch. + * Command all participants to help repartition. */ batch->shared->space_exhausted = true; pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES; diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 5532b91a71..eb67aceebb 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -92,6 +92,27 @@ * hash_mem of all participants to create a large shared hash table. If that * turns out either at planning or execution time to be impossible then we * fall back to regular hash_mem sized hash tables. + * If a given batch causes the number of batches to be doubled and data skew + * causes too few or too many tuples to be relocated to the child of this batch, + * the batch which is now home to the skewed tuples is marked as a "fallback" + * batch. This means that it will be processed using multiple loops -- + * each loop probing an arbitrary stripe of tuples from this batch + * which fit in hash_mem or combined hash_mem. + * This batch is no longer permitted to cause growth in the number of batches. + * + * When the inner side of a fallback batch is loaded into memory, stripes of + * arbitrary tuples totaling hash_mem or combined hash_mem in size are loaded + * into the hashtable. After probing this stripe, the outer side batch is + * rewound and the next stripe is loaded. Each stripe of the inner batch is + * probed until all tuples from that batch have been processed. + * + * Tuples that match are emitted (depending on the join semantics of the + * particular join type) during probing of the stripe. However, in order to make + * left outer join work, unmatched tuples cannot be emitted NULL-extended until + * all stripes have been probed. To address this, a bitmap is created with a bit + * for each tuple of the outer side. If a tuple on the outer side matches a + * tuple from the inner, the corresponding bit is set. At the end of probing all + * stripes, the executor scans the bitmap and emits unmatched outer tuples. 
* * To avoid deadlocks, we never wait for any barrier unless it is known that * all other backends attached to it are actively executing the node or have @@ -126,7 +147,7 @@ #define HJ_SCAN_BUCKET 3 #define HJ_FILL_OUTER_TUPLE 4 #define HJ_FILL_INNER_TUPLES 5 -#define HJ_NEED_NEW_BATCH 6 +#define HJ_NEED_NEW_STRIPE 6 /* Returns true if doing null-fill on outer relation */ #define HJ_FILL_OUTER(hjstate) ((hjstate)->hj_NullInnerTupleSlot != NULL) @@ -143,10 +164,91 @@ static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate, BufFile *file, uint32 *hashvalue, TupleTableSlot *tupleSlot); +static int ExecHashJoinLoadStripe(HashJoinState *hjstate); static bool ExecHashJoinNewBatch(HashJoinState *hjstate); static bool ExecParallelHashJoinNewBatch(HashJoinState *hjstate); +static bool ExecParallelHashJoinLoadStripe(HashJoinState *hjstate); static void ExecParallelHashJoinPartitionOuter(HashJoinState *node); +static bool checkbit(HashJoinState *hjstate); +static void set_match_bit(HashJoinState *hjstate); + +static pg_attribute_always_inline bool + IsHashloopFallback(HashJoinTable hashtable); + +#define UINT_BITS (sizeof(unsigned int) * CHAR_BIT) + +static void +set_match_bit(HashJoinState *hjstate) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + BufFile *statusFile = hashtable->hashloopBatchFile[hashtable->curbatch]; + int tupindex = hjstate->hj_CurNumOuterTuples - 1; + size_t unit_size = sizeof(hjstate->hj_CurOuterMatchStatus); + off_t offset = tupindex / UINT_BITS * unit_size; + + int fileno; + off_t cursor; + + BufFileTell(statusFile, &fileno, &cursor); + + /* Extend the statusFile if this is stripe zero. */ + if (hashtable->curstripe == 0) + { + for (; cursor < offset + unit_size; cursor += unit_size) + { + hjstate->hj_CurOuterMatchStatus = 0; + BufFileWrite(statusFile, &hjstate->hj_CurOuterMatchStatus, unit_size); + } + } + + if (cursor != offset) + BufFileSeek(statusFile, 0, offset, SEEK_SET); + + BufFileRead(statusFile, &hjstate->hj_CurOuterMatchStatus, unit_size); + BufFileSeek(statusFile, 0, -unit_size, SEEK_CUR); + + hjstate->hj_CurOuterMatchStatus |= 1U << tupindex % UINT_BITS; + BufFileWrite(statusFile, &hjstate->hj_CurOuterMatchStatus, unit_size); +} +/* return true if bit is set and false if not */ +static bool +checkbit(HashJoinState *hjstate) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + int curbatch = hashtable->curbatch; + BufFile *outer_match_statuses; + + int bitno = hjstate->hj_EmitOuterTupleId % UINT_BITS; + + hjstate->hj_EmitOuterTupleId++; + outer_match_statuses = hjstate->hj_HashTable->hashloopBatchFile[curbatch]; + + /* + * if current chunk of bitmap is exhausted, read next chunk of bitmap from + * outer_match_status_file + */ + if (bitno == 0) + BufFileRead(outer_match_statuses, &hjstate->hj_CurOuterMatchStatus, + sizeof(hjstate->hj_CurOuterMatchStatus)); + + /* + * check if current tuple's match bit is set in outer match status file + */ + return hjstate->hj_CurOuterMatchStatus & (1U << bitno); +} + +static bool +IsHashloopFallback(HashJoinTable hashtable) +{ + if (hashtable->parallel_state) + return hashtable->batches[hashtable->curbatch].shared->hashloop_fallback; + + if (!hashtable->hashloopBatchFile) + return false; + + return hashtable->hashloopBatchFile[hashtable->curbatch]; +} /* ---------------------------------------------------------------- * ExecHashJoinImpl @@ -290,6 +392,12 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) hashNode->hashtable = hashtable; (void) MultiExecProcNode((PlanState *) hashNode); + /* + * After 
building the hashtable, stripe 0 of batch 0 will have + * been loaded. + */ + hashtable->curstripe = 0; + /* * If the inner relation is completely empty, and we're not * doing a left outer join, we can quit without scanning the @@ -324,21 +432,21 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) * If multi-batch, we need to hash the outer relation * up front. */ - if (hashtable->nbatch > 1) + if (hashtable->nbatch > 1 || (hashtable->nbatch == 1 && hashtable->batches[0].shared->hashloop_fallback)) ExecParallelHashJoinPartitionOuter(node); BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_HASH_OUTER); + } Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE); /* Each backend should now select a batch to work on. */ hashtable->curbatch = -1; - node->hj_JoinState = HJ_NEED_NEW_BATCH; - continue; + if (!ExecParallelHashJoinNewBatch(node)) + return NULL; } - else - node->hj_JoinState = HJ_NEED_NEW_OUTER; + node->hj_JoinState = HJ_NEED_NEW_OUTER; /* FALL THRU */ @@ -365,12 +473,18 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) node->hj_JoinState = HJ_FILL_INNER_TUPLES; } else - node->hj_JoinState = HJ_NEED_NEW_BATCH; + node->hj_JoinState = HJ_NEED_NEW_STRIPE; continue; } econtext->ecxt_outertuple = outerTupleSlot; - node->hj_MatchedOuter = false; + + /* + * Don't reset hj_MatchedOuter after the first stripe as it + * would cancel out whatever we found before + */ + if (node->hj_HashTable->curstripe == 0) + node->hj_MatchedOuter = false; /* * Find the corresponding bucket for this tuple in the main @@ -386,9 +500,15 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) /* * The tuple might not belong to the current batch (where * "current batch" includes the skew buckets if any). + * + * This should only be done once per tuple per batch. If a + * batch "falls back", its inner side will be split into + * stripes. Any displaced outer tuples should only be + * relocated while probing the first stripe of the inner side. */ if (batchno != hashtable->curbatch && - node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO) + node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO && + node->hj_HashTable->curstripe == 0) { bool shouldFree; MinimalTuple mintuple = ExecFetchSlotMinimalTuple(outerTupleSlot, @@ -410,6 +530,13 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) continue; } + /* + * While probing the phantom stripe, don't increment + * hj_CurNumOuterTuples or extend the bitmap + */ + if (!parallel && hashtable->curstripe != PHANTOM_STRIPE) + node->hj_CurNumOuterTuples++; + /* OK, let's scan the bucket for matches */ node->hj_JoinState = HJ_SCAN_BUCKET; @@ -455,6 +582,25 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) { node->hj_MatchedOuter = true; + if (HJ_FILL_OUTER(node) && IsHashloopFallback(hashtable)) + { + /* + * Each bit corresponds to a single tuple. Setting the + * match bit keeps track of which tuples were matched + * for batches which are using the block nested + * hashloop fallback method. It persists this match + * status across multiple stripes of tuples, each of + * which is loaded into the hashtable and probed. The + * outer match status file is the cumulative match + * status of outer tuples for a given batch across all + * stripes of that inner side batch. + */ + if (parallel) + sb_setbit(hashtable->batches[hashtable->curbatch].sba, econtext->ecxt_outertuple->tts_tuplenum); + else + set_match_bit(node); + } + if (parallel) { /* @@ -488,8 +634,17 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) * continue with next outer tuple. 
*/ if (node->js.single_match) + { node->hj_JoinState = HJ_NEED_NEW_OUTER; + /* + * Only consider returning the tuple while on the + * first stripe. + */ + if (node->hj_HashTable->curstripe != 0) + continue; + } + if (otherqual == NULL || ExecQual(otherqual, econtext)) return ExecProject(node->js.ps.ps_ProjInfo); else @@ -508,6 +663,22 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) */ node->hj_JoinState = HJ_NEED_NEW_OUTER; + if (IsHashloopFallback(hashtable) && HJ_FILL_OUTER(node)) + { + if (hashtable->curstripe != PHANTOM_STRIPE) + continue; + + if (parallel) + { + ParallelHashJoinBatchAccessor *accessor = + &node->hj_HashTable->batches[node->hj_HashTable->curbatch]; + + node->hj_MatchedOuter = sb_checkbit(accessor->sba, econtext->ecxt_outertuple->tts_tuplenum); + } + else + node->hj_MatchedOuter = checkbit(node); + } + if (!node->hj_MatchedOuter && HJ_FILL_OUTER(node)) { @@ -534,7 +705,7 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) if (!ExecScanHashTableForUnmatched(node, econtext)) { /* no more unmatched tuples */ - node->hj_JoinState = HJ_NEED_NEW_BATCH; + node->hj_JoinState = HJ_NEED_NEW_STRIPE; continue; } @@ -550,19 +721,23 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) InstrCountFiltered2(node, 1); break; - case HJ_NEED_NEW_BATCH: + case HJ_NEED_NEW_STRIPE: /* - * Try to advance to next batch. Done if there are no more. + * Try to advance to next stripe. Then try to advance to the + * next batch if there are no more stripes in this batch. Done + * if there are no more batches. */ if (parallel) { - if (!ExecParallelHashJoinNewBatch(node)) + if (!ExecParallelHashJoinLoadStripe(node) && + !ExecParallelHashJoinNewBatch(node)) return NULL; /* end of parallel-aware join */ } else { - if (!ExecHashJoinNewBatch(node)) + if (!ExecHashJoinLoadStripe(node) && + !ExecHashJoinNewBatch(node)) return NULL; /* end of parallel-oblivious join */ } node->hj_JoinState = HJ_NEED_NEW_OUTER; @@ -751,6 +926,8 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) hjstate->hj_JoinState = HJ_BUILD_HASHTABLE; hjstate->hj_MatchedOuter = false; hjstate->hj_OuterNotEmpty = false; + hjstate->hj_CurNumOuterTuples = 0; + hjstate->hj_CurOuterMatchStatus = 0; return hjstate; } @@ -890,10 +1067,16 @@ ExecParallelHashJoinOuterGetTuple(PlanState *outerNode, /* * In the Parallel Hash case we only run the outer plan directly for * single-batch hash joins. Otherwise we have to go to batch files, even - * for batch 0. + * for batch 0. For a single-batch hash join which, due to data skew, has + * multiple stripes and is a "fallback" batch, we must still save the + * outer tuples into batch files. 
*/ - if (curbatch == 0 && hashtable->nbatch == 1) + LWLockAcquire(&hashtable->batches[0].shared->lock, LW_SHARED); + + if (curbatch == 0 && hashtable->nbatch == 1 && !hashtable->batches[0].shared->hashloop_fallback) { + LWLockRelease(&hashtable->batches[0].shared->lock); + slot = ExecProcNode(outerNode); while (!TupIsNull(slot)) @@ -917,21 +1100,36 @@ ExecParallelHashJoinOuterGetTuple(PlanState *outerNode, } else if (curbatch < hashtable->nbatch) { + + tupleMetadata metadata; MinimalTuple tuple; - tuple = sts_parallel_scan_next(hashtable->batches[curbatch].outer_tuples, - hashvalue); + LWLockRelease(&hashtable->batches[0].shared->lock); + + tuple = + sts_parallel_scan_next(hashtable->batches[curbatch].outer_tuples, + &metadata); + *hashvalue = metadata.hashvalue; + if (tuple != NULL) { ExecForceStoreMinimalTuple(tuple, hjstate->hj_OuterTupleSlot, false); + + /* + * TODO: should we use tupleid instead of position in the serial + * case too? + */ + hjstate->hj_OuterTupleSlot->tts_tuplenum = metadata.tupleid; slot = hjstate->hj_OuterTupleSlot; return slot; } else ExecClearTuple(hjstate->hj_OuterTupleSlot); } + else + LWLockRelease(&hashtable->batches[0].shared->lock); /* End of this batch */ return NULL; @@ -949,24 +1147,37 @@ ExecHashJoinNewBatch(HashJoinState *hjstate) HashJoinTable hashtable = hjstate->hj_HashTable; int nbatch; int curbatch; - BufFile *innerFile; - TupleTableSlot *slot; - uint32 hashvalue; + BufFile *innerFile = NULL; + BufFile *outerFile = NULL; nbatch = hashtable->nbatch; curbatch = hashtable->curbatch; - if (curbatch > 0) + /* + * We no longer need the previous outer batch file; close it right away to + * free disk space. + */ + if (hashtable->outerBatchFile && hashtable->outerBatchFile[curbatch]) { - /* - * We no longer need the previous outer batch file; close it right - * away to free disk space. 
- */ - if (hashtable->outerBatchFile[curbatch]) - BufFileClose(hashtable->outerBatchFile[curbatch]); + BufFileClose(hashtable->outerBatchFile[curbatch]); hashtable->outerBatchFile[curbatch] = NULL; } - else /* we just finished the first batch */ + if (IsHashloopFallback(hashtable)) + { + BufFileClose(hashtable->hashloopBatchFile[curbatch]); + hashtable->hashloopBatchFile[curbatch] = NULL; + } + + /* + * We are surely done with the inner batch file now + */ + if (hashtable->innerBatchFile && hashtable->innerBatchFile[curbatch]) + { + BufFileClose(hashtable->innerBatchFile[curbatch]); + hashtable->innerBatchFile[curbatch] = NULL; + } + + if (curbatch == 0) /* we just finished the first batch */ { /* * Reset some of the skew optimization state variables, since we no @@ -1030,55 +1241,168 @@ ExecHashJoinNewBatch(HashJoinState *hjstate) return false; /* no more batches */ hashtable->curbatch = curbatch; + hashtable->curstripe = STRIPE_DETACHED; + hjstate->hj_CurNumOuterTuples = 0; - /* - * Reload the hash table with the new inner batch (which could be empty) - */ - ExecHashTableReset(hashtable); + if (hashtable->innerBatchFile && hashtable->innerBatchFile[curbatch]) + innerFile = hashtable->innerBatchFile[curbatch]; + + if (innerFile && BufFileSeek(innerFile, 0, 0L, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind hash-join temporary file: %m"))); + + /* Need to rewind outer when this is the first stripe of a new batch */ + if (hashtable->outerBatchFile && hashtable->outerBatchFile[curbatch]) + outerFile = hashtable->outerBatchFile[curbatch]; + + if (outerFile && BufFileSeek(outerFile, 0, 0L, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind hash-join temporary file: %m"))); + + ExecHashJoinLoadStripe(hjstate); + return true; +} - innerFile = hashtable->innerBatchFile[curbatch]; +static inline void +InstrIncrBatchStripes(List *fallback_batches_stats, int curbatch) +{ + ListCell *lc; - if (innerFile != NULL) + foreach(lc, fallback_batches_stats) { - if (BufFileSeek(innerFile, 0, 0L, SEEK_SET)) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not rewind hash-join temporary file"))); + FallbackBatchStats *fallback_batch_stats = lfirst(lc); - while ((slot = ExecHashJoinGetSavedTuple(hjstate, - innerFile, - &hashvalue, - hjstate->hj_HashTupleSlot))) + if (fallback_batch_stats->batchno == curbatch) { - /* - * NOTE: some tuples may be sent to future batches. Also, it is - * possible for hashtable->nbatch to be increased here! 
- */ - ExecHashTableInsert(hashtable, slot, hashvalue); + fallback_batch_stats->numstripes++; + break; } - - /* - * after we build the hash table, the inner batch file is no longer - * needed - */ - BufFileClose(innerFile); - hashtable->innerBatchFile[curbatch] = NULL; } +} + +static inline void +InstrAppendParallelBatchStripes(List **fallback_batches_stats, int curbatch, int nstripes) +{ + FallbackBatchStats *fallback_batch_stats; + + fallback_batch_stats = palloc(sizeof(FallbackBatchStats)); + fallback_batch_stats->batchno = curbatch; + /* Display the total number of stripes as a 1-indexed number */ + fallback_batch_stats->numstripes = nstripes + 1; + *fallback_batches_stats = lappend(*fallback_batches_stats, fallback_batch_stats); +} + +/* + * Returns false when the inner batch file is exhausted + */ +static int +ExecHashJoinLoadStripe(HashJoinState *hjstate) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + int curbatch = hashtable->curbatch; + TupleTableSlot *slot; + uint32 hashvalue; + bool loaded_inner = false; + + if (hashtable->curstripe == PHANTOM_STRIPE) + return false; /* * Rewind outer batch file (if present), so that we can start reading it. + * TODO: This is only necessary if this is not the first stripe of the + * batch */ - if (hashtable->outerBatchFile[curbatch] != NULL) + if (hashtable->outerBatchFile && hashtable->outerBatchFile[curbatch]) { if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET)) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not rewind hash-join temporary file"))); + errmsg("could not rewind hash-join temporary file: %m"))); + } + if (hashtable->innerBatchFile && hashtable->innerBatchFile[curbatch] && hashtable->curbatch == 0 && hashtable->curstripe == 0) + { + if (BufFileSeek(hashtable->innerBatchFile[curbatch], 0, 0L, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind hash-join temporary file: %m"))); } - return true; + hashtable->curstripe++; + + if (!hashtable->innerBatchFile || !hashtable->innerBatchFile[curbatch]) + return false; + + /* + * Reload the hash table with the new inner stripe + */ + ExecHashTableReset(hashtable); + + while ((slot = ExecHashJoinGetSavedTuple(hjstate, + hashtable->innerBatchFile[curbatch], + &hashvalue, + hjstate->hj_HashTupleSlot))) + { + /* + * NOTE: some tuples may be sent to future batches. Also, it is + * possible for hashtable->nbatch to be increased here! + */ + uint32 hashTupleSize; + + /* + * TODO: wouldn't it be cool if this returned the size of the tuple + * inserted + */ + ExecHashTableInsert(hashtable, slot, hashvalue); + loaded_inner = true; + + if (!IsHashloopFallback(hashtable)) + continue; + + hashTupleSize = slot->tts_ops->get_minimal_tuple(slot)->t_len + HJTUPLE_OVERHEAD; + + if (hashtable->spaceUsed + hashTupleSize + + hashtable->nbuckets_optimal * sizeof(HashJoinTuple) + > hashtable->spaceAllowed) + break; + } + + /* + * if we didn't load anything and it is a FOJ/LOJ fallback batch, we will + * transition to emit unmatched outer tuples next. 
we want to know how + * many tuples were in the batch in that case, so don't zero it out then + */ + + /* + * if we loaded anything into the hashtable or it is the phantom stripe, + * must proceed to probing + */ + if (loaded_inner) + { + hjstate->hj_CurNumOuterTuples = 0; + InstrIncrBatchStripes(hashtable->fallback_batches_stats, curbatch); + return true; + } + + if (IsHashloopFallback(hashtable) && HJ_FILL_OUTER(hjstate)) + { + /* + * if we didn't load anything and it is a fallback batch, we will + * prepare to emit outer tuples during the phantom stripe probing + */ + hashtable->curstripe = PHANTOM_STRIPE; + hjstate->hj_EmitOuterTupleId = 0; + hjstate->hj_CurOuterMatchStatus = 0; + BufFileSeek(hashtable->hashloopBatchFile[curbatch], 0, 0, SEEK_SET); + if (hashtable->outerBatchFile[curbatch]) + BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET); + return true; + } + return false; } + /* * Choose a batch to work on, and attach to it. Returns true if successful, * false if there are no more batches. @@ -1101,11 +1425,24 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate) /* * If we were already attached to a batch, remember not to bother checking * it again, and detach from it (possibly freeing the hash table if we are - * last to detach). + * last to detach). curbatch is set when the batch_barrier phase is either + * PHJ_BATCH_LOADING or PHJ_BATCH_STRIPING (note that the + * PHJ_BATCH_LOADING case will fall through to the PHJ_BATCH_STRIPING + * case). The PHJ_BATCH_STRIPING case returns to the caller. So when this + * function is reentered with a curbatch >= 0 then we must be done + * probing. */ + if (hashtable->curbatch >= 0) { - hashtable->batches[hashtable->curbatch].done = true; + ParallelHashJoinBatchAccessor *batch_accessor = &hashtable->batches[hashtable->curbatch]; + + if (IsHashloopFallback(hashtable)) + { + InstrAppendParallelBatchStripes(&hashtable->fallback_batches_stats, hashtable->curbatch, batch_accessor->shared->nstripes); + sb_end_write(hashtable->batches[hashtable->curbatch].sba); + } + batch_accessor->done = PHJ_BATCH_ACCESSOR_DONE; ExecHashTableDetachBatch(hashtable); } @@ -1119,13 +1456,8 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate) hashtable->nbatch; do { - uint32 hashvalue; - MinimalTuple tuple; - TupleTableSlot *slot; - - if (!hashtable->batches[batchno].done) + if (hashtable->batches[batchno].done != PHJ_BATCH_ACCESSOR_DONE) { - SharedTuplestoreAccessor *inner_tuples; Barrier *batch_barrier = &hashtable->batches[batchno].shared->batch_barrier; @@ -1136,7 +1468,15 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate) /* One backend allocates the hash table. */ if (BarrierArriveAndWait(batch_barrier, WAIT_EVENT_HASH_BATCH_ELECT)) + { ExecParallelHashTableAlloc(hashtable, batchno); + + /* + * one worker needs to 0 out the read_pages of all the + * participants in the new batch + */ + sts_reinitialize(hashtable->batches[batchno].inner_tuples); + } /* Fall through. */ case PHJ_BATCH_ALLOCATING: @@ -1145,41 +1485,31 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate) WAIT_EVENT_HASH_BATCH_ALLOCATE); /* Fall through. */ - case PHJ_BATCH_LOADING: - /* Start (or join in) loading tuples. 
*/ - ExecParallelHashTableSetCurrentBatch(hashtable, batchno); - inner_tuples = hashtable->batches[batchno].inner_tuples; - sts_begin_parallel_scan(inner_tuples); - while ((tuple = sts_parallel_scan_next(inner_tuples, - &hashvalue))) - { - ExecForceStoreMinimalTuple(tuple, - hjstate->hj_HashTupleSlot, - false); - slot = hjstate->hj_HashTupleSlot; - ExecParallelHashTableInsertCurrentBatch(hashtable, slot, - hashvalue); - } - sts_end_parallel_scan(inner_tuples); - BarrierArriveAndWait(batch_barrier, - WAIT_EVENT_HASH_BATCH_LOAD); - /* Fall through. */ + case PHJ_BATCH_STRIPING: - case PHJ_BATCH_PROBING: + ExecParallelHashTableSetCurrentBatch(hashtable, batchno); + sts_begin_parallel_scan(hashtable->batches[batchno].inner_tuples); + if (hashtable->batches[batchno].shared->hashloop_fallback) + sb_initialize_accessor(hashtable->batches[hashtable->curbatch].sba, + sts_get_tuplenum(hashtable->batches[hashtable->curbatch].outer_tuples)); + hashtable->curstripe = STRIPE_DETACHED; + if (ExecParallelHashJoinLoadStripe(hjstate)) + return true; /* - * This batch is ready to probe. Return control to - * caller. We stay attached to batch_barrier so that the - * hash table stays alive until everyone's finished - * probing it, but no participant is allowed to wait at - * this barrier again (or else a deadlock could occur). - * All attached participants must eventually call - * BarrierArriveAndDetach() so that the final phase - * PHJ_BATCH_DONE can be reached. + * ExecParallelHashJoinLoadStripe() will return false from + * here when no more work can be done by this worker on + * this batch. Until further optimized, this worker will + * have detached from the stripe_barrier and should close + * its outer match statuses bitmap and then detach from + * the batch. In order to reuse the code below, fall + * through, even though the phase will not have been + * advanced */ - ExecParallelHashTableSetCurrentBatch(hashtable, batchno); - sts_begin_parallel_scan(hashtable->batches[batchno].outer_tuples); - return true; + if (hashtable->batches[batchno].shared->hashloop_fallback) + sb_end_write(hashtable->batches[batchno].sba); + + /* Fall through. */ case PHJ_BATCH_DONE: @@ -1187,8 +1517,16 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate) * Already done. Detach and go around again (if any * remain). */ + + /* + * In case the leader joins late, we have to make sure + * that all workers have the final number of stripes. 
+ */ + if (hashtable->batches[batchno].shared->hashloop_fallback) + InstrAppendParallelBatchStripes(&hashtable->fallback_batches_stats, batchno, hashtable->batches[batchno].shared->nstripes); BarrierDetach(batch_barrier); - hashtable->batches[batchno].done = true; + hashtable->batches[batchno].done = PHJ_BATCH_ACCESSOR_DONE; + hashtable->curbatch = -1; break; @@ -1203,6 +1541,244 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate) return false; } + + +/* + * Returns true if ready to probe and false if the inner is exhausted + * (there are no more stripes) + */ +bool +ExecParallelHashJoinLoadStripe(HashJoinState *hjstate) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + int batchno = hashtable->curbatch; + ParallelHashJoinBatchAccessor *batch_accessor = &(hashtable->batches[batchno]); + ParallelHashJoinBatch *batch = batch_accessor->shared; + Barrier *stripe_barrier = &batch->stripe_barrier; + SharedTuplestoreAccessor *outer_tuples; + SharedTuplestoreAccessor *inner_tuples; + + outer_tuples = hashtable->batches[batchno].outer_tuples; + inner_tuples = hashtable->batches[batchno].inner_tuples; + + if (hashtable->curstripe >= 0) + { + /* + * If a worker is already attached to a stripe, wait until all + * participants have finished probing and detach. The last worker, + * however, can re-attach to the stripe_barrier and proceed to load + * and probe the other stripes + * + * After finishing with participating in a stripe, if a worker is the + * only one working on a batch, it will continue working on it. + * However, if a worker is not the only worker working on a batch, it + * would risk deadlock if it waits on the barrier. Instead, it will + * detach from the stripe, and, eventually the batch. + * + * This means all stripes after the first stripe will be executed + * serially. TODO: allow workers to provisionally detach from the + * batch and reattach later if there is still work to be done. I had a + * patch that did this. Workers who were not the last worker saved the + * state of the stripe barrier upon detaching and then mark the batch + * as "provisionally" done (not done). Later, when the worker comes + * back to the batch in the batch phase machine, if the batch is not + * complete and the phase has advanced since the worker was last + * participating, then the worker can join back in. This had problems. + * There were synchronization issues with workers having multiple + * outer match status bitmap files open at the same time, so, I had + * workers close their bitmap and make a new one the next time they + * joined in. This didn't work with the current code because the + * original outer match status bitmap file that the worker had created + * while probing stripe 1 did not get combined into the combined + * bitmap This could be specifically fixed, but I think it is better + * to address the lack of parallel execution for stripes after stripe + * 0 more holistically. + */ + if (!BarrierArriveAndDetach(stripe_barrier)) + { + sb_end_write(batch_accessor->sba); + hashtable->curstripe = STRIPE_DETACHED; + return false; + } + + /* + * This isn't a race condition if no other workers can stay attached + * to this barrier in the intervening time. Basically, if you attach + * to a stripe barrier in the PHJ_STRIPE_DONE phase, detach + * immediately and move on. 
+ */ + BarrierAttach(stripe_barrier); + } + else if (hashtable->curstripe == STRIPE_DETACHED) + { + int phase = BarrierAttach(stripe_barrier); + + /* + * If a worker enters this phase machine for the first time for this + * batch on a stripe number greater than the batch's maximum stripe + * number, then: 1) The batch is done, or 2) The batch is on the + * phantom stripe that's used for hashloop fallback. Either way the + * worker can't contribute, so it will just detach and move on. + */ + if (PHJ_STRIPE_NUMBER(phase) > batch->nstripes || + PHJ_STRIPE_PHASE(phase) == PHJ_STRIPE_DONE) + return ExecHashTableDetachStripe(hashtable); + } + else if (hashtable->curstripe == PHANTOM_STRIPE) + { + /* Only the last worker will execute this code. */ + sts_end_parallel_scan(outer_tuples); + + /* + * TODO: ideally this would go somewhere in the batch phase machine + * Putting it in ExecHashTableDetachBatch didn't do the trick + */ + sb_end_read(batch_accessor->sba); + return ExecHashTableDetachStripe(hashtable); + } + + hashtable->curstripe = PHJ_STRIPE_NUMBER(BarrierPhase(stripe_barrier)); + + /* + * The outer side is exhausted and either 1) the current stripe of the + * inner side is exhausted and it is time to advance the stripe 2) the + * last stripe of the inner side is exhausted and it is time to advance + * the batch + */ + for (;;) + { + MinimalTuple tuple; + tupleMetadata metadata; + + bool overflow_required = false; + int phase = BarrierPhase(stripe_barrier); + + switch (PHJ_STRIPE_PHASE(phase)) + { + case PHJ_STRIPE_ELECTING: + if (BarrierArriveAndWait(stripe_barrier, WAIT_EVENT_HASH_STRIPE_ELECT)) + sts_reinitialize(outer_tuples); + /* FALLTHROUGH */ + case PHJ_STRIPE_RESETTING: + + /* + * This barrier allows the elected worker to finish resetting + * the read_page for the outer side as well as allowing the + * worker which was elected to clear out the hashtable from + * the last stripe to finish. + */ + BarrierArriveAndWait(stripe_barrier, WAIT_EVENT_HASH_STRIPE_RESET); + /* FALLTHROUGH */ + case PHJ_STRIPE_LOADING: + + /* + * Start (or join in) loading the next stripe of inner tuples. 
+ */ + sts_begin_parallel_scan(inner_tuples); + + /* + * TODO: add functionality to pre-alloc some memory before + * calling sts_parallel_scan_next() because that will reserve + * an additional STS_CHUNK for every stripe for each worker + * that won't fit, so we should first see if the chunk would + * fit before getting the assignment + */ + while ((tuple = sts_parallel_scan_next(inner_tuples, &metadata))) + { + ExecForceStoreMinimalTuple(tuple, hjstate->hj_HashTupleSlot, false); + if (!ExecParallelHashTableInsertCurrentBatch(hashtable, hjstate->hj_HashTupleSlot, metadata.hashvalue, sta_get_read_participant(inner_tuples))) + { + overflow_required = true; + pg_atomic_test_set_flag(&batch->overflow_required); + break; + } + } + + if (BarrierArriveAndWait(stripe_barrier, WAIT_EVENT_HASH_STRIPE_LOAD)) + { + if (!pg_atomic_unlocked_test_flag(&batch->overflow_required)) + batch->nstripes++; + } + /* FALLTHROUGH */ + case PHJ_STRIPE_OVERFLOWING: + if (overflow_required) + { + Assert(tuple); + sts_spill_leftover_tuples(inner_tuples, tuple, metadata.hashvalue); + } + BarrierArriveAndWait(stripe_barrier, WAIT_EVENT_HASH_STRIPE_OVERFLOW); + + /* FALLTHROUGH */ + case PHJ_STRIPE_PROBING: + { + /* + * do this again here in case a worker began the scan and + * then entered after loading before probing + */ + sts_end_parallel_scan(inner_tuples); + sts_begin_parallel_scan(outer_tuples); + return true; + } + + case PHJ_STRIPE_DONE: + if (PHJ_STRIPE_NUMBER(phase) >= batch->nstripes) + { + /* + * Handle the phantom stripe case. + */ + if (batch->hashloop_fallback && HJ_FILL_OUTER(hjstate)) + goto fallback_stripe; + + /* Return if this is the last stripe */ + return ExecHashTableDetachStripe(hashtable); + } + + /* this, effectively, increments the stripe number */ + if (BarrierArriveAndWait(stripe_barrier, WAIT_EVENT_HASH_STRIPE_LOAD)) + { + ExecParallelHashTableRecycle(hashtable); + pg_atomic_clear_flag(&batch->overflow_required); + } + + hashtable->curstripe++; + continue; + + default: + elog(ERROR, "unexpected stripe phase %d. pid %i. batch %i.", BarrierPhase(stripe_barrier), MyProcPid, batchno); + } + } + +fallback_stripe: + sb_end_write(batch_accessor->sba); + + /* Ensure that only a single worker is attached to the barrier */ + if (!BarrierArriveAndWait(stripe_barrier, WAIT_EVENT_HASH_STRIPE_LOAD)) + return ExecHashTableDetachStripe(hashtable); + + /* No one except the last worker will run this code */ + hashtable->curstripe = PHANTOM_STRIPE; + + ExecParallelHashTableRecycle(hashtable); + pg_atomic_clear_flag(&batch->overflow_required); + + /* + * If all workers (including this one) have finished probing the batch, + * one worker is elected to Loop through the outer match status files from + * all workers that were attached to this batch Combine them into one + * bitmap Use the bitmap, loop through the outer batch file again, and + * emit unmatched tuples All workers will detach from the batch barrier + * and the last worker will clean up the hashtable. All workers except the + * last worker will end their scans of the outer and inner side. The last + * worker will end its scan of the inner side + */ + sb_combine(batch_accessor->sba); + sts_reinitialize(outer_tuples); + + sts_begin_parallel_scan(outer_tuples); + + return true; +} + /* * ExecHashJoinSaveTuple * save a tuple to a batch file. 
@@ -1364,6 +1940,9 @@ ExecReScanHashJoin(HashJoinState *node) node->hj_MatchedOuter = false; node->hj_FirstOuterTupleSlot = NULL; + node->hj_CurNumOuterTuples = 0; + node->hj_CurOuterMatchStatus = 0; + /* * if chgParam of subnode is not null then plan will be re-scanned by * first ExecProcNode. @@ -1394,7 +1973,6 @@ ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate) ExprContext *econtext = hjstate->js.ps.ps_ExprContext; HashJoinTable hashtable = hjstate->hj_HashTable; TupleTableSlot *slot; - uint32 hashvalue; int i; Assert(hjstate->hj_FirstOuterTupleSlot == NULL); @@ -1402,6 +1980,8 @@ ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate) /* Execute outer plan, writing all tuples to shared tuplestores. */ for (;;) { + tupleMetadata metadata; + slot = ExecProcNode(outerState); if (TupIsNull(slot)) break; @@ -1410,17 +1990,25 @@ ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate) hjstate->hj_OuterHashKeys, true, /* outer tuple */ HJ_FILL_OUTER(hjstate), - &hashvalue)) + &metadata.hashvalue)) { int batchno; int bucketno; bool shouldFree; + SharedTuplestoreAccessor *accessor; + MinimalTuple mintup = ExecFetchSlotMinimalTuple(slot, &shouldFree); - ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, + ExecHashGetBucketAndBatch(hashtable, metadata.hashvalue, &bucketno, &batchno); + accessor = hashtable->batches[batchno].outer_tuples; + + /* cannot count on deterministic order of tupleids */ + metadata.tupleid = sts_increment_ntuples(accessor); + sts_puttuple(hashtable->batches[batchno].outer_tuples, - &hashvalue, mintup); + &metadata.hashvalue, + mintup); if (shouldFree) heap_free_minimal_tuple(mintup); @@ -1481,6 +2069,8 @@ ExecHashJoinInitializeDSM(HashJoinState *state, ParallelContext *pcxt) LWLockInitialize(&pstate->lock, LWTRANCHE_PARALLEL_HASH_JOIN); BarrierInit(&pstate->build_barrier, 0); + BarrierInit(&pstate->eviction_barrier, 0); + BarrierInit(&pstate->repartition_barrier, 0); BarrierInit(&pstate->grow_batches_barrier, 0); BarrierInit(&pstate->grow_buckets_barrier, 0); diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 8116b23614..e6643ad66c 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3779,8 +3779,20 @@ pgstat_get_wait_ipc(WaitEventIPC w) case WAIT_EVENT_HASH_BATCH_ELECT: event_name = "HashBatchElect"; break; - case WAIT_EVENT_HASH_BATCH_LOAD: - event_name = "HashBatchLoad"; + case WAIT_EVENT_HASH_STRIPE_ELECT: + event_name = "HashStripeElect"; + break; + case WAIT_EVENT_HASH_STRIPE_RESET: + event_name = "HashStripeReset"; + break; + case WAIT_EVENT_HASH_STRIPE_LOAD: + event_name = "HashStripeLoad"; + break; + case WAIT_EVENT_HASH_STRIPE_OVERFLOW: + event_name = "HashStripeOverflow"; + break; + case WAIT_EVENT_HASH_STRIPE_PROBE: + event_name = "HashStripeProbe"; break; case WAIT_EVENT_HASH_BUILD_ALLOCATE: event_name = "HashBuildAllocate"; @@ -3794,6 +3806,21 @@ pgstat_get_wait_ipc(WaitEventIPC w) case WAIT_EVENT_HASH_BUILD_HASH_OUTER: event_name = "HashBuildHashOuter"; break; + case WAIT_EVENT_HASH_EVICT_ELECT: + event_name = "HashEvictElect"; + break; + case WAIT_EVENT_HASH_EVICT_RESET: + event_name = "HashEvictReset"; + break; + case WAIT_EVENT_HASH_EVICT_SPILL: + event_name = "HashEvictSpill"; + break; + case WAIT_EVENT_HASH_EVICT_FINISH: + event_name = "HashEvictFinish"; + break; + case WAIT_EVENT_HASH_REPARTITION_BATCH0_DRAIN_QUEUE: + event_name = "HashRepartitionBatch0DrainQueue"; + break; case WAIT_EVENT_HASH_GROW_BATCHES_ALLOCATE: event_name = "HashGrowBatchesAllocate"; break; 
diff --git a/src/backend/utils/sort/Makefile b/src/backend/utils/sort/Makefile index 7ac3659261..f11fe85aeb 100644 --- a/src/backend/utils/sort/Makefile +++ b/src/backend/utils/sort/Makefile @@ -16,6 +16,7 @@ override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS) OBJS = \ logtape.o \ + sharedbits.o \ sharedtuplestore.o \ sortsupport.o \ tuplesort.o \ diff --git a/src/backend/utils/sort/sharedbits.c b/src/backend/utils/sort/sharedbits.c new file mode 100644 index 0000000000..be7000b08c --- /dev/null +++ b/src/backend/utils/sort/sharedbits.c @@ -0,0 +1,288 @@ +#include "postgres.h" + +#include + +#include "storage/buffile.h" +#include "utils/sharedbits.h" + +/* + * TODO: put a comment about not currently supporting parallel scan of the SharedBits + * To support parallel scan, need to introduce many more mechanisms + */ + +/* Per-participant shared state */ +struct SharedBitsParticipant +{ + bool present; + bool writing; +}; + +/* Shared control object */ +struct SharedBits +{ + int nparticipants; /* Number of participants that can write. */ + int64 nbits; + char name[NAMEDATALEN]; /* A name for this bitstore. */ + + SharedBitsParticipant participants[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* backend-local state */ +struct SharedBitsAccessor +{ + int participant; + SharedBits *bits; + SharedFileSet *fileset; + BufFile *write_file; + BufFile *combined; +}; + +SharedBitsAccessor * +sb_attach(SharedBits *sbits, int my_participant_number, SharedFileSet *fileset) +{ + SharedBitsAccessor *accessor = palloc0(sizeof(SharedBitsAccessor)); + + accessor->participant = my_participant_number; + accessor->bits = sbits; + accessor->fileset = fileset; + accessor->write_file = NULL; + accessor->combined = NULL; + return accessor; +} + +SharedBitsAccessor * +sb_initialize(SharedBits *sbits, + int participants, + int my_participant_number, + SharedFileSet *fileset, + char *name) +{ + SharedBitsAccessor *accessor; + + sbits->nparticipants = participants; + strcpy(sbits->name, name); + sbits->nbits = 0; /* TODO: maybe delete this */ + + accessor = palloc0(sizeof(SharedBitsAccessor)); + accessor->participant = my_participant_number; + accessor->bits = sbits; + accessor->fileset = fileset; + accessor->write_file = NULL; + accessor->combined = NULL; + return accessor; +} + +/* TODO: is "initialize_accessor" a clear enough API for this? (making the file)? */ +void +sb_initialize_accessor(SharedBitsAccessor *accessor, uint32 nbits) +{ + char name[MAXPGPATH]; + uint32 num_to_write; + + snprintf(name, MAXPGPATH, "%s.p%d.bitmap", accessor->bits->name, accessor->participant); + + accessor->write_file = + BufFileCreateShared(accessor->fileset, name); + + accessor->bits->participants[accessor->participant].present = true; + /* TODO: check this math. tuplenumber will be too high? */ + num_to_write = nbits / 8 + 1; + + /* + * TODO: add tests that could exercise a problem with junk being written + * to bitmap + */ + + /* + * TODO: is there a better way to write the bytes to the file without + * calling BufFileWrite() like this? 
palloc()ing an undetermined number of + * bytes feels like it is against the spirit of this patch to begin with, + * but the many function calls seem expensive + */ + for (int i = 0; i < num_to_write; i++) + { + unsigned char byteToWrite = 0; + + BufFileWrite(accessor->write_file, &byteToWrite, 1); + } + + if (BufFileSeek(accessor->write_file, 0, 0L, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind hash-join temporary file: %m"))); +} + +size_t +sb_estimate(int participants) +{ + return offsetof(SharedBits, participants) + participants * sizeof(SharedBitsParticipant); +} + + +void +sb_setbit(SharedBitsAccessor *accessor, uint64 bit) +{ + SharedBitsParticipant *const participant = + &accessor->bits->participants[accessor->participant]; + + /* TODO: use an unsigned int instead of a byte */ + unsigned char current_outer_byte; + + Assert(accessor->write_file); + + if (!participant->writing) + { + participant->writing = true; + } + + BufFileSeek(accessor->write_file, 0, bit / 8, SEEK_SET); + BufFileRead(accessor->write_file, ¤t_outer_byte, 1); + + current_outer_byte |= 1U << (bit % 8); + + BufFileSeek(accessor->write_file, 0, -1, SEEK_CUR); + BufFileWrite(accessor->write_file, ¤t_outer_byte, 1); +} + +bool +sb_checkbit(SharedBitsAccessor *accessor, uint32 n) +{ + bool match; + uint32 bytenum = n / 8; + unsigned char bit = n % 8; + unsigned char byte_to_check = 0; + + Assert(accessor->combined); + + /* seek to byte to check */ + if (BufFileSeek(accessor->combined, + 0, + bytenum, + SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg( + "could not rewind shared outer temporary file: %m"))); + /* read byte containing ntuple bit */ + if (BufFileRead(accessor->combined, &byte_to_check, 1) == 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg( + "could not read byte in outer match status bitmap: %m."))); + /* if bit is set */ + match = ((byte_to_check) >> bit) & 1; + + return match; +} + +BufFile * +sb_combine(SharedBitsAccessor *accessor) +{ + /* + * TODO: this tries to close an outer match status file for each + * participant in the tuplestore. technically, only participants in the + * barrier could have outer match status files, however, all but one + * participant continue on and detach from the barrier so we won't have a + * reliable way to close only files for those attached to the barrier + */ + BufFile **statuses; + BufFile *combined_bitmap_file; + int statuses_length; + + int nbparticipants = 0; + + for (int l = 0; l < accessor->bits->nparticipants; l++) + { + SharedBitsParticipant participant = accessor->bits->participants[l]; + + if (participant.present) + { + Assert(!participant.writing); + nbparticipants++; + } + } + statuses = palloc(sizeof(BufFile *) * nbparticipants); + + /* + * Open the bitmap shared BufFile from each participant. TODO: explain why + * file can be NULLs + */ + statuses_length = 0; + + for (int i = 0; i < accessor->bits->nparticipants; i++) + { + char bitmap_filename[MAXPGPATH]; + BufFile *file; + + /* TODO: make a function that will do this */ + snprintf(bitmap_filename, MAXPGPATH, "%s.p%d.bitmap", accessor->bits->name, i); + + if (!accessor->bits->participants[i].present) + continue; + file = BufFileOpenShared(accessor->fileset, bitmap_filename, O_RDWR); + /* TODO: can we be sure that this file is at beginning? 
*/ + Assert(file); + + statuses[statuses_length++] = file; + } + + combined_bitmap_file = BufFileCreateTemp(false); + + for (int64 cur = 0; cur < BufFileSize(statuses[0]); cur++) /* make it while not EOF */ + { + /* + * TODO: make this use an unsigned int instead of a byte so it isn't + * so slow + */ + unsigned char combined_byte = 0; + + for (int i = 0; i < statuses_length; i++) + { + unsigned char read_byte; + + BufFileRead(statuses[i], &read_byte, 1); + combined_byte |= read_byte; + } + + BufFileWrite(combined_bitmap_file, &combined_byte, 1); + } + + if (BufFileSeek(combined_bitmap_file, 0, 0L, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind hash-join temporary file: %m"))); + + for (int i = 0; i < statuses_length; i++) + BufFileClose(statuses[i]); + pfree(statuses); + + accessor->combined = combined_bitmap_file; + return combined_bitmap_file; +} + +void +sb_end_write(SharedBitsAccessor *sba) +{ + SharedBitsParticipant + *const participant = &sba->bits->participants[sba->participant]; + + participant->writing = false; + + /* + * TODO: this should not be needed if flow is correct. need to fix that + * and get rid of this check + */ + if (sba->write_file) + BufFileClose(sba->write_file); + sba->write_file = NULL; +} + +void +sb_end_read(SharedBitsAccessor *accessor) +{ + if (accessor->combined == NULL) + return; + + BufFileClose(accessor->combined); + accessor->combined = NULL; +} diff --git a/src/backend/utils/sort/sharedtuplestore.c b/src/backend/utils/sort/sharedtuplestore.c index b83fb50dac..cb5d950676 100644 --- a/src/backend/utils/sort/sharedtuplestore.c +++ b/src/backend/utils/sort/sharedtuplestore.c @@ -47,19 +47,28 @@ typedef struct SharedTuplestoreChunk char data[FLEXIBLE_ARRAY_MEMBER]; } SharedTuplestoreChunk; +typedef enum SharedTuplestoreMode +{ + WRITING = 0, + READING = 1, + APPENDING = 2 +} SharedTuplestoreMode; + /* Per-participant shared state. */ typedef struct SharedTuplestoreParticipant { LWLock lock; BlockNumber read_page; /* Page number for next read. */ + bool rewound; BlockNumber npages; /* Number of pages written. */ - bool writing; /* Used only for assertions. */ + SharedTuplestoreMode mode; /* Used only for assertions. */ } SharedTuplestoreParticipant; /* The control object that lives in shared memory. */ struct SharedTuplestore { int nparticipants; /* Number of participants that can write. */ + pg_atomic_uint32 ntuples; /* Number of tuples in this tuplestore. */ int flags; /* Flag bits from SHARED_TUPLESTORE_XXX */ size_t meta_data_size; /* Size of per-tuple header. */ char name[NAMEDATALEN]; /* A name for this tuplestore. */ @@ -92,6 +101,8 @@ struct SharedTuplestoreAccessor BlockNumber write_page; /* The next page to write to. */ char *write_pointer; /* Current write pointer within chunk. */ char *write_end; /* One past the end of the current chunk. 
*/ + bool participated; /* Did the worker participate in writing this + * STS at any point */ }; static void sts_filename(char *name, SharedTuplestoreAccessor *accessor, @@ -137,6 +148,7 @@ sts_initialize(SharedTuplestore *sts, int participants, Assert(my_participant_number < participants); sts->nparticipants = participants; + pg_atomic_init_u32(&sts->ntuples, 1); sts->meta_data_size = meta_data_size; sts->flags = flags; @@ -158,7 +170,8 @@ sts_initialize(SharedTuplestore *sts, int participants, LWLockInitialize(&sts->participants[i].lock, LWTRANCHE_SHARED_TUPLESTORE); sts->participants[i].read_page = 0; - sts->participants[i].writing = false; + sts->participants[i].rewound = false; + sts->participants[i].mode = READING; } accessor = palloc0(sizeof(SharedTuplestoreAccessor)); @@ -188,6 +201,7 @@ sts_attach(SharedTuplestore *sts, accessor->sts = sts; accessor->fileset = fileset; accessor->context = CurrentMemoryContext; + accessor->participated = false; return accessor; } @@ -219,7 +233,9 @@ sts_end_write(SharedTuplestoreAccessor *accessor) pfree(accessor->write_chunk); accessor->write_chunk = NULL; accessor->write_file = NULL; - accessor->sts->participants[accessor->participant].writing = false; + accessor->write_pointer = NULL; + accessor->write_end = NULL; + accessor->sts->participants[accessor->participant].mode = READING; } } @@ -263,7 +279,7 @@ sts_begin_parallel_scan(SharedTuplestoreAccessor *accessor) * files have stopped growing. */ for (i = 0; i < accessor->sts->nparticipants; ++i) - Assert(!accessor->sts->participants[i].writing); + Assert((accessor->sts->participants[i].mode == READING) || (accessor->sts->participants[i].mode == APPENDING)); /* * We will start out reading the file that THIS backend wrote. There may @@ -311,10 +327,11 @@ sts_puttuple(SharedTuplestoreAccessor *accessor, void *meta_data, /* Create one. Only this backend will write into it. */ sts_filename(name, accessor, accessor->participant); accessor->write_file = BufFileCreateShared(accessor->fileset, name); + accessor->participated = true; /* Set up the shared state for this backend's file. */ participant = &accessor->sts->participants[accessor->participant]; - participant->writing = true; /* for assertions only */ + participant->mode = WRITING; /* for assertions only */ } /* Do we have space? */ @@ -513,6 +530,17 @@ sts_read_tuple(SharedTuplestoreAccessor *accessor, void *meta_data) return tuple; } +MinimalTuple +sts_parallel_scan_chunk(SharedTuplestoreAccessor *accessor, + void *meta_data, + bool inner) +{ + Assert(accessor->read_file); + if (accessor->read_ntuples < accessor->read_ntuples_available) + return sts_read_tuple(accessor, meta_data); + return NULL; +} + /* * Get the next tuple in the current parallel scan. */ @@ -526,7 +554,13 @@ sts_parallel_scan_next(SharedTuplestoreAccessor *accessor, void *meta_data) for (;;) { /* Can we read more tuples from the current chunk? */ - if (accessor->read_ntuples < accessor->read_ntuples_available) + /* + * Added a check for accessor->read_file being present here, as it + * became relevant for adaptive hashjoin. TODO: Not sure if this has + * other consequences for correctness + */ + + if (accessor->read_ntuples < accessor->read_ntuples_available && accessor->read_file) return sts_read_tuple(accessor, meta_data); /* Find the location of a new chunk to read. 
*/ @@ -618,6 +652,56 @@ sts_parallel_scan_next(SharedTuplestoreAccessor *accessor, void *meta_data) return NULL; } +uint32 +sts_increment_ntuples(SharedTuplestoreAccessor *accessor) +{ + return pg_atomic_fetch_add_u32(&accessor->sts->ntuples, 1); +} + +uint32 +sts_get_tuplenum(SharedTuplestoreAccessor *accessor) +{ + return pg_atomic_read_u32(&accessor->sts->ntuples); +} + +int +sta_get_read_participant(SharedTuplestoreAccessor *accessor) +{ + return accessor->read_participant; +} + +void +sts_spill_leftover_tuples(SharedTuplestoreAccessor *accessor, MinimalTuple tuple, uint32 hashvalue) +{ + tupleMetadata metadata; + SharedTuplestoreParticipant *participant; + char name[MAXPGPATH]; + + metadata.hashvalue = hashvalue; + participant = &accessor->sts->participants[accessor->participant]; + participant->mode = APPENDING; /* for assertions only */ + + sts_filename(name, accessor, accessor->participant); + if (!accessor->participated) + { + accessor->write_file = BufFileCreateShared(accessor->fileset, name); + accessor->participated = true; + } + + else + accessor->write_file = BufFileOpenShared(accessor->fileset, name, O_WRONLY); + + BufFileSeek(accessor->write_file, 0, -1, SEEK_END); + do + { + sts_puttuple(accessor, &metadata, tuple); + } while ((tuple = sts_parallel_scan_chunk(accessor, &metadata, true))); + + accessor->read_ntuples = 0; + accessor->read_ntuples_available = 0; + sts_end_write(accessor); +} + /* * Create the name used for the BufFile that a given participant will write. */ diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index ba661d32a6..0ba9d856c8 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -46,6 +46,7 @@ typedef struct ExplainState bool timing; /* print detailed node timing */ bool summary; /* print total planning and execution timing */ bool settings; /* print modified settings */ + bool usage; /* print memory usage */ ExplainFormat format; /* output format */ /* state for output formatting --- not reset for each new plan tree */ int indent; /* current indentation level */ diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index eb5daba36b..e9354cc6e0 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -19,6 +19,7 @@ #include "storage/barrier.h" #include "storage/buffile.h" #include "storage/lwlock.h" +#include "utils/sharedbits.h" /* ---------------------------------------------------------------- * hash-join hash table structures @@ -142,6 +143,17 @@ typedef struct HashMemoryChunkData *HashMemoryChunk; /* tuples exceeding HASH_CHUNK_THRESHOLD bytes are put in their own chunk */ #define HASH_CHUNK_THRESHOLD (HASH_CHUNK_SIZE / 4) +/* + * HashJoinTableData->curstripe the current stripe number + * The phantom stripe refers to the state of the inner side hashtable (empty) + * during the final scan of the outer batch file for a batch being processed + * using the hashloop fallback algorithm. + * In parallel-aware hash join, curstripe is in a detached state + * when the worker is not attached to the stripe_barrier. + */ +#define PHANTOM_STRIPE -2 +#define STRIPE_DETACHED -1 + /* * For each batch of a Parallel Hash Join, we have a ParallelHashJoinBatch * object in shared memory to coordinate access to it. 
Since they are @@ -152,14 +164,34 @@ typedef struct ParallelHashJoinBatch { dsa_pointer buckets; /* array of hash table buckets */ Barrier batch_barrier; /* synchronization for joining this batch */ + Barrier stripe_barrier; /* synchronization for stripes */ dsa_pointer chunks; /* chunks of tuples loaded */ size_t size; /* size of buckets + chunks in memory */ size_t estimated_size; /* size of buckets + chunks while writing */ - size_t ntuples; /* number of tuples loaded */ + /* total number of tuples loaded into batch (in memory and spill files) */ + size_t ntuples; size_t old_ntuples; /* number of tuples before repartitioning */ bool space_exhausted; + /* Adaptive HashJoin */ + + /* + * after finishing build phase, hashloop_fallback cannot change, and does + * not require a lock to read + */ + pg_atomic_flag overflow_required; + bool hashloop_fallback; + int nstripes; /* the number of stripes in the batch */ + /* number of tuples loaded into the hashtable */ + pg_atomic_uint64 ntuples_in_memory; + + /* + * Note that ntuples will reflect the total number of tuples in the batch + * while ntuples_in_memory will reflect how many tuples are in memory + */ + LWLock lock; + /* * Variable-sized SharedTuplestore objects follow this struct in memory. * See the accessor macros below. @@ -177,10 +209,17 @@ typedef struct ParallelHashJoinBatch ((char *) ParallelHashJoinBatchInner(batch) + \ MAXALIGN(sts_estimate(nparticipants)))) +/* Accessor for sharedbits following a ParallelHashJoinBatch. */ +#define ParallelHashJoinBatchOuterBits(batch, nparticipants) \ + ((SharedBits *) \ + ((char *) ParallelHashJoinBatchOuter(batch, nparticipants) + \ + MAXALIGN(sts_estimate(nparticipants)))) + /* Total size of a ParallelHashJoinBatch and tuplestores. */ #define EstimateParallelHashJoinBatch(hashtable) \ (MAXALIGN(sizeof(ParallelHashJoinBatch)) + \ - MAXALIGN(sts_estimate((hashtable)->parallel_state->nparticipants)) * 2) + MAXALIGN(sts_estimate((hashtable)->parallel_state->nparticipants)) * 2 + \ + MAXALIGN(sb_estimate((hashtable)->parallel_state->nparticipants))) /* Accessor for the nth ParallelHashJoinBatch given the base. */ #define NthParallelHashJoinBatch(base, n) \ @@ -204,9 +243,19 @@ typedef struct ParallelHashJoinBatchAccessor size_t old_ntuples; /* how many tuples before repartitioning? */ bool at_least_one_chunk; /* has this backend allocated a chunk? */ - bool done; /* flag to remember that a batch is done */ + int done; /* flag to remember that a batch is done */ + /* -1 for not done, 0 for tentatively done, 1 for done */ SharedTuplestoreAccessor *inner_tuples; SharedTuplestoreAccessor *outer_tuples; + SharedBitsAccessor *sba; + + /* + * All participants except the last worker working on a batch which has + * fallen back to hashloop processing save the stripe barrier phase and + * detach to avoid the deadlock hazard of waiting on a barrier after + * tuples have been emitted. + */ + int last_participating_stripe_phase; } ParallelHashJoinBatchAccessor; /* @@ -223,10 +272,28 @@ typedef enum ParallelHashGrowth PHJ_GROWTH_NEED_MORE_BUCKETS, /* The memory budget would be exhausted, so we need to repartition. */ PHJ_GROWTH_NEED_MORE_BATCHES, - /* Repartitioning didn't help last time, so don't try to do that again. 
*/ - PHJ_GROWTH_DISABLED + + /* + * While repartitioning or, if nbatches would overflow int, disable growth + * in the number of batches + */ + PHJ_GROWTH_DISABLED, + PHJ_GROWTH_SPILL_BATCH0, + PHJ_GROWTH_LOADING } ParallelHashGrowth; +typedef enum ParallelHashJoinBatchAccessorStatus +{ + /* No more useful work can be done on this batch by this worker */ + PHJ_BATCH_ACCESSOR_DONE, + + /* + * The worker has not yet checked this batch to see if it can do useful + * work + */ + PHJ_BATCH_ACCESSOR_NOT_DONE +} ParallelHashJoinBatchAccessorStatus; + /* * The shared state used to coordinate a Parallel Hash Join. This is stored * in the DSM segment. @@ -246,6 +313,8 @@ typedef struct ParallelHashJoinState LWLock lock; /* lock protecting the above */ Barrier build_barrier; /* synchronization for the build phases */ + Barrier eviction_barrier; + Barrier repartition_barrier; Barrier grow_batches_barrier; Barrier grow_buckets_barrier; pg_atomic_uint32 distributor; /* counter for load balancing */ @@ -263,9 +332,42 @@ typedef struct ParallelHashJoinState /* The phases for probing each batch, used by for batch_barrier. */ #define PHJ_BATCH_ELECTING 0 #define PHJ_BATCH_ALLOCATING 1 -#define PHJ_BATCH_LOADING 2 -#define PHJ_BATCH_PROBING 3 -#define PHJ_BATCH_DONE 4 +#define PHJ_BATCH_STRIPING 2 +#define PHJ_BATCH_DONE 3 + +/* The phases for probing each stripe of each batch used with stripe barriers */ +#define PHJ_STRIPE_INVALID_PHASE -1 +#define PHJ_STRIPE_ELECTING 0 +#define PHJ_STRIPE_RESETTING 1 +#define PHJ_STRIPE_LOADING 2 +#define PHJ_STRIPE_OVERFLOWING 3 +#define PHJ_STRIPE_PROBING 4 +#define PHJ_STRIPE_DONE 5 +#define PHJ_STRIPE_NUMBER(n) ((n) / 6) +#define PHJ_STRIPE_PHASE(n) ((n) % 6) + +#define PHJ_EVICT_ELECTING 0 +#define PHJ_EVICT_RESETTING 1 +#define PHJ_EVICT_SPILLING 2 +#define PHJ_EVICT_FINISHING 3 +#define PHJ_EVICT_DONE 4 +#define PHJ_EVICT_PHASE(n) ((n) % 5) + +/* + * These phases are now required for repartitioning batch 0 since it can + * spill. First all tuples which were resident in the hashtable need to + * be relocated either back to the hashtable or to a spill file, if they + * would relocate to a batch 1+ given the new number of batches. After + * draining the chunk_work_queue, we must drain the batch 0 spill file, + * if it exists. Some tuples may have been relocated from the hashtable + * to other batches, in which case, space may have been freed up which + * the tuples from the batch 0 spill file can occupy. The tuples from the + * batch 0 spill file may go to 1) the hashtable, 2) back to the batch 0 + * spill file in the new generation of batches, 3) to a batch file 1+ + */ +#define PHJ_REPARTITION_BATCH0_DRAIN_QUEUE 0 +#define PHJ_REPARTITION_BATCH0_DRAIN_SPILL_FILE 1 +#define PHJ_REPARTITION_BATCH0_PHASE(n) ((n) % 2) /* The phases of batch growth while hashing, for grow_batches_barrier. 
*/ #define PHJ_GROW_BATCHES_ELECTING 0 @@ -313,8 +415,6 @@ typedef struct HashJoinTableData int nbatch_original; /* nbatch when we started inner scan */ int nbatch_outstart; /* nbatch when we started outer scan */ - bool growEnabled; /* flag to shut off nbatch increases */ - double totalTuples; /* # tuples obtained from inner plan */ double partialTuples; /* # tuples obtained from inner plan by me */ double skewTuples; /* # tuples inserted into skew tuples */ @@ -329,6 +429,18 @@ typedef struct HashJoinTableData BufFile **innerBatchFile; /* buffered virtual temp file per batch */ BufFile **outerBatchFile; /* buffered virtual temp file per batch */ + /* + * Adaptive hashjoin variables + */ + BufFile **hashloopBatchFile; /* outer match status files if fall back */ + List *fallback_batches_stats; /* per hashjoin batch statistics */ + + /* + * current stripe #; 0 during 1st pass, -1 (macro STRIPE_DETACHED) when + * detached, -2 on phantom stripe (macro PHANTOM_STRIPE) + */ + int curstripe; + /* * Info about the datatype-specific hash functions for the datatypes being * hashed. These are arrays of the same length as the number of hash join diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 9dc3ecb07d..839086005c 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -14,6 +14,7 @@ #define INSTRUMENT_H #include "portability/instr_time.h" +#include "nodes/pg_list.h" typedef struct BufferUsage @@ -39,6 +40,12 @@ typedef struct WalUsage uint64 wal_bytes; /* size of WAL records produced */ } WalUsage; +typedef struct FallbackBatchStats +{ + int batchno; + int numstripes; +} FallbackBatchStats; + /* Flag bits included in InstrAlloc's instrument_options bitmask */ typedef enum InstrumentOption { diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h index 2db4e2f672..6d094e1a43 100644 --- a/src/include/executor/nodeHash.h +++ b/src/include/executor/nodeHash.h @@ -31,6 +31,7 @@ extern void ExecParallelHashTableAlloc(HashJoinTable hashtable, extern void ExecHashTableDestroy(HashJoinTable hashtable); extern void ExecHashTableDetach(HashJoinTable hashtable); extern void ExecHashTableDetachBatch(HashJoinTable hashtable); +extern bool ExecHashTableDetachStripe(HashJoinTable hashtable); extern void ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable, int batchno); @@ -40,9 +41,11 @@ extern void ExecHashTableInsert(HashJoinTable hashtable, extern void ExecParallelHashTableInsert(HashJoinTable hashtable, TupleTableSlot *slot, uint32 hashvalue); -extern void ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable, +extern MinimalTuple + ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable, TupleTableSlot *slot, - uint32 hashvalue); + uint32 hashvalue, + int read_participant); extern bool ExecHashGetHashValue(HashJoinTable hashtable, ExprContext *econtext, List *hashkeys, @@ -59,6 +62,8 @@ extern void ExecPrepHashTableForUnmatched(HashJoinState *hjstate); extern bool ExecScanHashTableForUnmatched(HashJoinState *hjstate, ExprContext *econtext); extern void ExecHashTableReset(HashJoinTable hashtable); +extern void + ExecParallelHashTableRecycle(HashJoinTable hashtable); extern void ExecHashTableResetMatchFlags(HashJoinTable hashtable); extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, bool try_combined_hash_mem, diff --git a/src/include/executor/tuptable.h b/src/include/executor/tuptable.h index f7df70b5ab..0c0d87d1d3 100644 --- a/src/include/executor/tuptable.h 
+++ b/src/include/executor/tuptable.h @@ -129,6 +129,7 @@ typedef struct TupleTableSlot MemoryContext tts_mcxt; /* slot itself is in this context */ ItemPointerData tts_tid; /* stored tuple's tid */ Oid tts_tableOid; /* table oid of tuple */ + uint32 tts_tuplenum; /* a tuple id for use when ctid cannot be used */ } TupleTableSlot; /* routines for a TupleTableSlot implementation */ @@ -425,6 +426,7 @@ static inline TupleTableSlot * ExecClearTuple(TupleTableSlot *slot) { slot->tts_ops->clear(slot); + slot->tts_tuplenum = 0; /* TODO: should this be done elsewhere? */ return slot; } diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 0b42dd6f94..cb30e3bea1 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1959,6 +1959,10 @@ typedef struct HashJoinState int hj_JoinState; bool hj_MatchedOuter; bool hj_OuterNotEmpty; + /* Adaptive Hashjoin variables */ + int hj_CurNumOuterTuples; /* number of outer tuples in a batch */ + unsigned int hj_CurOuterMatchStatus; + int hj_EmitOuterTupleId; } HashJoinState; @@ -2387,6 +2391,7 @@ typedef struct HashInstrumentation int nbatch; /* number of batches at end of execution */ int nbatch_original; /* planned number of batches */ Size space_peak; /* peak memory usage in bytes */ + List *fallback_batches_stats; /* per hashjoin batch stats */ } HashInstrumentation; /* ---------------- diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 807a9c1edf..399c442171 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -855,11 +855,20 @@ typedef enum WAIT_EVENT_EXECUTE_GATHER, WAIT_EVENT_HASH_BATCH_ALLOCATE, WAIT_EVENT_HASH_BATCH_ELECT, - WAIT_EVENT_HASH_BATCH_LOAD, + WAIT_EVENT_HASH_STRIPE_ELECT, + WAIT_EVENT_HASH_STRIPE_RESET, + WAIT_EVENT_HASH_STRIPE_LOAD, + WAIT_EVENT_HASH_STRIPE_OVERFLOW, + WAIT_EVENT_HASH_STRIPE_PROBE, WAIT_EVENT_HASH_BUILD_ALLOCATE, WAIT_EVENT_HASH_BUILD_ELECT, WAIT_EVENT_HASH_BUILD_HASH_INNER, WAIT_EVENT_HASH_BUILD_HASH_OUTER, + WAIT_EVENT_HASH_EVICT_ELECT, + WAIT_EVENT_HASH_EVICT_RESET, + WAIT_EVENT_HASH_EVICT_SPILL, + WAIT_EVENT_HASH_EVICT_FINISH, + WAIT_EVENT_HASH_REPARTITION_BATCH0_DRAIN_QUEUE, WAIT_EVENT_HASH_GROW_BATCHES_ALLOCATE, WAIT_EVENT_HASH_GROW_BATCHES_DECIDE, WAIT_EVENT_HASH_GROW_BATCHES_ELECT, diff --git a/src/include/utils/sharedbits.h b/src/include/utils/sharedbits.h new file mode 100644 index 0000000000..de43279de8 --- /dev/null +++ b/src/include/utils/sharedbits.h @@ -0,0 +1,39 @@ +/*------------------------------------------------------------------------- + * + * sharedbits.h + * Simple mechanism for sharing bits between backends. 
+ * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/sharedbits.h + * + *------------------------------------------------------------------------- + */ +#ifndef SHAREDBITS_H +#define SHAREDBITS_H + +#include "storage/sharedfileset.h" + +struct SharedBits; +typedef struct SharedBits SharedBits; + +struct SharedBitsParticipant; +typedef struct SharedBitsParticipant SharedBitsParticipant; + +struct SharedBitsAccessor; +typedef struct SharedBitsAccessor SharedBitsAccessor; + +extern SharedBitsAccessor *sb_attach(SharedBits *sbits, int my_participant_number, SharedFileSet *fileset); +extern SharedBitsAccessor *sb_initialize(SharedBits *sbits, int participants, int my_participant_number, SharedFileSet *fileset, char *name); +extern void sb_initialize_accessor(SharedBitsAccessor *accessor, uint32 nbits); +extern size_t sb_estimate(int participants); + +extern void sb_setbit(SharedBitsAccessor *accessor, uint64 bit); +extern bool sb_checkbit(SharedBitsAccessor *accessor, uint32 n); +extern BufFile *sb_combine(SharedBitsAccessor *accessor); + +extern void sb_end_write(SharedBitsAccessor *sba); +extern void sb_end_read(SharedBitsAccessor *accessor); + +#endif /* SHAREDBITS_H */ diff --git a/src/include/utils/sharedtuplestore.h b/src/include/utils/sharedtuplestore.h index 9754504cc5..5f8d95cb1a 100644 --- a/src/include/utils/sharedtuplestore.h +++ b/src/include/utils/sharedtuplestore.h @@ -22,6 +22,17 @@ typedef struct SharedTuplestore SharedTuplestore; struct SharedTuplestoreAccessor; typedef struct SharedTuplestoreAccessor SharedTuplestoreAccessor; +struct tupleMetadata; +typedef struct tupleMetadata tupleMetadata; +struct tupleMetadata +{ + uint32 hashvalue; + union + { + uint32 tupleid; /* tuple number or id on the outer side */ + int stripe; /* stripe number for inner side */ + }; +}; /* * A flag indicating that the tuplestore will only be scanned once, so backing @@ -58,4 +69,14 @@ extern void sts_puttuple(SharedTuplestoreAccessor *accessor, extern MinimalTuple sts_parallel_scan_next(SharedTuplestoreAccessor *accessor, void *meta_data); +extern uint32 sts_increment_ntuples(SharedTuplestoreAccessor *accessor); +extern uint32 sts_get_tuplenum(SharedTuplestoreAccessor *accessor); +extern int sta_get_read_participant(SharedTuplestoreAccessor *accessor); +extern void sts_spill_leftover_tuples(SharedTuplestoreAccessor *accessor, MinimalTuple tuple, uint32 hashvalue); + +extern MinimalTuple sts_parallel_scan_chunk(SharedTuplestoreAccessor *accessor, + void *meta_data, + bool inner); + + #endif /* SHAREDTUPLESTORE_H */ diff --git a/src/test/regress/expected/join_hash.out b/src/test/regress/expected/join_hash.out index 3a91c144a2..aa7477a299 100644 --- a/src/test/regress/expected/join_hash.out +++ b/src/test/regress/expected/join_hash.out @@ -839,45 +839,26 @@ rollback to settings; -- the hash table) -- parallel with parallel-aware hash join (hits ExecParallelHashLoadTuple and -- sts_puttuple oversized tuple cases because it's multi-batch) -savepoint settings; -set max_parallel_workers_per_gather = 2; -set enable_parallel_hash = on; -set work_mem = '128kB'; -explain (costs off) - select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); - QUERY PLAN ----------------------------------------------------------------- - Finalize Aggregate - -> Gather - Workers Planned: 2 - -> Partial Aggregate - -> Parallel Hash Left Join - Hash Cond: 
(wide.id = wide_1.id) - -> Parallel Seq Scan on wide - -> Parallel Hash - -> Parallel Seq Scan on wide wide_1 -(9 rows) - -select length(max(s.t)) -from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); - length --------- - 320000 -(1 row) - -select final > 1 as multibatch - from hash_join_batches( -$$ - select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); -$$); - multibatch ------------- - t -(1 row) - -rollback to settings; +-- savepoint settings; +-- set max_parallel_workers_per_gather = 2; +-- set enable_parallel_hash = on; +-- TODO: throw an error when this happens: cannot set work_mem lower than the side of a single tuple +-- TODO: ensure that oversize tuple code is still exercised (should be with some of the stub stuff below) +-- TODO: commented this out since it would crash otherwise +-- this test is no longer multi-batch, so, perhaps, it should be removed +-- set work_mem = '128kB'; +-- explain (costs off) +-- select length(max(s.t)) +-- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +-- select length(max(s.t)) +-- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +-- select final > 1 as multibatch +-- from hash_join_batches( +-- $$ +-- select length(max(s.t)) +-- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +-- $$); +-- rollback to settings; rollback; -- Verify that hash key expressions reference the correct -- nodes. Hashjoin's hashkeys need to reference its outer plan, Hash's @@ -1013,3 +994,1968 @@ WHERE (1 row) ROLLBACK; +-- Serial Adaptive Hash Join +BEGIN; +CREATE TYPE stub AS (hash INTEGER, value CHAR(8090)); +CREATE FUNCTION stub_hash(item stub) +RETURNS INTEGER AS $$ +DECLARE + batch_size INTEGER; +BEGIN + batch_size := 4; + RETURN item.hash << (batch_size - 1); +END; $$ LANGUAGE plpgsql IMMUTABLE LEAKPROOF STRICT PARALLEL SAFE; +CREATE FUNCTION stub_eq(item1 stub, item2 stub) +RETURNS BOOLEAN AS $$ +BEGIN + RETURN item1.hash = item2.hash AND item1.value = item2.value; +END; $$ LANGUAGE plpgsql IMMUTABLE LEAKPROOF STRICT PARALLEL SAFE; +CREATE OPERATOR = ( + FUNCTION = stub_eq, + LEFTARG = stub, + RIGHTARG = stub, + COMMUTATOR = =, + HASHES, MERGES +); +CREATE OPERATOR CLASS stub_hash_ops +DEFAULT FOR TYPE stub USING hash AS + OPERATOR 1 =(stub, stub), + FUNCTION 1 stub_hash(stub); +CREATE TABLE probeside(a stub); +ALTER TABLE probeside ALTER COLUMN a SET STORAGE PLAIN; +-- non-fallback batch with unmatched outer tuple +INSERT INTO probeside SELECT '(2, "")' FROM generate_series(1, 1); +-- fallback batch unmatched outer tuple (in first stripe maybe) +INSERT INTO probeside SELECT '(1, "unmatched outer tuple")' FROM generate_series(1, 1); +-- fallback batch matched outer tuple +INSERT INTO probeside SELECT '(1, "")' FROM generate_series(1, 5); +-- fallback batch unmatched outer tuple (in last stripe maybe) +-- When numbatches=4, hash 5 maps to batch 1, but after numbatches doubles to +-- 8 batches hash 5 maps to batch 5. 
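+-- As an illustration (assuming batch assignment follows ExecHashGetBucketAndBatch,
+-- i.e. batchno = (hashvalue >> log2_nbuckets) & (nbatch - 1)): stub_hash returns
+-- item.hash << 3, so with 8 buckets batchno reduces to hash & (nbatch - 1),
+-- giving 5 & 3 = 1 with 4 batches but 5 & 7 = 5 with 8 batches.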
+INSERT INTO probeside SELECT '(5, "")' FROM generate_series(1, 1); +-- non-fallback batch matched outer tuple +INSERT INTO probeside SELECT '(3, "")' FROM generate_series(1, 1); +-- batch with 3 stripes where non-first/non-last stripe contains unmatched outer tuple +INSERT INTO probeside SELECT '(6, "")' FROM generate_series(1, 5); +INSERT INTO probeside SELECT '(6, "unmatched outer tuple")' FROM generate_series(1, 1); +INSERT INTO probeside SELECT '(6, "")' FROM generate_series(1, 1); +CREATE TABLE hashside_wide(a stub, id int); +ALTER TABLE hashside_wide ALTER COLUMN a SET STORAGE PLAIN; +-- falls back with an unmatched inner tuple that is in fist, middle, and last +-- stripe +INSERT INTO hashside_wide SELECT '(1, "unmatched inner tuple in first stripe")', 1 FROM generate_series(1, 1); +INSERT INTO hashside_wide SELECT '(1, "")', 1 FROM generate_series(1, 9); +INSERT INTO hashside_wide SELECT '(1, "unmatched inner tuple in middle stripe")', 1 FROM generate_series(1, 1); +INSERT INTO hashside_wide SELECT '(1, "")', 1 FROM generate_series(1, 9); +INSERT INTO hashside_wide SELECT '(1, "unmatched inner tuple in last stripe")', 1 FROM generate_series(1, 1); +-- doesn't fall back -- matched tuple +INSERT INTO hashside_wide SELECT '(3, "")', 3 FROM generate_series(1, 1); +INSERT INTO hashside_wide SELECT '(6, "")', 6 FROM generate_series(1, 20); +ANALYZE probeside, hashside_wide; +SET enable_nestloop TO off; +SET enable_mergejoin TO off; +SET work_mem = 64; +SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value) +FROM probeside +LEFT OUTER JOIN hashside_wide USING (a) +ORDER BY 1, 2, 3, 4, 5; + hash | btrim | id | hash | btrim +------+-----------------------+----+------+------- + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | unmatched outer tuple | | | + 2 | | | | + 3 | | 3 | 3 | + 5 | | | | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 
| 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | unmatched outer tuple | | | +(215 rows) + +EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside +LEFT OUTER JOIN hashside_wide USING (a); + QUERY PLAN +---------------------------------------------------------------- + Hash Left Join (actual rows=215 loops=1) + Hash Cond: (probeside.a = hashside_wide.a) + -> Seq Scan on probeside (actual rows=16 loops=1) + -> Hash (actual rows=42 loops=1) + Buckets: 8 (originally 8) Batches: 32 (originally 8) + Batch: 1 Stripes: 3 + Batch: 6 Stripes: 3 + -> Seq Scan on hashside_wide (actual rows=42 loops=1) +(8 rows) + +SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value) +FROM probeside +RIGHT OUTER JOIN hashside_wide USING (a) +ORDER BY 1, 2, 3, 4, 5; + hash | btrim | id | hash | btrim +------+-------+----+------+---------------------------------------- + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 
| 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 3 | | 3 | 3 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + | | 1 | 1 | unmatched inner tuple in first stripe + | | 1 | 1 | unmatched inner tuple in last stripe + | | 1 | 1 | unmatched inner tuple in middle stripe +(214 rows) + +EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside +RIGHT OUTER JOIN hashside_wide USING (a); + QUERY PLAN +---------------------------------------------------------------- + Hash Right Join (actual rows=214 loops=1) + Hash Cond: (probeside.a = hashside_wide.a) + -> Seq Scan on probeside (actual rows=16 loops=1) + -> Hash (actual rows=42 loops=1) + Buckets: 8 (originally 8) Batches: 32 (originally 8) + Batch: 1 Stripes: 3 + Batch: 6 Stripes: 3 + -> Seq Scan on hashside_wide (actual rows=42 loops=1) +(8 rows) + +SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value) +FROM probeside +FULL OUTER JOIN hashside_wide USING (a) +ORDER BY 1, 2, 3, 4, 5; + hash | btrim | id | hash | btrim +------+-----------------------+----+------+---------------------------------------- + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 
| + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | unmatched outer tuple | | | + 2 | | | | + 3 | | 3 | 3 | + 5 | | | | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | unmatched outer tuple | | | + | | 1 | 1 | unmatched inner tuple in first stripe + | | 1 | 1 | unmatched inner tuple in last stripe + | | 1 | 1 | unmatched inner tuple in middle stripe +(218 rows) + +EXPLAIN (ANALYZE, summary 
off, timing off, costs off, usage off) SELECT * FROM probeside +FULL OUTER JOIN hashside_wide USING (a); + QUERY PLAN +---------------------------------------------------------------- + Hash Full Join (actual rows=218 loops=1) + Hash Cond: (probeside.a = hashside_wide.a) + -> Seq Scan on probeside (actual rows=16 loops=1) + -> Hash (actual rows=42 loops=1) + Buckets: 8 (originally 8) Batches: 32 (originally 8) + Batch: 1 Stripes: 3 + Batch: 6 Stripes: 3 + -> Seq Scan on hashside_wide (actual rows=42 loops=1) +(8 rows) + +-- semi-join testcase +EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) +SELECT probeside.* FROM probeside WHERE EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a); + QUERY PLAN +---------------------------------------------------------------- + Hash Semi Join (actual rows=12 loops=1) + Hash Cond: (probeside.a = hashside_wide.a) + -> Seq Scan on probeside (actual rows=16 loops=1) + -> Hash (actual rows=42 loops=1) + Buckets: 8 (originally 8) Batches: 32 (originally 8) + Batch: 1 Stripes: 3 + Batch: 6 Stripes: 3 + -> Seq Scan on hashside_wide (actual rows=42 loops=1) +(8 rows) + +SELECT (probeside.a).hash, TRIM((probeside.a).value) +FROM probeside WHERE EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a) ORDER BY 1, 2; + hash | btrim +------+------- + 1 | + 1 | + 1 | + 1 | + 1 | + 3 | + 6 | + 6 | + 6 | + 6 | + 6 | + 6 | +(12 rows) + +-- anti-join testcase +EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) +SELECT probeside.* FROM probeside WHERE NOT EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a); + QUERY PLAN +---------------------------------------------------------------- + Hash Anti Join (actual rows=4 loops=1) + Hash Cond: (probeside.a = hashside_wide.a) + -> Seq Scan on probeside (actual rows=16 loops=1) + -> Hash (actual rows=42 loops=1) + Buckets: 8 (originally 8) Batches: 32 (originally 8) + Batch: 1 Stripes: 3 + Batch: 6 Stripes: 3 + -> Seq Scan on hashside_wide (actual rows=42 loops=1) +(8 rows) + +SELECT (probeside.a).hash, TRIM((probeside.a).value) +FROM probeside WHERE NOT EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a) ORDER BY 1, 2; + hash | btrim +------+----------------------- + 1 | unmatched outer tuple + 2 | + 5 | + 6 | unmatched outer tuple +(4 rows) + +-- parallel LOJ test case with two batches falling back +savepoint settings; +set local max_parallel_workers_per_gather = 1; +set local min_parallel_table_scan_size = 0; +set local parallel_setup_cost = 0; +set local enable_parallel_hash = on; +EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside +LEFT OUTER JOIN hashside_wide USING (a); + QUERY PLAN +------------------------------------------------------------------------------- + Gather (actual rows=215 loops=1) + Workers Planned: 1 + Workers Launched: 1 + -> Parallel Hash Left Join (actual rows=108 loops=2) + Hash Cond: (probeside.a = hashside_wide.a) + -> Parallel Seq Scan on probeside (actual rows=16 loops=1) + -> Parallel Hash (actual rows=21 loops=2) + Buckets: 8 (originally 8) Batches: 128 (originally 8) + Batch: 1 Stripes: 3 + Batch: 6 Stripes: 3 + -> Parallel Seq Scan on hashside_wide (actual rows=42 loops=1) +(11 rows) + +SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value) +FROM probeside +LEFT OUTER JOIN hashside_wide USING (a) +ORDER BY 1, 2, 3, 4, 5; + hash | btrim | id | hash | btrim +------+-----------------------+----+------+------- + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | 
| 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | | 1 | 1 | + 1 | unmatched outer tuple | | | + 2 | | | | + 3 | | 3 | 3 | + 5 | | | | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | | 6 | 6 | + 6 | unmatched outer tuple | | | +(215 rows) + +rollback to settings; +-- Test spill of batch 0 gives correct results. 
+CREATE TABLE probeside_batch0(id int generated always as identity, a stub); +ALTER TABLE probeside_batch0 ALTER COLUMN a SET STORAGE PLAIN; +INSERT INTO probeside_batch0(a) SELECT '(0, "")' FROM generate_series(1, 13); +INSERT INTO probeside_batch0(a) SELECT '(0, "unmatched outer")' FROM generate_series(1, 1); +CREATE TABLE hashside_wide_batch0(id int generated always as identity, a stub); +ALTER TABLE hashside_wide_batch0 ALTER COLUMN a SET STORAGE PLAIN; +INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9); +INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9); +INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9); +ANALYZE probeside_batch0, hashside_wide_batch0; +SELECT + hashside_wide_batch0.id as hashside_id, + (hashside_wide_batch0.a).hash as hashside_hash, + probeside_batch0.id as probeside_id, + (probeside_batch0.a).hash as probeside_hash, + TRIM((probeside_batch0.a).value) as probeside_trimmed_value, + TRIM((hashside_wide_batch0.a).value) as hashside_trimmed_value +FROM probeside_batch0 +LEFT OUTER JOIN hashside_wide_batch0 USING (a) +ORDER BY 1, 2, 3, 4, 5, 6; + hashside_id | hashside_hash | probeside_id | probeside_hash | probeside_trimmed_value | hashside_trimmed_value +-------------+---------------+--------------+----------------+-------------------------+------------------------ + 1 | 0 | 1 | 0 | | + 1 | 0 | 2 | 0 | | + 1 | 0 | 3 | 0 | | + 1 | 0 | 4 | 0 | | + 1 | 0 | 5 | 0 | | + 1 | 0 | 6 | 0 | | + 1 | 0 | 7 | 0 | | + 1 | 0 | 8 | 0 | | + 1 | 0 | 9 | 0 | | + 1 | 0 | 10 | 0 | | + 1 | 0 | 11 | 0 | | + 1 | 0 | 12 | 0 | | + 1 | 0 | 13 | 0 | | + 2 | 0 | 1 | 0 | | + 2 | 0 | 2 | 0 | | + 2 | 0 | 3 | 0 | | + 2 | 0 | 4 | 0 | | + 2 | 0 | 5 | 0 | | + 2 | 0 | 6 | 0 | | + 2 | 0 | 7 | 0 | | + 2 | 0 | 8 | 0 | | + 2 | 0 | 9 | 0 | | + 2 | 0 | 10 | 0 | | + 2 | 0 | 11 | 0 | | + 2 | 0 | 12 | 0 | | + 2 | 0 | 13 | 0 | | + 3 | 0 | 1 | 0 | | + 3 | 0 | 2 | 0 | | + 3 | 0 | 3 | 0 | | + 3 | 0 | 4 | 0 | | + 3 | 0 | 5 | 0 | | + 3 | 0 | 6 | 0 | | + 3 | 0 | 7 | 0 | | + 3 | 0 | 8 | 0 | | + 3 | 0 | 9 | 0 | | + 3 | 0 | 10 | 0 | | + 3 | 0 | 11 | 0 | | + 3 | 0 | 12 | 0 | | + 3 | 0 | 13 | 0 | | + 4 | 0 | 1 | 0 | | + 4 | 0 | 2 | 0 | | + 4 | 0 | 3 | 0 | | + 4 | 0 | 4 | 0 | | + 4 | 0 | 5 | 0 | | + 4 | 0 | 6 | 0 | | + 4 | 0 | 7 | 0 | | + 4 | 0 | 8 | 0 | | + 4 | 0 | 9 | 0 | | + 4 | 0 | 10 | 0 | | + 4 | 0 | 11 | 0 | | + 4 | 0 | 12 | 0 | | + 4 | 0 | 13 | 0 | | + 5 | 0 | 1 | 0 | | + 5 | 0 | 2 | 0 | | + 5 | 0 | 3 | 0 | | + 5 | 0 | 4 | 0 | | + 5 | 0 | 5 | 0 | | + 5 | 0 | 6 | 0 | | + 5 | 0 | 7 | 0 | | + 5 | 0 | 8 | 0 | | + 5 | 0 | 9 | 0 | | + 5 | 0 | 10 | 0 | | + 5 | 0 | 11 | 0 | | + 5 | 0 | 12 | 0 | | + 5 | 0 | 13 | 0 | | + 6 | 0 | 1 | 0 | | + 6 | 0 | 2 | 0 | | + 6 | 0 | 3 | 0 | | + 6 | 0 | 4 | 0 | | + 6 | 0 | 5 | 0 | | + 6 | 0 | 6 | 0 | | + 6 | 0 | 7 | 0 | | + 6 | 0 | 8 | 0 | | + 6 | 0 | 9 | 0 | | + 6 | 0 | 10 | 0 | | + 6 | 0 | 11 | 0 | | + 6 | 0 | 12 | 0 | | + 6 | 0 | 13 | 0 | | + 7 | 0 | 1 | 0 | | + 7 | 0 | 2 | 0 | | + 7 | 0 | 3 | 0 | | + 7 | 0 | 4 | 0 | | + 7 | 0 | 5 | 0 | | + 7 | 0 | 6 | 0 | | + 7 | 0 | 7 | 0 | | + 7 | 0 | 8 | 0 | | + 7 | 0 | 9 | 0 | | + 7 | 0 | 10 | 0 | | + 7 | 0 | 11 | 0 | | + 7 | 0 | 12 | 0 | | + 7 | 0 | 13 | 0 | | + 8 | 0 | 1 | 0 | | + 8 | 0 | 2 | 0 | | + 8 | 0 | 3 | 0 | | + 8 | 0 | 4 | 0 | | + 8 | 0 | 5 | 0 | | + 8 | 0 | 6 | 0 | | + 8 | 0 | 7 | 0 | | + 8 | 0 | 8 | 0 | | + 8 | 0 | 9 | 0 | | + 8 | 0 | 10 | 0 | | + 8 | 0 | 11 | 0 | | + 8 | 0 | 12 | 0 | | + 8 | 0 | 13 | 0 | | + 9 | 0 | 1 | 0 | | + 9 | 0 | 2 | 0 | | + 9 | 
0 | 3 | 0 | | + 9 | 0 | 4 | 0 | | + 9 | 0 | 5 | 0 | | + 9 | 0 | 6 | 0 | | + 9 | 0 | 7 | 0 | | + 9 | 0 | 8 | 0 | | + 9 | 0 | 9 | 0 | | + 9 | 0 | 10 | 0 | | + 9 | 0 | 11 | 0 | | + 9 | 0 | 12 | 0 | | + 9 | 0 | 13 | 0 | | + 10 | 0 | 1 | 0 | | + 10 | 0 | 2 | 0 | | + 10 | 0 | 3 | 0 | | + 10 | 0 | 4 | 0 | | + 10 | 0 | 5 | 0 | | + 10 | 0 | 6 | 0 | | + 10 | 0 | 7 | 0 | | + 10 | 0 | 8 | 0 | | + 10 | 0 | 9 | 0 | | + 10 | 0 | 10 | 0 | | + 10 | 0 | 11 | 0 | | + 10 | 0 | 12 | 0 | | + 10 | 0 | 13 | 0 | | + 11 | 0 | 1 | 0 | | + 11 | 0 | 2 | 0 | | + 11 | 0 | 3 | 0 | | + 11 | 0 | 4 | 0 | | + 11 | 0 | 5 | 0 | | + 11 | 0 | 6 | 0 | | + 11 | 0 | 7 | 0 | | + 11 | 0 | 8 | 0 | | + 11 | 0 | 9 | 0 | | + 11 | 0 | 10 | 0 | | + 11 | 0 | 11 | 0 | | + 11 | 0 | 12 | 0 | | + 11 | 0 | 13 | 0 | | + 12 | 0 | 1 | 0 | | + 12 | 0 | 2 | 0 | | + 12 | 0 | 3 | 0 | | + 12 | 0 | 4 | 0 | | + 12 | 0 | 5 | 0 | | + 12 | 0 | 6 | 0 | | + 12 | 0 | 7 | 0 | | + 12 | 0 | 8 | 0 | | + 12 | 0 | 9 | 0 | | + 12 | 0 | 10 | 0 | | + 12 | 0 | 11 | 0 | | + 12 | 0 | 12 | 0 | | + 12 | 0 | 13 | 0 | | + 13 | 0 | 1 | 0 | | + 13 | 0 | 2 | 0 | | + 13 | 0 | 3 | 0 | | + 13 | 0 | 4 | 0 | | + 13 | 0 | 5 | 0 | | + 13 | 0 | 6 | 0 | | + 13 | 0 | 7 | 0 | | + 13 | 0 | 8 | 0 | | + 13 | 0 | 9 | 0 | | + 13 | 0 | 10 | 0 | | + 13 | 0 | 11 | 0 | | + 13 | 0 | 12 | 0 | | + 13 | 0 | 13 | 0 | | + 14 | 0 | 1 | 0 | | + 14 | 0 | 2 | 0 | | + 14 | 0 | 3 | 0 | | + 14 | 0 | 4 | 0 | | + 14 | 0 | 5 | 0 | | + 14 | 0 | 6 | 0 | | + 14 | 0 | 7 | 0 | | + 14 | 0 | 8 | 0 | | + 14 | 0 | 9 | 0 | | + 14 | 0 | 10 | 0 | | + 14 | 0 | 11 | 0 | | + 14 | 0 | 12 | 0 | | + 14 | 0 | 13 | 0 | | + 15 | 0 | 1 | 0 | | + 15 | 0 | 2 | 0 | | + 15 | 0 | 3 | 0 | | + 15 | 0 | 4 | 0 | | + 15 | 0 | 5 | 0 | | + 15 | 0 | 6 | 0 | | + 15 | 0 | 7 | 0 | | + 15 | 0 | 8 | 0 | | + 15 | 0 | 9 | 0 | | + 15 | 0 | 10 | 0 | | + 15 | 0 | 11 | 0 | | + 15 | 0 | 12 | 0 | | + 15 | 0 | 13 | 0 | | + 16 | 0 | 1 | 0 | | + 16 | 0 | 2 | 0 | | + 16 | 0 | 3 | 0 | | + 16 | 0 | 4 | 0 | | + 16 | 0 | 5 | 0 | | + 16 | 0 | 6 | 0 | | + 16 | 0 | 7 | 0 | | + 16 | 0 | 8 | 0 | | + 16 | 0 | 9 | 0 | | + 16 | 0 | 10 | 0 | | + 16 | 0 | 11 | 0 | | + 16 | 0 | 12 | 0 | | + 16 | 0 | 13 | 0 | | + 17 | 0 | 1 | 0 | | + 17 | 0 | 2 | 0 | | + 17 | 0 | 3 | 0 | | + 17 | 0 | 4 | 0 | | + 17 | 0 | 5 | 0 | | + 17 | 0 | 6 | 0 | | + 17 | 0 | 7 | 0 | | + 17 | 0 | 8 | 0 | | + 17 | 0 | 9 | 0 | | + 17 | 0 | 10 | 0 | | + 17 | 0 | 11 | 0 | | + 17 | 0 | 12 | 0 | | + 17 | 0 | 13 | 0 | | + 18 | 0 | 1 | 0 | | + 18 | 0 | 2 | 0 | | + 18 | 0 | 3 | 0 | | + 18 | 0 | 4 | 0 | | + 18 | 0 | 5 | 0 | | + 18 | 0 | 6 | 0 | | + 18 | 0 | 7 | 0 | | + 18 | 0 | 8 | 0 | | + 18 | 0 | 9 | 0 | | + 18 | 0 | 10 | 0 | | + 18 | 0 | 11 | 0 | | + 18 | 0 | 12 | 0 | | + 18 | 0 | 13 | 0 | | + 19 | 0 | 1 | 0 | | + 19 | 0 | 2 | 0 | | + 19 | 0 | 3 | 0 | | + 19 | 0 | 4 | 0 | | + 19 | 0 | 5 | 0 | | + 19 | 0 | 6 | 0 | | + 19 | 0 | 7 | 0 | | + 19 | 0 | 8 | 0 | | + 19 | 0 | 9 | 0 | | + 19 | 0 | 10 | 0 | | + 19 | 0 | 11 | 0 | | + 19 | 0 | 12 | 0 | | + 19 | 0 | 13 | 0 | | + 20 | 0 | 1 | 0 | | + 20 | 0 | 2 | 0 | | + 20 | 0 | 3 | 0 | | + 20 | 0 | 4 | 0 | | + 20 | 0 | 5 | 0 | | + 20 | 0 | 6 | 0 | | + 20 | 0 | 7 | 0 | | + 20 | 0 | 8 | 0 | | + 20 | 0 | 9 | 0 | | + 20 | 0 | 10 | 0 | | + 20 | 0 | 11 | 0 | | + 20 | 0 | 12 | 0 | | + 20 | 0 | 13 | 0 | | + 21 | 0 | 1 | 0 | | + 21 | 0 | 2 | 0 | | + 21 | 0 | 3 | 0 | | + 21 | 0 | 4 | 0 | | + 21 | 0 | 5 | 0 | | + 21 | 0 | 6 | 0 | | + 21 | 0 | 7 | 0 | | + 21 | 0 | 8 | 0 | | + 21 | 0 | 9 | 0 | | + 21 | 0 | 10 | 0 | | + 21 | 0 | 11 | 0 | | + 21 | 0 | 12 | 0 | | + 21 | 0 | 13 | 0 | | + 22 | 0 | 1 
| 0 | | + 22 | 0 | 2 | 0 | | + 22 | 0 | 3 | 0 | | + 22 | 0 | 4 | 0 | | + 22 | 0 | 5 | 0 | | + 22 | 0 | 6 | 0 | | + 22 | 0 | 7 | 0 | | + 22 | 0 | 8 | 0 | | + 22 | 0 | 9 | 0 | | + 22 | 0 | 10 | 0 | | + 22 | 0 | 11 | 0 | | + 22 | 0 | 12 | 0 | | + 22 | 0 | 13 | 0 | | + 23 | 0 | 1 | 0 | | + 23 | 0 | 2 | 0 | | + 23 | 0 | 3 | 0 | | + 23 | 0 | 4 | 0 | | + 23 | 0 | 5 | 0 | | + 23 | 0 | 6 | 0 | | + 23 | 0 | 7 | 0 | | + 23 | 0 | 8 | 0 | | + 23 | 0 | 9 | 0 | | + 23 | 0 | 10 | 0 | | + 23 | 0 | 11 | 0 | | + 23 | 0 | 12 | 0 | | + 23 | 0 | 13 | 0 | | + 24 | 0 | 1 | 0 | | + 24 | 0 | 2 | 0 | | + 24 | 0 | 3 | 0 | | + 24 | 0 | 4 | 0 | | + 24 | 0 | 5 | 0 | | + 24 | 0 | 6 | 0 | | + 24 | 0 | 7 | 0 | | + 24 | 0 | 8 | 0 | | + 24 | 0 | 9 | 0 | | + 24 | 0 | 10 | 0 | | + 24 | 0 | 11 | 0 | | + 24 | 0 | 12 | 0 | | + 24 | 0 | 13 | 0 | | + 25 | 0 | 1 | 0 | | + 25 | 0 | 2 | 0 | | + 25 | 0 | 3 | 0 | | + 25 | 0 | 4 | 0 | | + 25 | 0 | 5 | 0 | | + 25 | 0 | 6 | 0 | | + 25 | 0 | 7 | 0 | | + 25 | 0 | 8 | 0 | | + 25 | 0 | 9 | 0 | | + 25 | 0 | 10 | 0 | | + 25 | 0 | 11 | 0 | | + 25 | 0 | 12 | 0 | | + 25 | 0 | 13 | 0 | | + 26 | 0 | 1 | 0 | | + 26 | 0 | 2 | 0 | | + 26 | 0 | 3 | 0 | | + 26 | 0 | 4 | 0 | | + 26 | 0 | 5 | 0 | | + 26 | 0 | 6 | 0 | | + 26 | 0 | 7 | 0 | | + 26 | 0 | 8 | 0 | | + 26 | 0 | 9 | 0 | | + 26 | 0 | 10 | 0 | | + 26 | 0 | 11 | 0 | | + 26 | 0 | 12 | 0 | | + 26 | 0 | 13 | 0 | | + 27 | 0 | 1 | 0 | | + 27 | 0 | 2 | 0 | | + 27 | 0 | 3 | 0 | | + 27 | 0 | 4 | 0 | | + 27 | 0 | 5 | 0 | | + 27 | 0 | 6 | 0 | | + 27 | 0 | 7 | 0 | | + 27 | 0 | 8 | 0 | | + 27 | 0 | 9 | 0 | | + 27 | 0 | 10 | 0 | | + 27 | 0 | 11 | 0 | | + 27 | 0 | 12 | 0 | | + 27 | 0 | 13 | 0 | | + | | 14 | 0 | unmatched outer | +(352 rows) + +set local min_parallel_table_scan_size = 0; +set local parallel_setup_cost = 0; +set local enable_hashjoin = on; +savepoint settings; +set max_parallel_workers_per_gather = 1; +set enable_parallel_hash = on; +set work_mem = '64kB'; +INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9); +EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside_batch0 +LEFT OUTER JOIN hashside_wide_batch0 USING (a); + QUERY PLAN +-------------------------------------------------------------------------------------- + Gather (actual rows=469 loops=1) + Workers Planned: 1 + Workers Launched: 1 + -> Parallel Hash Left Join (actual rows=234 loops=2) + Hash Cond: (probeside_batch0.a = hashside_wide_batch0.a) + -> Parallel Seq Scan on probeside_batch0 (actual rows=14 loops=1) + -> Parallel Hash (actual rows=18 loops=2) + Buckets: 8 (originally 8) Batches: 16 (originally 8) + Batch: 0 Stripes: 5 + -> Parallel Seq Scan on hashside_wide_batch0 (actual rows=36 loops=1) +(10 rows) + +SELECT + hashside_wide_batch0.id as hashside_id, + (hashside_wide_batch0.a).hash as hashside_hash, + probeside_batch0.id as probeside_id, + (probeside_batch0.a).hash as probeside_hash, + TRIM((probeside_batch0.a).value) as probeside_trimmed_value, + TRIM((hashside_wide_batch0.a).value) as hashside_trimmed_value +FROM probeside_batch0 +LEFT OUTER JOIN hashside_wide_batch0 USING (a) +ORDER BY 1, 2, 3, 4, 5, 6; + hashside_id | hashside_hash | probeside_id | probeside_hash | probeside_trimmed_value | hashside_trimmed_value +-------------+---------------+--------------+----------------+-------------------------+------------------------ + 1 | 0 | 1 | 0 | | + 1 | 0 | 2 | 0 | | + 1 | 0 | 3 | 0 | | + 1 | 0 | 4 | 0 | | + 1 | 0 | 5 | 0 | | + 1 | 0 | 6 | 0 | | + 1 | 0 | 7 | 0 | | + 1 | 0 | 8 | 0 | | + 1 | 0 | 9 | 0 | | + 1 | 0 | 
10 | 0 | | + 1 | 0 | 11 | 0 | | + 1 | 0 | 12 | 0 | | + 1 | 0 | 13 | 0 | | + 2 | 0 | 1 | 0 | | + 2 | 0 | 2 | 0 | | + 2 | 0 | 3 | 0 | | + 2 | 0 | 4 | 0 | | + 2 | 0 | 5 | 0 | | + 2 | 0 | 6 | 0 | | + 2 | 0 | 7 | 0 | | + 2 | 0 | 8 | 0 | | + 2 | 0 | 9 | 0 | | + 2 | 0 | 10 | 0 | | + 2 | 0 | 11 | 0 | | + 2 | 0 | 12 | 0 | | + 2 | 0 | 13 | 0 | | + 3 | 0 | 1 | 0 | | + 3 | 0 | 2 | 0 | | + 3 | 0 | 3 | 0 | | + 3 | 0 | 4 | 0 | | + 3 | 0 | 5 | 0 | | + 3 | 0 | 6 | 0 | | + 3 | 0 | 7 | 0 | | + 3 | 0 | 8 | 0 | | + 3 | 0 | 9 | 0 | | + 3 | 0 | 10 | 0 | | + 3 | 0 | 11 | 0 | | + 3 | 0 | 12 | 0 | | + 3 | 0 | 13 | 0 | | + 4 | 0 | 1 | 0 | | + 4 | 0 | 2 | 0 | | + 4 | 0 | 3 | 0 | | + 4 | 0 | 4 | 0 | | + 4 | 0 | 5 | 0 | | + 4 | 0 | 6 | 0 | | + 4 | 0 | 7 | 0 | | + 4 | 0 | 8 | 0 | | + 4 | 0 | 9 | 0 | | + 4 | 0 | 10 | 0 | | + 4 | 0 | 11 | 0 | | + 4 | 0 | 12 | 0 | | + 4 | 0 | 13 | 0 | | + 5 | 0 | 1 | 0 | | + 5 | 0 | 2 | 0 | | + 5 | 0 | 3 | 0 | | + 5 | 0 | 4 | 0 | | + 5 | 0 | 5 | 0 | | + 5 | 0 | 6 | 0 | | + 5 | 0 | 7 | 0 | | + 5 | 0 | 8 | 0 | | + 5 | 0 | 9 | 0 | | + 5 | 0 | 10 | 0 | | + 5 | 0 | 11 | 0 | | + 5 | 0 | 12 | 0 | | + 5 | 0 | 13 | 0 | | + 6 | 0 | 1 | 0 | | + 6 | 0 | 2 | 0 | | + 6 | 0 | 3 | 0 | | + 6 | 0 | 4 | 0 | | + 6 | 0 | 5 | 0 | | + 6 | 0 | 6 | 0 | | + 6 | 0 | 7 | 0 | | + 6 | 0 | 8 | 0 | | + 6 | 0 | 9 | 0 | | + 6 | 0 | 10 | 0 | | + 6 | 0 | 11 | 0 | | + 6 | 0 | 12 | 0 | | + 6 | 0 | 13 | 0 | | + 7 | 0 | 1 | 0 | | + 7 | 0 | 2 | 0 | | + 7 | 0 | 3 | 0 | | + 7 | 0 | 4 | 0 | | + 7 | 0 | 5 | 0 | | + 7 | 0 | 6 | 0 | | + 7 | 0 | 7 | 0 | | + 7 | 0 | 8 | 0 | | + 7 | 0 | 9 | 0 | | + 7 | 0 | 10 | 0 | | + 7 | 0 | 11 | 0 | | + 7 | 0 | 12 | 0 | | + 7 | 0 | 13 | 0 | | + 8 | 0 | 1 | 0 | | + 8 | 0 | 2 | 0 | | + 8 | 0 | 3 | 0 | | + 8 | 0 | 4 | 0 | | + 8 | 0 | 5 | 0 | | + 8 | 0 | 6 | 0 | | + 8 | 0 | 7 | 0 | | + 8 | 0 | 8 | 0 | | + 8 | 0 | 9 | 0 | | + 8 | 0 | 10 | 0 | | + 8 | 0 | 11 | 0 | | + 8 | 0 | 12 | 0 | | + 8 | 0 | 13 | 0 | | + 9 | 0 | 1 | 0 | | + 9 | 0 | 2 | 0 | | + 9 | 0 | 3 | 0 | | + 9 | 0 | 4 | 0 | | + 9 | 0 | 5 | 0 | | + 9 | 0 | 6 | 0 | | + 9 | 0 | 7 | 0 | | + 9 | 0 | 8 | 0 | | + 9 | 0 | 9 | 0 | | + 9 | 0 | 10 | 0 | | + 9 | 0 | 11 | 0 | | + 9 | 0 | 12 | 0 | | + 9 | 0 | 13 | 0 | | + 10 | 0 | 1 | 0 | | + 10 | 0 | 2 | 0 | | + 10 | 0 | 3 | 0 | | + 10 | 0 | 4 | 0 | | + 10 | 0 | 5 | 0 | | + 10 | 0 | 6 | 0 | | + 10 | 0 | 7 | 0 | | + 10 | 0 | 8 | 0 | | + 10 | 0 | 9 | 0 | | + 10 | 0 | 10 | 0 | | + 10 | 0 | 11 | 0 | | + 10 | 0 | 12 | 0 | | + 10 | 0 | 13 | 0 | | + 11 | 0 | 1 | 0 | | + 11 | 0 | 2 | 0 | | + 11 | 0 | 3 | 0 | | + 11 | 0 | 4 | 0 | | + 11 | 0 | 5 | 0 | | + 11 | 0 | 6 | 0 | | + 11 | 0 | 7 | 0 | | + 11 | 0 | 8 | 0 | | + 11 | 0 | 9 | 0 | | + 11 | 0 | 10 | 0 | | + 11 | 0 | 11 | 0 | | + 11 | 0 | 12 | 0 | | + 11 | 0 | 13 | 0 | | + 12 | 0 | 1 | 0 | | + 12 | 0 | 2 | 0 | | + 12 | 0 | 3 | 0 | | + 12 | 0 | 4 | 0 | | + 12 | 0 | 5 | 0 | | + 12 | 0 | 6 | 0 | | + 12 | 0 | 7 | 0 | | + 12 | 0 | 8 | 0 | | + 12 | 0 | 9 | 0 | | + 12 | 0 | 10 | 0 | | + 12 | 0 | 11 | 0 | | + 12 | 0 | 12 | 0 | | + 12 | 0 | 13 | 0 | | + 13 | 0 | 1 | 0 | | + 13 | 0 | 2 | 0 | | + 13 | 0 | 3 | 0 | | + 13 | 0 | 4 | 0 | | + 13 | 0 | 5 | 0 | | + 13 | 0 | 6 | 0 | | + 13 | 0 | 7 | 0 | | + 13 | 0 | 8 | 0 | | + 13 | 0 | 9 | 0 | | + 13 | 0 | 10 | 0 | | + 13 | 0 | 11 | 0 | | + 13 | 0 | 12 | 0 | | + 13 | 0 | 13 | 0 | | + 14 | 0 | 1 | 0 | | + 14 | 0 | 2 | 0 | | + 14 | 0 | 3 | 0 | | + 14 | 0 | 4 | 0 | | + 14 | 0 | 5 | 0 | | + 14 | 0 | 6 | 0 | | + 14 | 0 | 7 | 0 | | + 14 | 0 | 8 | 0 | | + 14 | 0 | 9 | 0 | | + 14 | 0 | 10 | 0 | | + 14 | 0 | 11 | 0 | | + 14 | 0 | 12 | 0 | | + 14 
| 0 | 13 | 0 | | + 15 | 0 | 1 | 0 | | + 15 | 0 | 2 | 0 | | + 15 | 0 | 3 | 0 | | + 15 | 0 | 4 | 0 | | + 15 | 0 | 5 | 0 | | + 15 | 0 | 6 | 0 | | + 15 | 0 | 7 | 0 | | + 15 | 0 | 8 | 0 | | + 15 | 0 | 9 | 0 | | + 15 | 0 | 10 | 0 | | + 15 | 0 | 11 | 0 | | + 15 | 0 | 12 | 0 | | + 15 | 0 | 13 | 0 | | + 16 | 0 | 1 | 0 | | + 16 | 0 | 2 | 0 | | + 16 | 0 | 3 | 0 | | + 16 | 0 | 4 | 0 | | + 16 | 0 | 5 | 0 | | + 16 | 0 | 6 | 0 | | + 16 | 0 | 7 | 0 | | + 16 | 0 | 8 | 0 | | + 16 | 0 | 9 | 0 | | + 16 | 0 | 10 | 0 | | + 16 | 0 | 11 | 0 | | + 16 | 0 | 12 | 0 | | + 16 | 0 | 13 | 0 | | + 17 | 0 | 1 | 0 | | + 17 | 0 | 2 | 0 | | + 17 | 0 | 3 | 0 | | + 17 | 0 | 4 | 0 | | + 17 | 0 | 5 | 0 | | + 17 | 0 | 6 | 0 | | + 17 | 0 | 7 | 0 | | + 17 | 0 | 8 | 0 | | + 17 | 0 | 9 | 0 | | + 17 | 0 | 10 | 0 | | + 17 | 0 | 11 | 0 | | + 17 | 0 | 12 | 0 | | + 17 | 0 | 13 | 0 | | + 18 | 0 | 1 | 0 | | + 18 | 0 | 2 | 0 | | + 18 | 0 | 3 | 0 | | + 18 | 0 | 4 | 0 | | + 18 | 0 | 5 | 0 | | + 18 | 0 | 6 | 0 | | + 18 | 0 | 7 | 0 | | + 18 | 0 | 8 | 0 | | + 18 | 0 | 9 | 0 | | + 18 | 0 | 10 | 0 | | + 18 | 0 | 11 | 0 | | + 18 | 0 | 12 | 0 | | + 18 | 0 | 13 | 0 | | + 19 | 0 | 1 | 0 | | + 19 | 0 | 2 | 0 | | + 19 | 0 | 3 | 0 | | + 19 | 0 | 4 | 0 | | + 19 | 0 | 5 | 0 | | + 19 | 0 | 6 | 0 | | + 19 | 0 | 7 | 0 | | + 19 | 0 | 8 | 0 | | + 19 | 0 | 9 | 0 | | + 19 | 0 | 10 | 0 | | + 19 | 0 | 11 | 0 | | + 19 | 0 | 12 | 0 | | + 19 | 0 | 13 | 0 | | + 20 | 0 | 1 | 0 | | + 20 | 0 | 2 | 0 | | + 20 | 0 | 3 | 0 | | + 20 | 0 | 4 | 0 | | + 20 | 0 | 5 | 0 | | + 20 | 0 | 6 | 0 | | + 20 | 0 | 7 | 0 | | + 20 | 0 | 8 | 0 | | + 20 | 0 | 9 | 0 | | + 20 | 0 | 10 | 0 | | + 20 | 0 | 11 | 0 | | + 20 | 0 | 12 | 0 | | + 20 | 0 | 13 | 0 | | + 21 | 0 | 1 | 0 | | + 21 | 0 | 2 | 0 | | + 21 | 0 | 3 | 0 | | + 21 | 0 | 4 | 0 | | + 21 | 0 | 5 | 0 | | + 21 | 0 | 6 | 0 | | + 21 | 0 | 7 | 0 | | + 21 | 0 | 8 | 0 | | + 21 | 0 | 9 | 0 | | + 21 | 0 | 10 | 0 | | + 21 | 0 | 11 | 0 | | + 21 | 0 | 12 | 0 | | + 21 | 0 | 13 | 0 | | + 22 | 0 | 1 | 0 | | + 22 | 0 | 2 | 0 | | + 22 | 0 | 3 | 0 | | + 22 | 0 | 4 | 0 | | + 22 | 0 | 5 | 0 | | + 22 | 0 | 6 | 0 | | + 22 | 0 | 7 | 0 | | + 22 | 0 | 8 | 0 | | + 22 | 0 | 9 | 0 | | + 22 | 0 | 10 | 0 | | + 22 | 0 | 11 | 0 | | + 22 | 0 | 12 | 0 | | + 22 | 0 | 13 | 0 | | + 23 | 0 | 1 | 0 | | + 23 | 0 | 2 | 0 | | + 23 | 0 | 3 | 0 | | + 23 | 0 | 4 | 0 | | + 23 | 0 | 5 | 0 | | + 23 | 0 | 6 | 0 | | + 23 | 0 | 7 | 0 | | + 23 | 0 | 8 | 0 | | + 23 | 0 | 9 | 0 | | + 23 | 0 | 10 | 0 | | + 23 | 0 | 11 | 0 | | + 23 | 0 | 12 | 0 | | + 23 | 0 | 13 | 0 | | + 24 | 0 | 1 | 0 | | + 24 | 0 | 2 | 0 | | + 24 | 0 | 3 | 0 | | + 24 | 0 | 4 | 0 | | + 24 | 0 | 5 | 0 | | + 24 | 0 | 6 | 0 | | + 24 | 0 | 7 | 0 | | + 24 | 0 | 8 | 0 | | + 24 | 0 | 9 | 0 | | + 24 | 0 | 10 | 0 | | + 24 | 0 | 11 | 0 | | + 24 | 0 | 12 | 0 | | + 24 | 0 | 13 | 0 | | + 25 | 0 | 1 | 0 | | + 25 | 0 | 2 | 0 | | + 25 | 0 | 3 | 0 | | + 25 | 0 | 4 | 0 | | + 25 | 0 | 5 | 0 | | + 25 | 0 | 6 | 0 | | + 25 | 0 | 7 | 0 | | + 25 | 0 | 8 | 0 | | + 25 | 0 | 9 | 0 | | + 25 | 0 | 10 | 0 | | + 25 | 0 | 11 | 0 | | + 25 | 0 | 12 | 0 | | + 25 | 0 | 13 | 0 | | + 26 | 0 | 1 | 0 | | + 26 | 0 | 2 | 0 | | + 26 | 0 | 3 | 0 | | + 26 | 0 | 4 | 0 | | + 26 | 0 | 5 | 0 | | + 26 | 0 | 6 | 0 | | + 26 | 0 | 7 | 0 | | + 26 | 0 | 8 | 0 | | + 26 | 0 | 9 | 0 | | + 26 | 0 | 10 | 0 | | + 26 | 0 | 11 | 0 | | + 26 | 0 | 12 | 0 | | + 26 | 0 | 13 | 0 | | + 27 | 0 | 1 | 0 | | + 27 | 0 | 2 | 0 | | + 27 | 0 | 3 | 0 | | + 27 | 0 | 4 | 0 | | + 27 | 0 | 5 | 0 | | + 27 | 0 | 6 | 0 | | + 27 | 0 | 7 | 0 | | + 27 | 0 | 8 | 0 | | + 27 | 0 | 9 | 0 | | + 27 | 0 | 10 | 0 | | + 
27 | 0 | 11 | 0 | | + 27 | 0 | 12 | 0 | | + 27 | 0 | 13 | 0 | | + 28 | 0 | 1 | 0 | | + 28 | 0 | 2 | 0 | | + 28 | 0 | 3 | 0 | | + 28 | 0 | 4 | 0 | | + 28 | 0 | 5 | 0 | | + 28 | 0 | 6 | 0 | | + 28 | 0 | 7 | 0 | | + 28 | 0 | 8 | 0 | | + 28 | 0 | 9 | 0 | | + 28 | 0 | 10 | 0 | | + 28 | 0 | 11 | 0 | | + 28 | 0 | 12 | 0 | | + 28 | 0 | 13 | 0 | | + 29 | 0 | 1 | 0 | | + 29 | 0 | 2 | 0 | | + 29 | 0 | 3 | 0 | | + 29 | 0 | 4 | 0 | | + 29 | 0 | 5 | 0 | | + 29 | 0 | 6 | 0 | | + 29 | 0 | 7 | 0 | | + 29 | 0 | 8 | 0 | | + 29 | 0 | 9 | 0 | | + 29 | 0 | 10 | 0 | | + 29 | 0 | 11 | 0 | | + 29 | 0 | 12 | 0 | | + 29 | 0 | 13 | 0 | | + 30 | 0 | 1 | 0 | | + 30 | 0 | 2 | 0 | | + 30 | 0 | 3 | 0 | | + 30 | 0 | 4 | 0 | | + 30 | 0 | 5 | 0 | | + 30 | 0 | 6 | 0 | | + 30 | 0 | 7 | 0 | | + 30 | 0 | 8 | 0 | | + 30 | 0 | 9 | 0 | | + 30 | 0 | 10 | 0 | | + 30 | 0 | 11 | 0 | | + 30 | 0 | 12 | 0 | | + 30 | 0 | 13 | 0 | | + 31 | 0 | 1 | 0 | | + 31 | 0 | 2 | 0 | | + 31 | 0 | 3 | 0 | | + 31 | 0 | 4 | 0 | | + 31 | 0 | 5 | 0 | | + 31 | 0 | 6 | 0 | | + 31 | 0 | 7 | 0 | | + 31 | 0 | 8 | 0 | | + 31 | 0 | 9 | 0 | | + 31 | 0 | 10 | 0 | | + 31 | 0 | 11 | 0 | | + 31 | 0 | 12 | 0 | | + 31 | 0 | 13 | 0 | | + 32 | 0 | 1 | 0 | | + 32 | 0 | 2 | 0 | | + 32 | 0 | 3 | 0 | | + 32 | 0 | 4 | 0 | | + 32 | 0 | 5 | 0 | | + 32 | 0 | 6 | 0 | | + 32 | 0 | 7 | 0 | | + 32 | 0 | 8 | 0 | | + 32 | 0 | 9 | 0 | | + 32 | 0 | 10 | 0 | | + 32 | 0 | 11 | 0 | | + 32 | 0 | 12 | 0 | | + 32 | 0 | 13 | 0 | | + 33 | 0 | 1 | 0 | | + 33 | 0 | 2 | 0 | | + 33 | 0 | 3 | 0 | | + 33 | 0 | 4 | 0 | | + 33 | 0 | 5 | 0 | | + 33 | 0 | 6 | 0 | | + 33 | 0 | 7 | 0 | | + 33 | 0 | 8 | 0 | | + 33 | 0 | 9 | 0 | | + 33 | 0 | 10 | 0 | | + 33 | 0 | 11 | 0 | | + 33 | 0 | 12 | 0 | | + 33 | 0 | 13 | 0 | | + 34 | 0 | 1 | 0 | | + 34 | 0 | 2 | 0 | | + 34 | 0 | 3 | 0 | | + 34 | 0 | 4 | 0 | | + 34 | 0 | 5 | 0 | | + 34 | 0 | 6 | 0 | | + 34 | 0 | 7 | 0 | | + 34 | 0 | 8 | 0 | | + 34 | 0 | 9 | 0 | | + 34 | 0 | 10 | 0 | | + 34 | 0 | 11 | 0 | | + 34 | 0 | 12 | 0 | | + 34 | 0 | 13 | 0 | | + 35 | 0 | 1 | 0 | | + 35 | 0 | 2 | 0 | | + 35 | 0 | 3 | 0 | | + 35 | 0 | 4 | 0 | | + 35 | 0 | 5 | 0 | | + 35 | 0 | 6 | 0 | | + 35 | 0 | 7 | 0 | | + 35 | 0 | 8 | 0 | | + 35 | 0 | 9 | 0 | | + 35 | 0 | 10 | 0 | | + 35 | 0 | 11 | 0 | | + 35 | 0 | 12 | 0 | | + 35 | 0 | 13 | 0 | | + 36 | 0 | 1 | 0 | | + 36 | 0 | 2 | 0 | | + 36 | 0 | 3 | 0 | | + 36 | 0 | 4 | 0 | | + 36 | 0 | 5 | 0 | | + 36 | 0 | 6 | 0 | | + 36 | 0 | 7 | 0 | | + 36 | 0 | 8 | 0 | | + 36 | 0 | 9 | 0 | | + 36 | 0 | 10 | 0 | | + 36 | 0 | 11 | 0 | | + 36 | 0 | 12 | 0 | | + 36 | 0 | 13 | 0 | | + | | 14 | 0 | unmatched outer | +(469 rows) + +rollback to settings; +rollback; diff --git a/src/test/regress/sql/join_hash.sql b/src/test/regress/sql/join_hash.sql index 68c1a8c7b6..d9f8a115d8 100644 --- a/src/test/regress/sql/join_hash.sql +++ b/src/test/regress/sql/join_hash.sql @@ -450,22 +450,26 @@ rollback to settings; -- parallel with parallel-aware hash join (hits ExecParallelHashLoadTuple and -- sts_puttuple oversized tuple cases because it's multi-batch) -savepoint settings; -set max_parallel_workers_per_gather = 2; -set enable_parallel_hash = on; -set work_mem = '128kB'; -explain (costs off) - select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); -select length(max(s.t)) -from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); -select final > 1 as multibatch - from hash_join_batches( -$$ - select length(max(s.t)) - from wide left join (select id, coalesce(t, '') || '' as t from wide) s 
using (id); -$$); -rollback to settings; +-- savepoint settings; +-- set max_parallel_workers_per_gather = 2; +-- set enable_parallel_hash = on; +-- TODO: throw an error when this happens: cannot set work_mem lower than the side of a single tuple +-- TODO: ensure that oversize tuple code is still exercised (should be with some of the stub stuff below) +-- TODO: commented this out since it would crash otherwise +-- this test is no longer multi-batch, so, perhaps, it should be removed +-- set work_mem = '128kB'; +-- explain (costs off) +-- select length(max(s.t)) +-- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +-- select length(max(s.t)) +-- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +-- select final > 1 as multibatch +-- from hash_join_batches( +-- $$ +-- select length(max(s.t)) +-- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +-- $$); +-- rollback to settings; rollback; @@ -538,3 +542,181 @@ WHERE AND hjtest_1.a <> hjtest_2.b; ROLLBACK; + +-- Serial Adaptive Hash Join + +BEGIN; +CREATE TYPE stub AS (hash INTEGER, value CHAR(8090)); + +CREATE FUNCTION stub_hash(item stub) +RETURNS INTEGER AS $$ +DECLARE + batch_size INTEGER; +BEGIN + batch_size := 4; + RETURN item.hash << (batch_size - 1); +END; $$ LANGUAGE plpgsql IMMUTABLE LEAKPROOF STRICT PARALLEL SAFE; + +CREATE FUNCTION stub_eq(item1 stub, item2 stub) +RETURNS BOOLEAN AS $$ +BEGIN + RETURN item1.hash = item2.hash AND item1.value = item2.value; +END; $$ LANGUAGE plpgsql IMMUTABLE LEAKPROOF STRICT PARALLEL SAFE; + +CREATE OPERATOR = ( + FUNCTION = stub_eq, + LEFTARG = stub, + RIGHTARG = stub, + COMMUTATOR = =, + HASHES, MERGES +); + +CREATE OPERATOR CLASS stub_hash_ops +DEFAULT FOR TYPE stub USING hash AS + OPERATOR 1 =(stub, stub), + FUNCTION 1 stub_hash(stub); + +CREATE TABLE probeside(a stub); +ALTER TABLE probeside ALTER COLUMN a SET STORAGE PLAIN; +-- non-fallback batch with unmatched outer tuple +INSERT INTO probeside SELECT '(2, "")' FROM generate_series(1, 1); +-- fallback batch unmatched outer tuple (in first stripe maybe) +INSERT INTO probeside SELECT '(1, "unmatched outer tuple")' FROM generate_series(1, 1); +-- fallback batch matched outer tuple +INSERT INTO probeside SELECT '(1, "")' FROM generate_series(1, 5); +-- fallback batch unmatched outer tuple (in last stripe maybe) +-- When numbatches=4, hash 5 maps to batch 1, but after numbatches doubles to +-- 8 batches hash 5 maps to batch 5. 
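+-- As an illustration (assuming batch assignment follows ExecHashGetBucketAndBatch,
+-- i.e. batchno = (hashvalue >> log2_nbuckets) & (nbatch - 1)): stub_hash returns
+-- item.hash << 3, so with 8 buckets batchno reduces to hash & (nbatch - 1),
+-- giving 5 & 3 = 1 with 4 batches but 5 & 7 = 5 with 8 batches.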
+INSERT INTO probeside SELECT '(5, "")' FROM generate_series(1, 1); +-- non-fallback batch matched outer tuple +INSERT INTO probeside SELECT '(3, "")' FROM generate_series(1, 1); +-- batch with 3 stripes where non-first/non-last stripe contains unmatched outer tuple +INSERT INTO probeside SELECT '(6, "")' FROM generate_series(1, 5); +INSERT INTO probeside SELECT '(6, "unmatched outer tuple")' FROM generate_series(1, 1); +INSERT INTO probeside SELECT '(6, "")' FROM generate_series(1, 1); + +CREATE TABLE hashside_wide(a stub, id int); +ALTER TABLE hashside_wide ALTER COLUMN a SET STORAGE PLAIN; +-- falls back with an unmatched inner tuple that is in fist, middle, and last +-- stripe +INSERT INTO hashside_wide SELECT '(1, "unmatched inner tuple in first stripe")', 1 FROM generate_series(1, 1); +INSERT INTO hashside_wide SELECT '(1, "")', 1 FROM generate_series(1, 9); +INSERT INTO hashside_wide SELECT '(1, "unmatched inner tuple in middle stripe")', 1 FROM generate_series(1, 1); +INSERT INTO hashside_wide SELECT '(1, "")', 1 FROM generate_series(1, 9); +INSERT INTO hashside_wide SELECT '(1, "unmatched inner tuple in last stripe")', 1 FROM generate_series(1, 1); + +-- doesn't fall back -- matched tuple +INSERT INTO hashside_wide SELECT '(3, "")', 3 FROM generate_series(1, 1); +INSERT INTO hashside_wide SELECT '(6, "")', 6 FROM generate_series(1, 20); + +ANALYZE probeside, hashside_wide; + +SET enable_nestloop TO off; +SET enable_mergejoin TO off; +SET work_mem = 64; + +SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value) +FROM probeside +LEFT OUTER JOIN hashside_wide USING (a) +ORDER BY 1, 2, 3, 4, 5; + +EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside +LEFT OUTER JOIN hashside_wide USING (a); + +SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value) +FROM probeside +RIGHT OUTER JOIN hashside_wide USING (a) +ORDER BY 1, 2, 3, 4, 5; + +EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside +RIGHT OUTER JOIN hashside_wide USING (a); + +SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value) +FROM probeside +FULL OUTER JOIN hashside_wide USING (a) +ORDER BY 1, 2, 3, 4, 5; + +EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside +FULL OUTER JOIN hashside_wide USING (a); + +-- semi-join testcase +EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) +SELECT probeside.* FROM probeside WHERE EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a); + +SELECT (probeside.a).hash, TRIM((probeside.a).value) +FROM probeside WHERE EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a) ORDER BY 1, 2; + +-- anti-join testcase +EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) +SELECT probeside.* FROM probeside WHERE NOT EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a); + +SELECT (probeside.a).hash, TRIM((probeside.a).value) +FROM probeside WHERE NOT EXISTS (SELECT * FROM hashside_wide WHERE probeside.a=a) ORDER BY 1, 2; + +-- parallel LOJ test case with two batches falling back +savepoint settings; +set local max_parallel_workers_per_gather = 1; +set local min_parallel_table_scan_size = 0; +set local parallel_setup_cost = 0; +set local enable_parallel_hash = on; + +EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside 
+LEFT OUTER JOIN hashside_wide USING (a); + +SELECT (probeside.a).hash, TRIM((probeside.a).value), hashside_wide.id, (hashside_wide.a).hash, TRIM((hashside_wide.a).value) +FROM probeside +LEFT OUTER JOIN hashside_wide USING (a) +ORDER BY 1, 2, 3, 4, 5; +rollback to settings; + +-- Test spill of batch 0 gives correct results. +CREATE TABLE probeside_batch0(id int generated always as identity, a stub); +ALTER TABLE probeside_batch0 ALTER COLUMN a SET STORAGE PLAIN; +INSERT INTO probeside_batch0(a) SELECT '(0, "")' FROM generate_series(1, 13); +INSERT INTO probeside_batch0(a) SELECT '(0, "unmatched outer")' FROM generate_series(1, 1); + +CREATE TABLE hashside_wide_batch0(id int generated always as identity, a stub); +ALTER TABLE hashside_wide_batch0 ALTER COLUMN a SET STORAGE PLAIN; +INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9); +INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9); +INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9); +ANALYZE probeside_batch0, hashside_wide_batch0; + +SELECT + hashside_wide_batch0.id as hashside_id, + (hashside_wide_batch0.a).hash as hashside_hash, + probeside_batch0.id as probeside_id, + (probeside_batch0.a).hash as probeside_hash, + TRIM((probeside_batch0.a).value) as probeside_trimmed_value, + TRIM((hashside_wide_batch0.a).value) as hashside_trimmed_value +FROM probeside_batch0 +LEFT OUTER JOIN hashside_wide_batch0 USING (a) +ORDER BY 1, 2, 3, 4, 5, 6; + +set local min_parallel_table_scan_size = 0; +set local parallel_setup_cost = 0; +set local enable_hashjoin = on; + +savepoint settings; +set max_parallel_workers_per_gather = 1; +set enable_parallel_hash = on; +set work_mem = '64kB'; + +INSERT INTO hashside_wide_batch0(a) SELECT '(0, "")' FROM generate_series(1, 9); + +EXPLAIN (ANALYZE, summary off, timing off, costs off, usage off) SELECT * FROM probeside_batch0 +LEFT OUTER JOIN hashside_wide_batch0 USING (a); + +SELECT + hashside_wide_batch0.id as hashside_id, + (hashside_wide_batch0.a).hash as hashside_hash, + probeside_batch0.id as probeside_id, + (probeside_batch0.a).hash as probeside_hash, + TRIM((probeside_batch0.a).value) as probeside_trimmed_value, + TRIM((hashside_wide_batch0.a).value) as hashside_trimmed_value +FROM probeside_batch0 +LEFT OUTER JOIN hashside_wide_batch0 USING (a) +ORDER BY 1, 2, 3, 4, 5, 6; +rollback to settings; + +rollback; -- 2.20.1
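A minimal usage sketch of the SharedBits API declared in src/include/utils/sharedbits.h,
based only on the declarations there: the surrounding function, its parameters, the
"outer-match-bits" name, and the one-bit-per-outer-tuple sizing are illustrative
assumptions, not code taken from this patch.

    /* hypothetical SharedBits call sequence; barriers and error handling omitted */
    #include "postgres.h"
    #include "storage/buffile.h"
    #include "storage/sharedfileset.h"
    #include "utils/sharedbits.h"

    static void
    sharedbits_usage_sketch(SharedBits *sbits, SharedFileSet *fileset,
                            int participant, int nparticipants, uint32 nouter)
    {
        SharedBitsAccessor *accessor;

        /* one participant creates the shared state; the others attach to it */
        if (participant == 0)
            accessor = sb_initialize(sbits, nparticipants, participant,
                                     fileset, "outer-match-bits");
        else
            accessor = sb_attach(sbits, participant, fileset);

        /* assumed sizing: one bit per outer-side tuple id */
        sb_initialize_accessor(accessor, nouter);

        /* while probing a stripe: mark outer tuple ids that found a match */
        sb_setbit(accessor, 42);
        sb_end_write(accessor);

        /* after the last stripe: merge per-participant bits, then scan them */
        (void) sb_combine(accessor);
        if (!sb_checkbit(accessor, 42))
        {
            /* bit not set: outer tuple 42 never matched any inner tuple */
        }
        sb_end_read(accessor);
    }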