From fe050ef3a451660f90a3801e759aa4257a9abfd1 Mon Sep 17 00:00:00 2001 From: Jehan-Guillaume de Rorthais Date: Mon, 27 Mar 2023 15:54:39 +0200 Subject: [PATCH 2/4] Allocate hash batches related BufFile in a dedicated context --- src/backend/executor/nodeHash.c | 35 +++++++++++++++++++++-------- src/backend/executor/nodeHashjoin.c | 18 ++++++++++----- src/include/executor/hashjoin.h | 15 ++++++++++--- src/include/executor/nodeHashjoin.h | 2 +- 4 files changed, 52 insertions(+), 18 deletions(-) diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index a45bd3a315..4544296391 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -484,7 +484,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations, * * The hashtable control block is just palloc'd from the executor's * per-query memory context. Everything else should be kept inside the - * subsidiary hashCxt or batchCxt. + * subsidiary hashCxt, batchCxt or fileCxt. 
*/ hashtable = palloc_object(HashJoinTableData); hashtable->nbuckets = nbuckets; @@ -538,6 +538,10 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations, "HashBatchContext", ALLOCSET_DEFAULT_SIZES); + hashtable->fileCxt = AllocSetContextCreate(hashtable->hashCxt, + "HashBatchFiles", + ALLOCSET_DEFAULT_SIZES); + /* Allocate data that will live for the life of the hashjoin */ oldcxt = MemoryContextSwitchTo(hashtable->hashCxt); @@ -570,15 +574,21 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations, if (nbatch > 1 && hashtable->parallel_state == NULL) { + MemoryContext oldctx; + /* * allocate and initialize the file arrays in hashCxt (not needed for * parallel case which uses shared tuplestores instead of raw files) */ + oldctx = MemoryContextSwitchTo(hashtable->fileCxt); + hashtable->innerBatchFile = palloc0_array(BufFile *, nbatch); hashtable->outerBatchFile = palloc0_array(BufFile *, nbatch); /* The files will not be opened until needed... */ /* ... 
but make sure we have temp tablespaces established for them */ PrepareTempTablespaces(); + + MemoryContextSwitchTo(oldctx); } MemoryContextSwitchTo(oldcxt); @@ -913,7 +923,6 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) int oldnbatch = hashtable->nbatch; int curbatch = hashtable->curbatch; int nbatch; - MemoryContext oldcxt; long ninmemory; long nfreed; HashMemoryChunk oldchunks; @@ -934,13 +943,16 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) hashtable, nbatch, hashtable->spaceUsed); #endif - oldcxt = MemoryContextSwitchTo(hashtable->hashCxt); - if (hashtable->innerBatchFile == NULL) { + MemoryContext oldcxt = MemoryContextSwitchTo(hashtable->fileCxt); + /* we had no file arrays before */ hashtable->innerBatchFile = palloc0_array(BufFile *, nbatch); hashtable->outerBatchFile = palloc0_array(BufFile *, nbatch); + + MemoryContextSwitchTo(oldcxt); + /* time to establish the temp tablespaces, too */ PrepareTempTablespaces(); } @@ -951,8 +963,6 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) hashtable->outerBatchFile = repalloc0_array(hashtable->outerBatchFile, BufFile *, oldnbatch, nbatch); } - MemoryContextSwitchTo(oldcxt); - hashtable->nbatch = nbatch; /* @@ -1022,9 +1032,11 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) { /* dump it out */ Assert(batchno > curbatch); + ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(hashTuple), hashTuple->hashvalue, - &hashtable->innerBatchFile[batchno]); + &hashtable->innerBatchFile[batchno], + hashtable->fileCxt); hashtable->spaceUsed -= hashTupleSize; nfreed++; @@ -1681,9 +1693,11 @@ ExecHashTableInsert(HashJoinTable hashtable, * put the tuple into a temp file for later batches */ Assert(batchno > hashtable->curbatch); + ExecHashJoinSaveTuple(tuple, hashvalue, - &hashtable->innerBatchFile[batchno]); + &hashtable->innerBatchFile[batchno], + hashtable->fileCxt); } if (shouldFree) @@ -2663,8 +2677,11 @@ ExecHashRemoveNextSkewBucket(HashJoinTable hashtable) { /* Put the tuple into a temp file for later 
batches */ Assert(batchno > hashtable->curbatch); + ExecHashJoinSaveTuple(tuple, hashvalue, - &hashtable->innerBatchFile[batchno]); + &hashtable->innerBatchFile[batchno], + hashtable->fileCxt); + pfree(hashTuple); hashtable->spaceUsed -= tupleSize; hashtable->spaceUsedSkew -= tupleSize; diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 5454afbff2..5bc7f814c6 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -485,8 +485,10 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) */ Assert(parallel_state == NULL); Assert(batchno > hashtable->curbatch); + ExecHashJoinSaveTuple(mintuple, hashvalue, - &hashtable->outerBatchFile[batchno]); + &hashtable->outerBatchFile[batchno], + hashtable->fileCxt); if (shouldFree) heap_free_minimal_tuple(mintuple); @@ -1297,21 +1299,27 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate) * The data recorded in the file for each tuple is its hash value, * then the tuple in MinimalTuple format. * - * Note: it is important always to call this in the regular executor - * context, not in a shorter-lived context; else the temp file buffers - * will get messed up. + * Note: it is important always to call this in the HashBatchFiles context, + * not in a shorter-lived context; else the temp file buffers will get messed + * up. */ void ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue, - BufFile **fileptr) + BufFile **fileptr, MemoryContext filecxt) { BufFile *file = *fileptr; if (file == NULL) { + MemoryContext oldctx; + + oldctx = MemoryContextSwitchTo(filecxt); + /* First write to this batch file, so open it. 
*/ file = BufFileCreateTemp(false); *fileptr = file; + + MemoryContextSwitchTo(oldctx); } BufFileWrite(file, &hashvalue, sizeof(uint32)); diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index 8ee59d2c71..74867c3e40 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -25,10 +25,14 @@ * * Each active hashjoin has a HashJoinTable control block, which is * palloc'd in the executor's per-query context. All other storage needed - * for the hashjoin is kept in private memory contexts, two for each hashjoin. + * for the hashjoin is kept in private memory contexts, three for each + * hashjoin: + * - HashTableContext (hashCxt): the control block associated with the hash table + * - HashBatchContext (batchCxt): storage for batches + * - HashBatchFiles (fileCxt): storage for temp file buffers + * * This makes it easy and fast to release the storage when we don't need it - * anymore. (Exception: data associated with the temp files lives in the - * per-query context too, since we always call buffile.c in that context.) + * anymore. * * The hashtable contexts are made children of the per-query context, ensuring * that they will be discarded at end of statement even if the join is @@ -39,6 +43,10 @@ * "hashCxt", while storage that is only wanted for the current batch is * allocated in the "batchCxt". By resetting the batchCxt at the end of * each batch, we free all the per-batch storage reliably and without tedium. + * Note that data associated with the temp files lives in the "fileCxt" context + * which lives during the entire join as temp files might need to survive + * batches. These files are explicitly destroyed by calling BufFileClose() + * when the code is done with them. * * During first scan of inner relation, we get its tuples from executor. 
* If nbatch > 1 then tuples that don't belong in first batch get saved @@ -350,6 +358,7 @@ typedef struct HashJoinTableData MemoryContext hashCxt; /* context for whole-hash-join storage */ MemoryContext batchCxt; /* context for this-batch-only storage */ + MemoryContext fileCxt; /* context for the BufFile related storage */ /* used for dense allocation of tuples (into linked chunks) */ HashMemoryChunk chunks; /* one list for the whole batch */ diff --git a/src/include/executor/nodeHashjoin.h b/src/include/executor/nodeHashjoin.h index d367070883..a8f9ae1989 100644 --- a/src/include/executor/nodeHashjoin.h +++ b/src/include/executor/nodeHashjoin.h @@ -29,6 +29,6 @@ extern void ExecHashJoinInitializeWorker(HashJoinState *state, ParallelWorkerContext *pwcxt); extern void ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue, - BufFile **fileptr); + BufFile **fileptr, MemoryContext filecxt); #endif /* NODEHASHJOIN_H */ -- 2.39.2