From 2a1de622d4a83b940fba3ad6f9f1d1af334d2552 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 29 Nov 2023 15:09:17 +0100 Subject: [PATCH v6 2/3] Reduce de-/forming of BRIN tuples in parallel BRIN build De-/forming of the BRIN tuple is actually quite expensive, so in this commit we introduce a mechanism where we don't do that if only one BRIN tuple is stored in the shared sort by moving the serialized tuple into the local context, and only deserialize it once we need to use the memtuple. --- src/backend/access/brin/brin.c | 115 ++++++++++++++++++++------------- 1 file changed, 70 insertions(+), 45 deletions(-) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 001cf04aac..e7ec7c7b53 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -2471,6 +2471,7 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) int i; BrinTuple *btup; BrinMemTuple *memtuple = NULL; + BrinMemTuple *memtup_holder = NULL; Size tuplen; BrinShared *brinshared = brinleader->brinshared; BlockNumber prevblkno = InvalidBlockNumber; @@ -2479,6 +2480,8 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) BrinSpool *spool; MemoryContext rangeCxt, oldCxt; + BrinTuple *prevbtup = NULL; + Size prevtuplen; /* Shutdown worker processes */ WaitForParallelWorkersToFinish(brinleader->pcxt); @@ -2498,7 +2501,7 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) * Initialize BrinMemTuple we'll use to union summaries from workers (in * case they happened to produce parts of the same paga range). */ - memtuple = brin_new_memtuple(state->bs_bdesc); + memtup_holder = brin_new_memtuple(state->bs_bdesc); /* * Create a memory context we'll reset to combine results for a single @@ -2526,9 +2529,10 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) /* * Do we need to union summaries for the same page range? 
* - * If this is the first brin tuple we read, then just deform it into - * the memtuple, and continue with the next one from tuplesort. We - * however may need to insert empty summaries into the index. + * If this is the first brin tuple we read, then cache it and continue + * with the next one from tuplesort. We however may need to insert + * empty summaries into the index for the page ranges < the first + * page. * * If it's the same block as the last we saw, we simply union the brin * tuple into it, and we're done - we don't even need to insert empty @@ -2539,20 +2543,41 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) * page range, we need to do the insert and then deform the tuple into * the memtuple. Then we'll insert empty ranges before the new brin * tuple, if needed. + * + * Because it is likely we only get one tuple for each page range, we + * only deform the tuple when we actually need to merge the data. This + * means we'll only deform the tuple if we detect duplicates, and the + * deformed tuple is stored as memtuple. `memtup_holder` is the + * reference to the BRIN memory tuple, memtuple refers to the same + * tuple if and when in use for merging data. + * + * When writing data to disk, we take care to use the cached copy if + * memtuple is not set, otherwise we form a new tuple and write that + * out to disk. */ if (prevblkno == InvalidBlockNumber) { - /* First brin tuples, just deform into memtuple. */ - memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple); + /* First brin tuple, store it in the local context. */ + prevbtup = palloc0(tuplen); + memcpy(prevbtup, btup, tuplen); + prevtuplen = tuplen; /* continue to insert empty pages before thisblock */ } - else if (memtuple->bt_blkno == btup->bt_blkno) + else if (prevbtup->bt_blkno == btup->bt_blkno) { /* * Not the first brin tuple, but same page range as the previous - * one, so we can merge it into the memtuple. 
If this is the first + * merge for this block number, we still have to deform the + * original tuple before we can merge them. */ + if (memtuple == NULL) + { + memtuple = brin_deform_tuple(state->bs_bdesc, prevbtup, + memtup_holder); + } + union_tuples(state->bs_bdesc, memtuple, btup); continue; } @@ -2563,23 +2588,41 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) /* * We got brin tuple for a different page range, so form a brin - * tuple from the memtuple, insert it, and re-init the memtuple - * from the new brin tuple. + * tuple from the memtuple (if required), insert it, and store a + * copy of the new tuple for future use. */ - tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno, - memtuple, &len); + if (memtuple != NULL) + { + tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno, + memtuple, &len); + } + else + { + tmp = prevbtup; + len = prevtuplen; + } brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess, &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len); /* * Reset the per-output-range context. This frees all the memory - * possibly allocated by the union functions, and also the BRIN - * tuple we just formed and inserted. + * possibly allocated by the union functions, the previous cached + * btuple, and the btup we just may have formed from the memtuple. */ MemoryContextReset(rangeCxt); - memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple); + /* clear any potential pointers to the cleared context */ + memtuple = NULL; + prevbtup = NULL; + + /* + * Prepare for the next iteration by storing the new range's tuple + * in the current context, and resetting memtuple. 
+ */ + prevbtup = palloc0(tuplen); + memcpy(prevbtup, btup, tuplen); + prevtuplen = tuplen; /* continue to insert empty pages before thisblock */ } @@ -2614,44 +2657,26 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) tuplesort_end(spool->sortstate); - /* Fill empty ranges at the end, for all ranges missing in the tuplesort. */ - prevblkno = (prevblkno == InvalidBlockNumber) ? 0 : prevblkno; - while (prevblkno + state->bs_pagesPerRange < memtuple->bt_blkno) + /* + * If we had any result ranges, then we need to finish writing the cached + * data. + */ + if (prevbtup != NULL) { - /* the missing range */ - prevblkno += state->bs_pagesPerRange; - - /* Did we already build the empty range? If not, do it now. */ - if (emptyTuple == NULL) - { - BrinMemTuple *dtuple = brin_new_memtuple(state->bs_bdesc); + BrinTuple *tmp; + Size len; - emptyTuple = brin_form_tuple(state->bs_bdesc, prevblkno, dtuple, &emptySize); - } + if (memtuple != NULL) + tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno, + memtuple, &len); else { - /* we already have am "empty range" tuple, just set the block */ - emptyTuple->bt_blkno = prevblkno; + tmp = prevbtup; + len = prevtuplen; } - brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess, - &state->bs_currentInsertBuf, - emptyTuple->bt_blkno, emptyTuple, emptySize); - } - - /* Fill the BRIN tuple for the last page range. */ - if (prevblkno != InvalidBlockNumber) - { - BrinTuple *tmp; - Size len; - - tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno, - memtuple, &len); - brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess, &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len); - - pfree(tmp); } /* -- 2.40.1