From 389859e1a31fab7fe906176d33660583c12e4277 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Thu, 5 Jan 2017 11:58:24 -0800 Subject: [PATCH 1/2] Fix bug in logical tapeset free block management Make sure that the first write pass always writes out blocks in sequential order. The simplifications made to the logtape block format in commit 01ec2563 did not tie down all the cases in which non-sequential writes of blocks in the first write pass could happen. When non-sequential writes are performed in the first write pass, BufFile seeking indicates a problem, making logtape.c raise an error. The extra steps taken within this commit to prevent problems are applied more broadly than strictly necessary; they're only necessary in the first write pass, and not at the end of every run (round of writing to a logical tape). It doesn't seem worth bothering tuplesort.c with that detail, though. --- src/backend/utils/sort/logtape.c | 54 ++++++++++++++++++++++++++++++++------ src/backend/utils/sort/tuplesort.c | 1 + src/include/utils/logtape.h | 1 + 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c index f6a9dba..e07e328 100644 --- a/src/backend/utils/sort/logtape.c +++ b/src/backend/utils/sort/logtape.c @@ -282,8 +282,10 @@ freeBlocks_cmp(const void *a, const void *b) /* * Select a currently unused block for writing to. * - * NB: should only be called when writer is ready to write immediately, - * to ensure that first write pass is sequential. + * NB: blocks returned from here must be written out in the order of their + * initial allocation during the first write pass. This is necessary + * because ltsWriteBlock() does not support writing beyond current end of + * file. */ static long ltsGetFreeBlock(LogicalTapeSet *lts) @@ -426,6 +428,37 @@ LogicalTapeSetForgetFreeSpace(LogicalTapeSet *lts) } /* + * Indicate that round of writing to logical tape is over. + * + * When tuplesort marks the end of a run on tape, it must call here + * immediately afterwards in order to ensure that pending dirty data is + * written immediately. + */ +void +LogicalTapeEndWriting(LogicalTapeSet *lts, int tapenum) +{ + LogicalTape *lt; + + Assert(tapenum >= 0 && tapenum < lts->nTapes); + lt = <s->tapes[tapenum]; + Assert(lt->writing); + /* Must have written to tape by now */ + Assert(lt->curBlockNumber != -1); + + /* + * Write eagerly to avoid out-of-block-order writes in first write pass. + * See comments within LogicalTapeWrite(). Note that we don't unset the + * dirty flag here to avoid bothering LogicalTapeWrite() with needing to + * consider partial block writes. + */ + if (lt->dirty) + { + TapeBlockSetNBytes(lt->buffer, lt->nbytes); + ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer); + } +} + +/* * Write to a logical tape. * * There are no error returns; we ereport() on failure. @@ -473,8 +506,12 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum, } /* - * First allocate the next block, so that we can store it in the - * 'next' pointer of this block. + * Allocate the next block, so that we can store it in the + * 'next' pointer of this block. Care must be taken to ensure + * that the allocated block is written out before another call + * to ltsGetFreeBlock(), which is why caller is required to call + * LogicalTapeEndWriting() later (this writes out the block + * allocated on last call here for tape's round of writes). */ nextBlockNumber = ltsGetFreeBlock(lts); @@ -557,6 +594,7 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size) { TapeBlockSetNBytes(lt->buffer, lt->nbytes); ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer); + lt->dirty = false; } lt->writing = false; } @@ -591,9 +629,9 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size) * Rewind logical tape and switch from reading to writing. * * NOTE: we assume the caller has read the tape to the end; otherwise - * untouched data and indirect blocks will not have been freed. We - * could add more code to free any unread blocks, but in current usage - * of this module it'd be useless code. + * untouched data will not have been freed. We could add more code to + * free any unread blocks, but in current usage of this module it'd be + * useless code. */ void LogicalTapeRewindForWrite(LogicalTapeSet *lts, int tapenum) @@ -686,7 +724,7 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum) { TapeBlockSetNBytes(lt->buffer, lt->nbytes); ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer); - lt->writing = false; + lt->dirty = false; } lt->writing = false; lt->frozen = true; diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index cbaf009..c166e5d 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -3542,6 +3542,7 @@ markrunend(Tuplesortstate *state, int tapenum) unsigned int len = 0; LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len)); + LogicalTapeEndWriting(state->tapeset, tapenum); } /* diff --git a/src/include/utils/logtape.h b/src/include/utils/logtape.h index e802c4b..54e3542 100644 --- a/src/include/utils/logtape.h +++ b/src/include/utils/logtape.h @@ -29,6 +29,7 @@ extern void LogicalTapeSetClose(LogicalTapeSet *lts); extern void LogicalTapeSetForgetFreeSpace(LogicalTapeSet *lts); extern size_t LogicalTapeRead(LogicalTapeSet *lts, int tapenum, void *ptr, size_t size); +extern void LogicalTapeEndWriting(LogicalTapeSet *lts, int tapenum); extern void LogicalTapeWrite(LogicalTapeSet *lts, int tapenum, void *ptr, size_t size); extern void LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, -- 2.7.4