From 3dcfa20437c7febf65ef7ad7dbc1a3c4f0bf6980 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Tue, 18 Feb 2025 15:59:13 +1300 Subject: [PATCH v2.8 11/38] Improve read_stream.c advice for big random chunks. read_stream.c tries not to issue readahead advice when it thinks the kernel's own readahead should be active, ie when using buffered I/O and reading sequential blocks. It previously gave up too easily, and issued advice only for the first read of size up to io_combine_limit in a larger sequential range of blocks after a random jump. The following read could suffer an avoidable I/O stall. Fix, by issuing advice until the corresponding pread() calls catch up with the start of the region we're currently issuing advice for, if ever. That's when the kernel actually sees the sequential pattern and has any chance of helping. That won't happen until the sequential region fills the whole look-ahead window, so advice is now much more aggressive, while still disabled for purely sequential streams. While refactoring the advice logic, also get rid of the suppress_advice argument that was passed around between functions, since read_stream_start_pending_read() can make that determination itself, and it's better to keep all that logic in one place. Revealed by Tomas Vondra's disk access pattern tests with Melanie Plageman's Bitmap Heap Scan patch. Reviewed-by: Andres Freund (earlier version) Reported-by: Melanie Plageman Reported-by: Tomas Vondra Reported-by: Andres Freund Tested-by: Melanie Plageman Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com Discussion: https://postgr.es/m/CA%2BhUKGJ3HSWciQCz8ekP1Zn7N213RfA4nbuotQawfpq23%2Bw-5Q%40mail.gmail.com --- src/backend/storage/aio/read_stream.c | 71 +++++++++++++++++++-------- 1 file changed, 50 insertions(+), 21 deletions(-) diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 175f8410baf..f991373359a 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -133,6 +133,7 @@ struct ReadStream /* Next expected block, for detecting sequential access. */ BlockNumber seq_blocknum; + BlockNumber seq_until_processed; /* The read operation we are currently preparing. */ BlockNumber pending_read_blocknum; @@ -238,11 +239,11 @@ read_stream_unget_block(ReadStream *stream, BlockNumber blocknum) * distance to a level that prevents look-ahead until buffers are released. */ static bool -read_stream_start_pending_read(ReadStream *stream, bool suppress_advice) +read_stream_start_pending_read(ReadStream *stream) { bool need_wait; int nblocks; - int flags; + int flags = 0; int16 io_index; int16 overflow; int16 buffer_index; @@ -262,16 +263,36 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice) else Assert(stream->next_buffer_index == stream->oldest_buffer_index); - /* - * If advice hasn't been suppressed, this system supports it, and this - * isn't a strictly sequential pattern, then we'll issue advice. - */ - if (!suppress_advice && - stream->advice_enabled && - stream->pending_read_blocknum != stream->seq_blocknum) - flags = READ_BUFFERS_ISSUE_ADVICE; - else - flags = 0; + /* Do we need to issue read-ahead advice? */ + if (stream->advice_enabled) + { + bool ahead; + + /* + * We only issue advice if we're actually far enough ahead that we + * won't immediately have to call WaitReadBuffers(). + */ + ahead = stream->pinned_buffers > 0 || + stream->pending_read_nblocks < stream->distance; + + if (stream->pending_read_blocknum == stream->seq_blocknum) + { + /* + * Sequential: issue advice only until the WaitReadBuffers() calls + * catch up with the first advice issued for this sequential + * region, so the kernel can see sequential access. + */ + if (stream->seq_until_processed != InvalidBlockNumber && ahead) + flags = READ_BUFFERS_ISSUE_ADVICE; + } + else + { + /* Random jump: start tracking new region. */ + stream->seq_until_processed = stream->pending_read_blocknum; + if (ahead) + flags = READ_BUFFERS_ISSUE_ADVICE; + } + } /* How many more buffers is this backend allowed? */ if (stream->temporary) @@ -360,7 +381,7 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice) } static void -read_stream_look_ahead(ReadStream *stream, bool suppress_advice) +read_stream_look_ahead(ReadStream *stream) { while (stream->ios_in_progress < stream->max_ios && stream->pinned_buffers + stream->pending_read_nblocks < stream->distance) @@ -371,8 +392,7 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice) if (stream->pending_read_nblocks == stream->io_combine_limit) { - read_stream_start_pending_read(stream, suppress_advice); - suppress_advice = false; + read_stream_start_pending_read(stream); continue; } @@ -405,15 +425,13 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice) /* We have to start the pending read before we can build another. */ while (stream->pending_read_nblocks > 0) { - if (!read_stream_start_pending_read(stream, suppress_advice) || + if (!read_stream_start_pending_read(stream) || stream->ios_in_progress == stream->max_ios) { /* We've hit the buffer or I/O limit. Rewind and stop here. */ read_stream_unget_block(stream, blocknum); return; } - - suppress_advice = false; } /* This is the start of a new pending read. */ @@ -437,7 +455,7 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice) stream->pinned_buffers == 0) || stream->distance == 0) && stream->ios_in_progress < stream->max_ios) - read_stream_start_pending_read(stream, suppress_advice); + read_stream_start_pending_read(stream); /* * There should always be something pinned when we leave this function, @@ -613,6 +631,8 @@ read_stream_begin_impl(int flags, stream->callback = callback; stream->callback_private_data = callback_private_data; stream->buffered_blocknum = InvalidBlockNumber; + stream->seq_blocknum = InvalidBlockNumber; + stream->seq_until_processed = InvalidBlockNumber; stream->temporary = SmgrIsTemp(smgr); /* @@ -793,7 +813,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) * space for more, but if we're just starting up we'll need to crank * the handle to get started. */ - read_stream_look_ahead(stream, true); + read_stream_look_ahead(stream); /* End of stream reached? */ if (stream->pinned_buffers == 0) @@ -838,6 +858,15 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) distance = stream->distance * 2; distance = Min(distance, stream->max_pinned_buffers); stream->distance = distance; + + /* + * If we've caught up with the first advice issued for the current + * sequential region, cancel further advice until the next random + * jump. The kernel should be able to see the pattern now that + * we're actually making sequential preadv() calls. + */ + if (stream->ios[io_index].op.blocknum == stream->seq_until_processed) + stream->seq_until_processed = InvalidBlockNumber; } else { @@ -899,7 +928,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) stream->oldest_buffer_index = 0; /* Prepare for the next call. */ - read_stream_look_ahead(stream, false); + read_stream_look_ahead(stream); #ifndef READ_STREAM_DISABLE_FAST_PATH /* See if we can take the fast path for all-cached scans next time. */ -- 2.48.1.76.g4e746b1a31.dirty