From f4d0c7c81c1b960e71d7f76f1262279b329eac07 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Tue, 18 Feb 2025 15:59:13 +1300 Subject: [PATCH v2 3/4] Improve read stream advice for larger random reads. read_stream.c tries not to issue advice when it thinks the kernel's readahead should be active, ie when using buffered I/O and reading sequential blocks. It previously gave up a little too easily: it should issue advice until it has started running sequential pread() calls, not just when it's planning to. The simpler strategy worked for random regions of size <= io_combine_limit and large sequential regions, but not so well when reading random regions of size > io_combine limit. For a 256kB chunk of data far away from recent access, it would issue advice for the first half (assuming io_combine_limit=128kB) and then suffer an I/O stall for the second half. Discovered by Tomas Vondra's regression testing of many data clustering patterns using Melanie Plageman's streaming Bitmap Heap Scan patch, with analysis of the I/O stall-producing pattern from Andres Freund. Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com Discussion: https://postgr.es/m/CA%2BhUKGJ3HSWciQCz8ekP1Zn7N213RfA4nbuotQawfpq23%2Bw-5Q%40mail.gmail.com --- src/backend/storage/aio/read_stream.c | 44 +++++++++++++++++++++------ 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 6c2b4ec011d..a028217a08e 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -132,6 +132,7 @@ struct ReadStream /* Next expected block, for detecting sequential access. */ BlockNumber seq_blocknum; + BlockNumber seq_start; /* The read operation we are currently preparing. */ BlockNumber pending_read_blocknum; @@ -260,16 +261,30 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice) else Assert(stream->next_buffer_index == stream->oldest_buffer_index); - /* - * If advice hasn't been suppressed, this system supports it, and this - * isn't a strictly sequential pattern, then we'll issue advice. - */ - if (!suppress_advice && - stream->advice_enabled && - stream->pending_read_blocknum != stream->seq_blocknum) + /* Do we need to issue read-ahead advice? */ + flags = 0; + if (stream->advice_enabled) + { flags = READ_BUFFERS_ISSUE_ADVICE; - else - flags = 0; + + if (stream->pending_read_blocknum == stream->seq_blocknum) + { + /* + * Suppress advice if our WaitReadBuffers() calls have caught up + * with the first advice we issued for this sequential run. + */ + if (stream->seq_start == InvalidBlockNumber) + suppress_advice = true; + } + else + { + /* Random jump, so start a new sequential run. */ + stream->seq_start = stream->pending_read_blocknum; + } + + if (suppress_advice) + flags = 0; + } /* Compute the remaining portion of the per-backend buffer limit. */ if (stream->temporary) @@ -601,6 +616,8 @@ read_stream_begin_impl(int flags, stream->callback_private_data = callback_private_data; stream->buffered_blocknum = InvalidBlockNumber; stream->temporary = SmgrIsTemp(smgr); + stream->seq_blocknum = InvalidBlockNumber; + stream->seq_start = InvalidBlockNumber; /* * Skip the initial ramp-up phase if the caller says we're going to be @@ -825,6 +842,15 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) distance = stream->distance * 2; distance = Min(distance, stream->max_pinned_buffers); stream->distance = distance; + + /* + * If we've caught up with the first advice issued for the current + * sequential run, cancel further advice until the next random + * jump. The kernel should be able to see the pattern now that + * we're issuing sequential preadv() calls. + */ + if (stream->ios[io_index].op.blocknum == stream->seq_start) + stream->seq_start = InvalidBlockNumber; } else { -- 2.39.5