From e03af5726957437c15361bdb1b373fe8982f5c7c Mon Sep 17 00:00:00 2001 From: Bharath Rupireddy Date: Wed, 10 Jan 2024 14:12:17 +0000 Subject: [PATCH v19] Allow WAL reading from WAL buffers This commit gives postgres the capability to read WAL from WAL buffers. When the requested WAL isn't available in WAL buffers, it is read from the WAL file as usual. This commit benefits the callers of WALRead(), such as walsenders and pg_walinspect. They can all now avoid reading WAL from the WAL file (possibly avoiding disk IO). Tests show that the WAL buffers hit ratio stood at 95% for 1 primary, 1 sync standby, 1 async standby, with pgbench --scale=300 --client=32 --time=900. In other words, 95% of the time the walsenders avoided reading from the WAL file, i.e. avoided pread system calls: https://www.postgresql.org/message-id/CALj2ACXKKK%3DwbiG5_t6dGao5GoecMwRkhr7GjVBM_jg54%2BNa%3DQ%40mail.gmail.com This commit also helps when direct IO is enabled for WAL. Reading WAL from WAL buffers brings performance back close to what it is without direct IO for WAL: https://www.postgresql.org/message-id/CALj2ACV6rS%2B7iZx5%2BoAvyXJaN4AG-djAQeM1mrM%3DYSDkVrUs7g%40mail.gmail.com This commit paves the way for the following features in the future: - Improved synchronous replication performance by replicating directly from WAL buffers. - An opt-in way for the walreceivers to receive unflushed WAL. 
More details here: https://www.postgresql.org/message-id/20231011224353.cl7c2s222dw3de4j%40awork3.anarazel.de Author: Bharath Rupireddy Reviewed-by: Dilip Kumar, Andres Freund Reviewed-by: Nathan Bossart, Kuntal Ghosh Discussion: https://www.postgresql.org/message-id/CALj2ACXKKK%3DwbiG5_t6dGao5GoecMwRkhr7GjVBM_jg54%2BNa%3DQ%40mail.gmail.com --- src/backend/access/transam/xlog.c | 173 ++++++++++++++++++++++++ src/backend/access/transam/xlogreader.c | 40 +++++- src/backend/access/transam/xlogutils.c | 11 +- src/backend/postmaster/walsummarizer.c | 10 +- src/backend/replication/walsender.c | 10 +- src/include/access/xlog.h | 3 + 6 files changed, 231 insertions(+), 16 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 478377c4a2..886eaf12e3 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1705,6 +1705,179 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli) return cachedPos + ptr % XLOG_BLCKSZ; } +/* + * Read WAL from WAL buffers. + * + * This function reads 'count' bytes of WAL from WAL buffers into 'buf' + * starting at location 'startptr' on timeline 'tli' and returns total bytes + * read. + * + * Points to note: + * + * - This function reads as much as it can from WAL buffers, meaning, it may + * not read all the requested 'count' bytes. Caller must be aware of this and + * deal with it. + * + * - This function reads WAL from WAL buffers without holding any lock. First + * it reads xlblocks atomically for checking page existence, then it reads the + * page contents and validates. Finally, it rechecks the page existence by + * re-reading xlblocks; if the read page is replaced, it discards it and + * returns. + * + * - This function is not available for frontend code as WAL buffers are + * internal to the server. + * + * - This function waits for any in-progress WAL insertions to WAL buffers to + * finish. 
+ */ +Size +XLogReadFromBuffers(XLogRecPtr startptr, TimeLineID tli, Size count, + char *buf) +{ + XLogRecPtr ptr; + Size nbytes; + Size ntotal = 0; + char *dst; + uint64 bytepos; + XLogRecPtr reservedUpto; + XLogwrtResult LogwrtResult; + + /* + * Fast paths for the following reasons: 1) WAL buffers aren't in use when + * server is in recovery. 2) WAL is inserted into WAL buffers on current + * server's insertion TLI. 3) Invalid starting WAL location. + */ + if (RecoveryInProgress() || + tli != GetWALInsertionTimeLine() || + XLogRecPtrIsInvalid(startptr)) + return ntotal; + + /* Read the current insert position */ + SpinLockAcquire(&XLogCtl->Insert.insertpos_lck); + bytepos = XLogCtl->Insert.CurrBytePos; + SpinLockRelease(&XLogCtl->Insert.insertpos_lck); + + reservedUpto = XLogBytePosToEndRecPtr(bytepos); + + /* + * WAL being read doesn't yet exist i.e. past the current insert position. + */ + if ((startptr + count) > reservedUpto) + return ntotal; + + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + /* Wait for any in-progress WAL insertions to WAL buffers to finish. */ + if ((startptr + count) > LogwrtResult.Write && + (startptr + count) <= reservedUpto) + WaitXLogInsertionsToFinish(startptr + count); + + ptr = startptr; + nbytes = count; + dst = buf; + + while (nbytes > 0) + { + XLogRecPtr expectedEndPtr; + XLogRecPtr endptr; + int idx; + char *page; + char *data; + Size nread; + XLogPageHeader phdr; + + idx = XLogRecPtrToBufIdx(ptr); + expectedEndPtr = ptr; + expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ; + endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]); + + /* Requested WAL isn't available in WAL buffers. */ + if (expectedEndPtr != endptr) + break; + + /* + * We found WAL buffer page containing given XLogRecPtr. Get starting + * address of the page and a pointer to the right location of given + * XLogRecPtr in that page. 
+ */ + page = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ; + data = page + ptr % XLOG_BLCKSZ; + + /* + * Make sure we don't read xlblocks up above before the page contents + * down below. + */ + pg_read_barrier(); + + nread = 0; + + /* Read what is wanted, not the whole page. */ + if ((data + nbytes) <= (page + XLOG_BLCKSZ)) + { + /* All the bytes are in one page. */ + nread = nbytes; + } + else + { + /* + * All the bytes are not in one page. Read available bytes on the + * current page, copy them over to output buffer and continue to + * read remaining bytes. + */ + nread = XLOG_BLCKSZ - (data - page); + Assert(nread > 0 && nread <= nbytes); + } + + Assert(nread > 0); + memcpy(dst, data, nread); + + /* + * Make sure we don't read xlblocks down below before the page + * contents up above. + */ + pg_read_barrier(); + + /* Recheck if the read page still exists in WAL buffers. */ + endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]); + + /* Return if the page got initalized while we were reading it. */ + if (expectedEndPtr != endptr) + break; + + /* + * Typically, we must not read a WAL buffer page that just got + * initialized. Because we waited enough for the in-progress WAL + * insertions to finish above. However, there can exist a slight + * window after the above wait finishes in which the read buffer page + * can get replaced especially under high WAL generation rates. After + * all, we are reading from WAL buffers without any locks here. So, + * let's not count such a page in. + */ + phdr = (XLogPageHeader) page; + if (!(phdr->xlp_magic == XLOG_PAGE_MAGIC && + phdr->xlp_pageaddr == (ptr - (ptr % XLOG_BLCKSZ)) && + phdr->xlp_tli == tli)) + break; + + dst += nread; + ptr += nread; + ntotal += nread; + nbytes -= nread; + } + + /* We never read more than what the caller has asked for. 
*/ + Assert(ntotal <= count); + + ereport(DEBUG1, + errmsg_internal("read %zu bytes out of %zu bytes from WAL buffers for given start LSN %X/%X, timeline ID %u", + ntotal, count, + LSN_FORMAT_ARGS(startptr), tli)); + + return ntotal; +} + /* * Converts a "usable byte position" to XLogRecPtr. A usable byte position * is the position starting from the beginning of WAL, excluding all WAL diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 7190156f2f..639bba2ad9 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -1501,17 +1501,47 @@ err: * Returns true if succeeded, false if an error occurs, in which case * 'errinfo' receives error details. * - * XXX probably this should be improved to suck data directly from the - * WAL buffers when possible. + * When possible, this function reads WAL from WAL buffers. When requested WAL + * isn't available in WAL buffers, it is read from the WAL file as usual. */ bool -WALRead(XLogReaderState *state, - char *buf, XLogRecPtr startptr, Size count, TimeLineID tli, - WALReadError *errinfo) +WALRead(XLogReaderState *state, char *buf, XLogRecPtr startptr, + Size count, TimeLineID tli, WALReadError *errinfo) { char *p; XLogRecPtr recptr; Size nbytes; +#ifndef FRONTEND + Size nread; +#endif + +#ifndef FRONTEND + + /* + * Try reading WAL from WAL buffers. Frontend code has no idea of WAL + * buffers. + */ + nread = XLogReadFromBuffers(startptr, tli, count, buf); + + if (nread > 0) + { + /* + * Check if its a full read, short read or no read from WAL buffers. + * For short read or no read, continue to read the remaining bytes + * from WAL file. + * + * XXX: It might be worth to expose WAL buffer read stats. 
+ */ + if (nread == count) /* full read */ + return true; + else if (nread < count) /* short read */ + { + buf += nread; + startptr += nread; + count -= nread; + } + } +#endif p = buf; recptr = startptr; diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index aa8667abd1..fafab9aa32 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -1007,12 +1007,13 @@ read_local_xlog_page_guts(XLogReaderState *state, XLogRecPtr targetPagePtr, } /* - * Even though we just determined how much of the page can be validly read - * as 'count', read the whole page anyway. It's guaranteed to be - * zero-padded up to the page boundary if it's incomplete. + * We determined how much of the page can be validly read as 'count', read + * that much only, not the entire page. Since WALRead() can read the page + * from WAL buffers, in which case, the page is not guaranteed to be + * zero-padded up to the page boundary because of the concurrent + * insertions. */ - if (!WALRead(state, cur_page, targetPagePtr, XLOG_BLCKSZ, tli, - &errinfo)) + if (!WALRead(state, cur_page, targetPagePtr, count, tli, &errinfo)) WALReadRaiseError(&errinfo); /* number of valid bytes in the buffer */ diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index f828cc436a..d465848bc9 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -1254,11 +1254,13 @@ summarizer_read_local_xlog_page(XLogReaderState *state, } /* - * Even though we just determined how much of the page can be validly read - * as 'count', read the whole page anyway. It's guaranteed to be - * zero-padded up to the page boundary if it's incomplete. + * We determined how much of the page can be validly read as 'count', read + * that much only, not the entire page. 
Since WALRead() can read the page + * from WAL buffers, in which case, the page is not guaranteed to be + * zero-padded up to the page boundary because of the concurrent + * insertions. */ - if (!WALRead(state, cur_page, targetPagePtr, XLOG_BLCKSZ, + if (!WALRead(state, cur_page, targetPagePtr, count, private_data->tli, &errinfo)) WALReadRaiseError(&errinfo); diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 087031e9dc..b35406bcdf 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -1095,11 +1095,17 @@ logical_read_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int req else count = flushptr - targetPagePtr; /* part of the page available */ - /* now actually read the data, we know it's there */ + /* + * We determined how much of the page can be validly read as 'count', read + * that much only, not the entire page. Since WALRead() can read the page + * from WAL buffers, in which case, the page is not guaranteed to be + * zero-padded up to the page boundary because of the concurrent + * insertions. + */ if (!WALRead(state, cur_page, targetPagePtr, - XLOG_BLCKSZ, + count, currTLI, /* Pass the current TLI because only * WalSndSegmentOpen controls whether new TLI * is needed. */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 301c5fa11f..fa760a92d5 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -252,6 +252,9 @@ extern XLogRecPtr GetLastImportantRecPtr(void); extern void SetWalWriterSleeping(bool sleeping); +extern Size XLogReadFromBuffers(XLogRecPtr startptr, TimeLineID tli, + Size count, char *buf); + /* * Routines used by xlogrecovery.c to call back into xlog.c during recovery. */ -- 2.34.1