From 0012baab70779f5fc06c8717392dc76e8f156270 Mon Sep 17 00:00:00 2001
From: John Naylor
Date: Mon, 31 Oct 2022 15:24:29 +0700
Subject: [PATCH v1 1/2] Partly remap the .text segment into huge pages at
 postmaster start

Based on MIT licensed library at https://github.com/intel/iodlr

The basic steps are:

- read ELF info to get the start/end addresses of the .text segment
- calculate addresses therein aligned at huge page boundaries
- mmap temporary region and memcpy aligned portion of .text segment
- mmap start address to new region with huge pages and MAP_FIXED
- memcpy over and revoke the PROT_WRITE bit

The Postgres .text segment is ~5.0MB in a non-assert build, so this
method can put 2-4MB into huge pages.
---
 src/backend/port/large_page.c       | 411 ++++++++++++++++++++++++++++
 src/backend/port/meson.build        |   1 +
 src/backend/postmaster/postmaster.c |   7 +
 src/include/port/large_page.h       |  18 ++
 4 files changed, 437 insertions(+)
 create mode 100644 src/backend/port/large_page.c
 create mode 100644 src/include/port/large_page.h

diff --git a/src/backend/port/large_page.c b/src/backend/port/large_page.c
new file mode 100644
index 0000000000..66a584f785
--- /dev/null
+++ b/src/backend/port/large_page.c
@@ -0,0 +1,411 @@
+/*-------------------------------------------------------------------------
+ *
+ * large_page.c
+ *	  Map .text segment of binary to huge pages
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/port/large_page.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Based on Intel iodlr library:
+ * https://github.com/intel/iodlr.git
+ * MIT license and copyright notice follows
+ */
+
+/*
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
+ * OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+ * OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "postgres.h"
+
+#include <link.h>
+#include <sys/mman.h>
+
+#include "port/large_page.h"
+#include "storage/pg_shmem.h"
+
+/* Half-open address range [from, to) of the code to be remapped */
+typedef struct
+{
+	char	   *from;
+	char	   *to;
+}			mem_range;
+
+/* State passed to the dl_iterate_phdr() callback */
+typedef struct
+{
+	uintptr_t	start;
+	uintptr_t	end;
+	bool		found;
+}			FindParams;
+
+/* Round addr down to a multiple of hugepagesize, which must be a power of 2 */
+static inline uintptr_t
+largepage_align_down(uintptr_t addr, size_t hugepagesize)
+{
+	return (addr & ~(hugepagesize - 1));
+}
+
+/* Round addr up to a multiple of hugepagesize, which must be a power of 2 */
+static inline uintptr_t
+largepage_align_up(uintptr_t addr, size_t hugepagesize)
+{
+	return largepage_align_down(addr + hugepagesize - 1, hugepagesize);
+}
+
+/*
+ * Read the ELF section headers of the file "fname" and copy the header of
+ * the section named ".text" into *text_section.
+ *
+ * Returns true if such a section was found.  Returns false if the file
+ * cannot be opened or read, if its section header table looks inconsistent,
+ * or if no section has that name.  The file handle and all temporary
+ * allocations are released on every path.
+ */
+static bool
+FindTextSection(const char *fname, ElfW(Shdr) * text_section)
+{
+	ElfW(Ehdr) ehdr;
+	FILE	   *bin;
+
+	ElfW(Shdr) * shdrs = NULL;
+	ElfW(Shdr) * sh_strtab;
+	char	   *section_names = NULL;
+	bool		found = false;
+
+	bin = fopen(fname, "r");
+	if (bin == NULL)
+		return false;
+
+	/* Read the header. */
+	if (fread(&ehdr, sizeof(ehdr), 1, bin) != 1)
+		goto cleanup;
+
+	/* The section name string table must be one of the sections. */
+	if (ehdr.e_shstrndx >= ehdr.e_shnum)
+		goto cleanup;
+
+	/* Read the section headers. */
+	shdrs = (ElfW(Shdr) *) palloc(ehdr.e_shnum * sizeof(ElfW(Shdr)));
+	if (fseek(bin, ehdr.e_shoff, SEEK_SET) != 0)
+		goto cleanup;
+	if (fread(shdrs, sizeof(shdrs[0]), ehdr.e_shnum, bin) != ehdr.e_shnum)
+		goto cleanup;
+
+	/*
+	 * Read the string table.  Allocate one extra byte and NUL-terminate the
+	 * table ourselves so the name comparison below cannot run off its end.
+	 */
+	sh_strtab = &shdrs[ehdr.e_shstrndx];
+	if (sh_strtab->sh_size == 0)
+		goto cleanup;
+	section_names = palloc(sh_strtab->sh_size + 1);
+	if (fseek(bin, sh_strtab->sh_offset, SEEK_SET) != 0)
+		goto cleanup;
+	if (fread(section_names, sh_strtab->sh_size, 1, bin) != 1)
+		goto cleanup;
+	section_names[sh_strtab->sh_size] = '\0';
+
+	/* Find the section named exactly ".text". */
+	for (uint32_t idx = 0; idx < ehdr.e_shnum; idx++)
+	{
+		ElfW(Shdr) * sh = &shdrs[idx];
+
+		/* Skip headers whose name offset lies outside the string table. */
+		if (sh->sh_name >= sh_strtab->sh_size)
+			continue;
+		if (strcmp(&section_names[sh->sh_name], ".text") == 0)
+		{
+			*text_section = *sh;
+			found = true;
+			break;
+		}
+	}
+
+cleanup:
+	if (section_names != NULL)
+		pfree(section_names);
+	if (shdrs != NULL)
+		pfree(shdrs);
+	fclose(bin);
+	return found;
+}
+
+/*
+ * Callback for dl_iterate_phdr to set the start and end of the .text segment.
+ *
+ * "data" points to a FindParams that is filled in when the .text section of
+ * the main executable is located.  Returns 1 in that case, which stops the
+ * iteration, and 0 otherwise so that dl_iterate_phdr moves on.
+ */
+static int
+FindMapping(struct dl_phdr_info *hdr, size_t size, void *data)
+{
+	ElfW(Shdr) text_section;
+	FindParams *find_params = (FindParams *) data;
+
+	/*
+	 * We are only interested in the mapping matching the main executable.
+	 * This has the empty string for a name.
+	 */
+	if (hdr->dlpi_name[0] != '\0')
+		return 0;
+
+	/*
+	 * Open the info structure for the executable on disk to find the location
+	 * of its .text section. We use the base address given to calculate the
+	 * .text section offset in memory.
+	 */
+	text_section.sh_size = 0;
+#ifdef __linux__
+	if (FindTextSection("/proc/self/exe", &text_section))
+	{
+		find_params->start = hdr->dlpi_addr + text_section.sh_addr;
+		find_params->end = find_params->start + text_section.sh_size;
+		find_params->found = true;
+		return 1;
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Identify and return the text segment in the currently mapped memory region.
+ *
+ * On success, sets region->from and region->to and returns true.  Otherwise
+ * leaves *region untouched and returns false.
+ */
+static bool
+FindTextRegion(mem_range * region)
+{
+	FindParams	find_params = {0, 0, false};
+
+	/*
+	 * Note: the upstream source worked with shared libraries as well, hence
+	 * the iteration over all objects.
+	 */
+	dl_iterate_phdr(FindMapping, &find_params);
+	if (find_params.found)
+	{
+		region->from = (char *) find_params.start;
+		region->to = (char *) find_params.end;
+	}
+
+	return find_params.found;
+}
+
+/*
+ * Move specified region to large pages.
+ *
+ * The caller passes a range "r" that it has already aligned to huge page
+ * boundaries; "mmap_flags" carries the huge page flags added to mmap().
+ *
+ * NB: We need to be very careful:
+ * 1. This function itself should not be moved. We use compiler attributes:
+ *      WIP: if these aren't available, the function should do nothing
+ *      (__section__) to put it outside the ".text" section
+ *      (__noinline__) to not inline this function
+ *
+ * 2. This function should not call any function(s) that might be moved.
+ *
+ * NOTE(review): elog() and ereport() are part of the postgres binary, so the
+ * calls made below after the first MAP_FIXED attempt may conflict with rule
+ * 2 -- confirm that they cannot land in the range being remapped.
+ */
+static void
+__attribute__((__section__("lpstub")))
+__attribute__((__noinline__))
+MoveRegionToLargePages(const mem_range * r, int mmap_flags)
+{
+	void	   *nmem = MAP_FAILED;
+	void	   *tmem = MAP_FAILED;
+	int			ret = 0;
+	int			mmap_errno = 0;
+	void	   *start = r->from;
+	size_t		size = r->to - r->from;
+	bool		success = false;
+
+	/* Allocate temporary region */
+	nmem = mmap(NULL, size,
+				PROT_READ | PROT_WRITE,
+				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (nmem == MAP_FAILED)
+	{
+		elog(DEBUG1, "failed to allocate temporary region");
+		return;
+	}
+
+	/* copy the original code */
+	memcpy(nmem, r->from, size);
+
+	/*
+	 * mmap using the start address with MAP_FIXED so we get exactly the same
+	 * virtual address. We already know the original page is r-xp (PROT_READ,
+	 * PROT_EXEC, MAP_PRIVATE) We want PROT_WRITE because we are writing into
+	 * it.
+	 */
+	Assert(mmap_flags & MAP_HUGETLB);
+	tmem = mmap(start, size,
+				PROT_READ | PROT_WRITE | PROT_EXEC,
+				MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED | mmap_flags,
+				-1, 0);
+	mmap_errno = errno;
+
+	if (tmem == MAP_FAILED && huge_pages == HUGE_PAGES_ON)
+	{
+		/*
+		 * WIP: need a way for the user to determine total huge pages needed,
+		 * perhaps with shared_memory_size_in_huge_pages
+		 */
+		errno = mmap_errno;
+		ereport(FATAL,
+				errmsg("mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m", size),
+				(mmap_errno == ENOMEM) ?
+				errhint("This usually means not enough explicit huge pages were "
+						"configured in the kernel") : 0);
+		goto cleanup_tmp;
+	}
+	else if (tmem == MAP_FAILED)
+	{
+		Assert(huge_pages == HUGE_PAGES_TRY);
+
+		errno = mmap_errno;
+		elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m", size);
+
+		/*
+		 * try remapping again with normal pages
+		 *
+		 * XXX we cannot just back out now
+		 */
+		tmem = mmap(start, size,
+					PROT_READ | PROT_WRITE | PROT_EXEC,
+					MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+					-1, 0);
+		mmap_errno = errno;
+
+		if (tmem == MAP_FAILED)
+		{
+			/*
+			 * If we get here we cannot start the server. It's unlikely we
+			 * will fail here after the postmaster successfully set up shared
+			 * memory, but maybe we should have a GUC to turn off code
+			 * remapping, hinted here.
+			 */
+			errno = mmap_errno;
+			ereport(FATAL,
+					errmsg("mmap(%zu) failed for fallback code region: %m", size));
+			goto cleanup_tmp;
+		}
+	}
+	else
+		success = true;
+
+	/* copy the code to the newly mapped area and unset the write bit */
+	memcpy(start, nmem, size);
+	ret = mprotect(start, size, PROT_READ | PROT_EXEC);
+	if (ret < 0)
+	{
+		/* WIP: see note above about GUC and hint */
+		ereport(FATAL,
+				errmsg("failed to protect remapped code pages"));
+
+		/* Cannot start but at least try to clean up after ourselves */
+		munmap(tmem, size);
+		goto cleanup_tmp;
+	}
+
+	if (success)
+		elog(DEBUG1, "binary mapped to huge pages");
+
+cleanup_tmp:
+	/* Release the old/temporary mapped region */
+	elog(DEBUG3, "un-mmapping temporary code region");
+	ret = munmap(nmem, size);
+	if (ret < 0)
+		/* WIP: not sure of severity here */
+		ereport(LOG,
+				errmsg("failed to unmap temporary region"));
+
+	return;
+}
+
+/*
+ * Shrink the region to be mapped so that both ends lie on huge page
+ * boundaries.  The result can be empty or inverted (from > to) if the region
+ * does not contain a complete huge page; the caller must check for that.
+ */
+static void
+AlignRegionToPageBoundary(mem_range * r, size_t hugepagesize)
+{
+	r->from = (char *) largepage_align_up((uintptr_t) r->from, hugepagesize);
+	r->to = (char *) largepage_align_down((uintptr_t) r->to, hugepagesize);
+}
+
+
+/*
+ * Map the postgres .text segment into huge pages.
+ *
+ * Does nothing if huge_pages is off, if no huge page size is reported, if
+ * the .text section cannot be located, or if it does not span at least one
+ * complete huge page.
+ */
+void
+MapStaticCodeToLargePages(void)
+{
+	size_t		hugepagesize;
+	int			mmap_flags;
+	mem_range	r = {0};
+
+	if (huge_pages == HUGE_PAGES_OFF)
+		return;
+
+	GetHugePageSize(&hugepagesize, &mmap_flags);
+	if (hugepagesize == 0)
+		return;
+
+	if (!FindTextRegion(&r) || r.from == NULL || r.to == NULL)
+		return;
+
+	elog(DEBUG3, ".text start: %p", r.from);
+	elog(DEBUG3, ".text end: %p", r.to);
+
+	AlignRegionToPageBoundary(&r, hugepagesize);
+
+	elog(DEBUG3, "aligned .text start: %p", r.from);
+	elog(DEBUG3, "aligned .text end: %p", r.to);
+
+	/*
+	 * Check if the aligned region is large enough for huge pages.  Test for
+	 * an inverted range first, so that the pointer difference converted to
+	 * size_t below is never negative.
+	 */
+	if (r.from >= r.to || (size_t) (r.to - r.from) < hugepagesize)
+		return;
+
+	MoveRegionToLargePages(&r, mmap_flags);
+}
diff --git a/src/backend/port/meson.build b/src/backend/port/meson.build
index a22c25dd95..5ab65115e9 100644
--- a/src/backend/port/meson.build
+++ b/src/backend/port/meson.build
@@ -16,6 +16,7 @@ if cdata.has('USE_WIN32_SEMAPHORES')
 endif
 
 if cdata.has('USE_SYSV_SHARED_MEMORY')
+  backend_sources += files('large_page.c')
   backend_sources += files('sysv_shmem.c')
 endif
 
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 30fb576ac3..b30769c2b2 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -106,6 +106,7 @@
 #include "pg_getopt.h"
 #include "pgstat.h"
 #include "port/pg_bswap.h"
+#include "port/large_page.h"
 #include "postmaster/autovacuum.h"
 #include "postmaster/auxprocess.h"
 #include "postmaster/bgworker_internals.h"
@@ -1084,6 +1085,12 @@ PostmasterMain(int argc, char *argv[])
 	 */
 	CreateSharedMemoryAndSemaphores();
 
+	/*
+	 * If enough huge pages are available after setting up shared memory, try
+	 * to map the binary code to huge pages.
+	 */
+	MapStaticCodeToLargePages();
+
 	/*
 	 * Estimate number of openable files.  This must happen after setting up
 	 * semaphores, because on some platforms semaphores count as open files.
diff --git a/src/include/port/large_page.h b/src/include/port/large_page.h
new file mode 100644
index 0000000000..171819dd53
--- /dev/null
+++ b/src/include/port/large_page.h
@@ -0,0 +1,18 @@
+/*-------------------------------------------------------------------------
+ *
+ * large_page.h
+ *	  Map .text segment of binary to huge pages
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/port/large_page.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LARGE_PAGE_H
+#define LARGE_PAGE_H
+
+extern void MapStaticCodeToLargePages(void);
+
+#endif							/* LARGE_PAGE_H */
-- 
2.37.3