Re: Re: [HACKERS] Patch für MAP_HUGETLB for mmap() shared memory - Mailing list pgsql-hackers
From | Bruce Momjian |
---|---|
Subject | Re: Re: [HACKERS] Patch für MAP_HUGETLB for mmap() shared memory |
Date | |
Msg-id | 20130629030322.GJ13790@momjian.us Whole thread Raw |
In response to | Re: Re: [HACKERS] Patch für MAP_HUGETLB for mmap() shared memory (Christian Kruse <cjk+postgres@defunct.ch>) |
Responses |
Re: Re: [HACKERS] Patch für MAP_HUGETLB for mmap() shared memory
|
List | pgsql-hackers |
Did we decide against specifying huge pages in Postgres? --------------------------------------------------------------------------- On Tue, Oct 30, 2012 at 09:16:07PM +0100, Christian Kruse wrote: > Hey, > > ok, I think I implemented all of the changes you requested. All but > the ia64 dependent, I have to do more research for this one. > > > Greetings, > CK > diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml > index b4fcbaf..66ed10f 100644 > --- a/doc/src/sgml/config.sgml > +++ b/doc/src/sgml/config.sgml > @@ -1049,6 +1049,37 @@ include 'filename' > </listitem> > </varlistentry> > > + <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages"> > + <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term> > + <indexterm> > + <primary><varname>huge_tlb_pages</> configuration parameter</primary> > + </indexterm> > + <listitem> > + <para> > + Enables/disables the use of huge tlb pages. Valid values are > + <literal>on</literal>, <literal>off</literal> and <literal>try</literal>. > + The default value is <literal>try</literal>. > + </para> > + > + <para> > + With <varname>huge_tlb_pages</varname> set to <literal>on</literal> > + <symbol>mmap()</symbol> will be called with <symbol>MAP_HUGETLB</symbol>. > + If the call fails the server will fail fatally. > + </para> > + > + <para> > + With <varname>huge_tlb_pages</varname> set to <literal>off</literal> we > + will not use <symbol>MAP_HUGETLB</symbol> at all. > + </para> > + > + <para> > + With <varname>huge_tlb_pages</varname> set to <literal>try</literal> > + we will try to use <symbol>MAP_HUGETLB</symbol> and fall back to > + <symbol>mmap()</symbol> without <symbol>MAP_HUGETLB</symbol>. 
> + </para> > + </listitem> > + </varlistentry> > + > <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers"> > <term><varname>temp_buffers</varname> (<type>integer</type>)</term> > <indexterm> > diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c > index df06312..f9de239 100644 > --- a/src/backend/port/sysv_shmem.c > +++ b/src/backend/port/sysv_shmem.c > @@ -27,10 +27,14 @@ > #ifdef HAVE_SYS_SHM_H > #include <sys/shm.h> > #endif > +#ifdef MAP_HUGETLB > +#include <dirent.h> > +#endif > > #include "miscadmin.h" > #include "storage/ipc.h" > #include "storage/pg_shmem.h" > +#include "utils/guc.h" > > > typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */ > @@ -61,6 +65,19 @@ typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */ > #define MAP_FAILED ((void *) -1) > #endif > > +#ifdef MAP_HUGETLB > +# ifdef __ia64__ > +# define PG_HUGETLB_BASE_ADDR (void *)(0x8000000000000000UL) > +# define PG_MAP_HUGETLB (MAP_HUGETLB|MAP_FIXED) > +# else > +# define PG_HUGETLB_BASE_ADDR (void *)(0x0UL) > +# define PG_MAP_HUGETLB MAP_HUGETLB > +# endif > +#else > +# define PG_MAP_HUGETLB 0 > +#endif > + > + > > unsigned long UsedShmemSegID = 0; > void *UsedShmemSegAddr = NULL; > @@ -73,7 +90,6 @@ static void IpcMemoryDelete(int status, Datum shmId); > static PGShmemHeader *PGSharedMemoryAttach(IpcMemoryKey key, > IpcMemoryId *shmid); > > - > /* > * InternalIpcMemoryCreate(memKey, size) > * > @@ -342,6 +358,155 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2) > } > > > +#ifdef MAP_HUGETLB > +#define HUGE_PAGE_INFO_DIR "/sys/kernel/mm/hugepages" > + > +/* > + * static long InternalGetFreeHugepagesCount(const char *name) > + * > + * Attempt to read the number of available hugepages from > + * /sys/kernel/mm/hugepages/hugepages-<size>/free_hugepages > + * Will fail (return -1) if file could not be opened, 0 if no pages are available > + * and > 0 if there are free pages > + * > + */ > +static long > 
+InternalGetFreeHugepagesCount(const char *name) > +{ > + int fd; > + char buff[1024]; > + size_t len; > + long result; > + char *ptr; > + > + len = snprintf(buff, 1024, "%s/%s/free_hugepages", HUGE_PAGE_INFO_DIR, name); > + if (len == 1024) /* I don't think that this will happen ever */ > + { > + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, > + (errmsg("Filename %s/%s/free_hugepages is too long", HUGE_PAGE_INFO_DIR, name), > + errcontext("while checking hugepage size"))); > + return -1; > + } > + > + fd = open(buff, O_RDONLY); > + if (fd <= 0) > + { > + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, > + (errmsg("Could not open file %s: %s", buff, strerror(errno)), > + errcontext("while checking hugepage size"))); > + return -1; > + } > + > + len = read(fd, buff, 1024); > + if (len <= 0) > + { > + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, > + (errmsg("Error reading from file %s: %s", buff, strerror(errno)), > + errcontext("while checking hugepage size"))); > + close(fd); > + return -1; > + } > + > + /* > + * If the content of free_hugepages is longer than or equal to 1024 bytes > + * the rest is irrelevant; we simply want to know if there are any > + * hugepages left > + */ > + if (len == 1024) > + { > + buff[1023] = 0; > + } > + else > + { > + buff[len] = 0; > + } > + > + close(fd); > + > + result = strtol(buff, &ptr, 10); > + > + if (ptr == NULL) > + { > + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, > + (errmsg("Could not convert contents of file %s/%s/free_hugepages to number", HUGE_PAGE_INFO_DIR, name), > + errcontext("while checking hugepage size"))); > + return -1; > + } > + > + return result; > +} > + > +/* > + * static long InternalGetHugepageSize() > + * > + * Attempt to get a valid hugepage size from /sys/kernel/mm/hugepages/ by > + * reading directory contents > + * Will fail (return -1) if the directory could not be opened or no valid > + * page sizes are available. 
Will return the smallest hugepage size that has free pages on > + * success. > + * > + */ > +static long > +InternalGetHugepageSize() > +{ > + struct dirent *ent; > + DIR *dir = opendir(HUGE_PAGE_INFO_DIR); > + long smallest_size = -1, size; > + char *ptr; > + > + if (dir == NULL) > + { > + ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING, > + (errmsg("Could not open directory %s: %s", HUGE_PAGE_INFO_DIR, strerror(errno)), > + errcontext("while checking hugepage size"))); > + return -1; > + } > + > + /* > + * Linux supports multiple hugepage sizes if the hardware > + * supports it; for each possible size there will be a > + * directory in /sys/kernel/mm/hugepages consisting of the > + * string hugepages- and the size of the page, e.g. on x86_64: > + * hugepages-2048kB > + */ > + while((ent = readdir(dir)) != NULL) > + { > + if (strncmp(ent->d_name, "hugepages-", 10) == 0) > + { > + size = strtol(ent->d_name + 10, &ptr, 10); > + if (ptr == NULL) > + { > + continue; > + } > + > + if (strcmp(ptr, "kB") == 0) > + { > + size *= 1024; > + } > + > + if ((smallest_size == -1 || size < smallest_size) > + && InternalGetFreeHugepagesCount(ent->d_name) > 0) > + { > + smallest_size = size; > + } > + } > + } > + > + closedir(dir); > + > + if (smallest_size == -1) > + { > + ereport(huge_tlb_pages == HUGE_TLB_TRY ? 
DEBUG1 : WARNING, > + (errmsg("Could not find a valid hugepage size"), > + errhint("This error usually means that either CONFIG_HUGETLB_PAGE " > + "is not in kernel or that your architecture does not " > + "support hugepages or you did not configure hugepages"))); > + } > + > + return smallest_size; > +} > +#endif > + > /* > * PGSharedMemoryCreate > * > @@ -391,7 +556,17 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port) > */ > #ifndef EXEC_BACKEND > { > +#ifdef MAP_HUGETLB > + long pagesize = 0; > + > + if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY) > + pagesize = InternalGetHugepageSize(); > + > + if (pagesize <= 0) > + pagesize = sysconf(_SC_PAGE_SIZE); > +#else > long pagesize = sysconf(_SC_PAGE_SIZE); > +#endif > > /* > * Ensure request size is a multiple of pagesize. > @@ -410,8 +585,22 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port) > * to be false, we might need to add a run-time test here and do this > * only if the running kernel supports it. 
> */ > - AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, PG_MMAP_FLAGS, > - -1, 0); > + > + if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY) > + { > + AnonymousShmem = mmap(PG_HUGETLB_BASE_ADDR, size, PROT_READ|PROT_WRITE, > + PG_MMAP_FLAGS|PG_MAP_HUGETLB, -1, 0); > + > + elog(DEBUG3, "mmap() tried with MAP_HUGEPAGE: %p", AnonymousShmem); > + } > + > + if ((AnonymousShmem == MAP_FAILED && huge_tlb_pages == HUGE_TLB_TRY) > + || huge_tlb_pages == HUGE_TLB_OFF) > + { > + AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, PG_MMAP_FLAGS, > + -1, 0); > + } > + > if (AnonymousShmem == MAP_FAILED) > ereport(FATAL, > (errmsg("could not map anonymous shared memory: %m"), > diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c > index 745e7be..28b6191 100644 > --- a/src/backend/utils/misc/guc.c > +++ b/src/backend/utils/misc/guc.c > @@ -22,6 +22,7 @@ > #include <limits.h> > #include <unistd.h> > #include <sys/stat.h> > +#include <sys/mman.h> > #ifdef HAVE_SYSLOG > #include <syslog.h> > #endif > @@ -389,6 +390,22 @@ static const struct config_enum_entry synchronous_commit_options[] = { > }; > > /* > + * huge_tlb_pages may be on|off|try, where try is the default > + * on: try to mmap() with MAP_HUGETLB and fail when mmap() fails > + * off: do not try tp mmap() with MAP_HUGETLB > + * try: try to mmap() with MAP_HUGETLB and fallback to mmap() > + * w/o MAP_HUGETLB > + */ > +static const struct config_enum_entry huge_tlb_options[] = { > +#ifdef MAP_HUGETLB > + {"on", HUGE_TLB_ON, false}, > + {"try", HUGE_TLB_TRY, false}, > +#endif > + {"off", HUGE_TLB_OFF, false}, > + {NULL, 0, false} > +}; > + > +/* > * Options for enum values stored in other modules > */ > extern const struct config_enum_entry wal_level_options[]; > @@ -447,6 +464,12 @@ int tcp_keepalives_idle; > int tcp_keepalives_interval; > int tcp_keepalives_count; > > +#ifdef MAP_HUGETLB > +int huge_tlb_pages = HUGE_TLB_TRY; > +#else > +int huge_tlb_pages = HUGE_TLB_OFF; > 
+#endif > + > /* > * These variables are all dummies that don't do anything, except in some > * cases provide the value for SHOW to display. The real state is elsewhere > @@ -3301,6 +3324,26 @@ static struct config_enum ConfigureNamesEnum[] = > NULL, NULL, NULL > }, > > + { > + {"huge_tlb_pages", > +#ifdef MAP_HUGETLB > + PGC_SUSET, > +#else > + PGC_INTERNAL, > +#endif > + RESOURCES_MEM, > + gettext_noop("Enable/disable the use of the hugepages feature"), > + NULL > + }, > + &huge_tlb_pages, > +#ifdef MAP_HUGETLB > + HUGE_TLB_TRY, > +#else > + HUGE_TLB_OFF, > +#endif > + huge_tlb_options, > + NULL, NULL, NULL > + }, > > /* End-of-list marker */ > { > diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample > index eeb9b82..e5bafec 100644 > --- a/src/backend/utils/misc/postgresql.conf.sample > +++ b/src/backend/utils/misc/postgresql.conf.sample > @@ -113,6 +113,7 @@ > > #shared_buffers = 32MB # min 128kB > # (change requires restart) > +#huge_tlb_pages = try # try to map memory with MAP_HUGETLB (on, off, try) > #temp_buffers = 8MB # min 800kB > #max_prepared_transactions = 0 # zero disables the feature > # (change requires restart) > diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h > index 06f797c..17f5870 100644 > --- a/src/include/utils/guc.h > +++ b/src/include/utils/guc.h > @@ -230,6 +230,24 @@ extern int tcp_keepalives_idle; > extern int tcp_keepalives_interval; > extern int tcp_keepalives_count; > > + > +/* > + * Possible values for huge_tlb_pages; default is HUGE_TLB_TRY > + */ > +typedef enum > +{ > + HUGE_TLB_OFF, > + HUGE_TLB_ON, > + HUGE_TLB_TRY > +} HugeTlbType; > + > + > +/* > + * configure the use of huge TLB pages > + */ > +extern int huge_tlb_pages; > + > + > /* > * Functions exported by guc.c > */ -- Bruce Momjian <bruce@momjian.us> http://momjian.us EnterpriseDB http://enterprisedb.com + It's impossible for everything to be true. +
pgsql-hackers by date: